In [None]:
from collections import Counter
from nltk.util import ngrams
import matplotlib.pyplot as plt
import pandas as pd
import math

# Load the whole ciphertext file and drop every line break so the text
# becomes one continuous string of symbols.
cipher_text = ""
with open('cipher.txt', 'r') as ct_file:
    cipher_text = "".join(ct_file.read().split("\n"))

In [None]:
# Print the loaded ciphertext in full (line breaks were already removed on load).
print(cipher_text)

In [None]:
# Split the ciphertext into its individual symbols, one list entry per character.
unigram_cipher = [*cipher_text]

In [None]:
# Tally how often each symbol occurs. Counter consumes an iterable directly,
# so no explicit loop (and no leftover loop variable) is needed.
unigram_counter = Counter(unigram_cipher)

The alphabet has **24 symbols**.

In [None]:
# The distinct symbols that occur in the ciphertext, in sorted order.
alphabet_list = sorted(unigram_counter)
print(alphabet_list)

The **top-5 most common unigrams** are E, T, O, S and A. This suggests that the language of the plaintext might be **English**.

In [None]:
# Display the five most frequent symbols together with their counts.
unigram_counter.most_common(5)

In [None]:
# One row per symbol (the index) holding its absolute count, sorted by symbol.
unigram_df = pd.DataFrame.from_dict(
    unigram_counter,
    orient='index',
    columns=["unigram_count"],
)
unigram_df = unigram_df.sort_index()

In [None]:
# Columns will be relative frequencies of the letters from the ciphertext and English language.
compare_unigram_df = pd.DataFrame()
# Read English letters counts from "War and Peace"
eng_unigram_df = pd.read_csv("eng_unigrams.txt", sep=" ", names=["unigram", "unigram_count"], index_col="unigram").sort_index()
# Convert counts to percentages of each column's total. The division is done on
# the whole column at once, so each total is computed a single time rather than
# once per row.
# The English column is assigned first, so its index defines the rows; the
# ciphertext percentages are then aligned to those rows by letter.
compare_unigram_df["eng_rel_freq_perc"] = (
    eng_unigram_df.unigram_count / eng_unigram_df.unigram_count.sum()
) * 100.0
compare_unigram_df["cipher_rel_freq_perc"] = (
    unigram_df.unigram_count / unigram_df.unigram_count.sum()
) * 100.0

Since the ciphertext is quite long, the relative frequency distribution of unigrams strongly suggests that the language is English. 

In [None]:
# Give the columns reader-friendly names; they become the legend entries.
compare_unigram_df = compare_unigram_df.rename(columns={"eng_rel_freq_perc": "English", "cipher_rel_freq_perc": "Ciphertext"})
# Both series are percentages, so they are drawn against one shared y-axis for
# a direct comparison. No secondary_y is passed: keyed on the pre-rename column
# name it would match no column and therefore have no effect.
ax = compare_unigram_df.plot(kind="bar", figsize=(15,5), rot=0)
ax.set_title("Relative frequency distribution of unigrams - English vs Ciphertext")
ax.set_ylabel("Percentage")
# The x ticks are the letters themselves, so the axis label is hidden.
ax.xaxis.label.set_visible(False)
plt.show()

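Another tool that might help us **determine the degree of masking** caused by the encryption is the characteristic frequency distribution: the frequency of each letter of the alphabet expressed as a percentage of the frequency of the most common letter found earlier, E, with the letters ranked from most to least frequent.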

In [None]:
# Express every symbol's count as a percentage of the most frequent symbol's
# count (E, according to the top-5 listing earlier in the notebook). Taking the
# maximum instead of hard-coding "E" keeps the cell working for other
# ciphertexts, and the reference count is evaluated once rather than per row.
unigram_df["ranked_freq_perc"] = (
    unigram_df.unigram_count / unigram_df.unigram_count.max()
) * 100.0
# Rank the symbols from most to least frequent.
unigram_df = unigram_df.sort_values(by="ranked_freq_perc", ascending=False)
ax = unigram_df.plot(y="ranked_freq_perc", legend=None)
ax.set_title("Characteristic frequency distribution")
ax.set_ylabel("Percentage")
ax.set_xlabel("Frequency ranked letters")
# Override the x-axis labels inferred by pandas
ax.set_xticks(range(unigram_df.shape[0]))
ax.set_xticklabels(unigram_df.index.values)
plt.show()

Now let's check how many unique bigrams are in the ciphertext. There are **344 unique bigrams**.

In [None]:
# nltk's ngrams yields each run of 2 consecutive symbols of the ciphertext as a tuple.
bigram_cipher_list = list(ngrams(cipher_text, 2))
# Deduplicate to find out how many distinct bigrams occur.
unique_bigrams_set = set(bigram_cipher_list)
len(unique_bigrams_set)

In [None]:
# Count every bigram in a single pass over the ciphertext bigrams, instead of
# rescanning the whole list once per unique bigram.
# Keys are the two symbols joined into one string (e.g. "TE").
bigram_counter = Counter("".join(bigram) for bigram in bigram_cipher_list)

The **top-10 most common bigrams** are TE, TA, ET, EO, SE, AE, OE, NT, ER, RS.

In [None]:
# Display the ten most frequent bigrams together with their counts.
bigram_counter.most_common(10)

Let's also check the frequency distribution of bigrams. To improve readability, we show only bigrams having a **relative frequency higher than 1%**.

In [None]:
# One row per bigram (the index) holding its absolute count, sorted by bigram.
bigram_df = pd.DataFrame.from_dict(bigram_counter, orient='index', columns=["bigram_count"]).sort_index()
# Percentage of all bigram occurrences; the total is computed once for the
# whole column rather than once per row.
bigram_df["rel_freq_perc"] = (bigram_df.bigram_count / bigram_df.bigram_count.sum()) * 100.0
# Too many bigrams to plot: keep only those whose relative frequency is above 1%
high_freq_bigram_df = bigram_df[bigram_df.rel_freq_perc > 1]
ax = high_freq_bigram_df.plot(kind="bar", y="rel_freq_perc", legend=None, rot=0)
ax.set_title("Relative frequency distribution of bigrams")
ax.set_ylabel("Percentage")
plt.show()

Now let's check how many unique trigrams are in the ciphertext. There are **953 unique trigrams**.

In [None]:
# nltk's ngrams yields each run of 3 consecutive symbols of the ciphertext as a tuple.
trigram_cipher_list = list(ngrams(cipher_text, 3))
# Deduplicate to find out how many distinct trigrams occur.
unique_trigrams_set = set(trigram_cipher_list)
len(unique_trigrams_set)

In [None]:
# Count every trigram in a single pass over the ciphertext trigrams, instead of
# rescanning the whole list once per unique trigram.
# Keys are the three symbols joined into one string (e.g. "NTE").
trigram_counter = Counter("".join(trigram) for trigram in trigram_cipher_list)

The **top-10 most common trigrams** are NTE, ERO, EOE, ETE, TEA, TEE, TET, OER, OTE, TAE.

In [None]:
# Display the 20 most frequent trigrams with their counts.
# NOTE(review): the accompanying text lists only the top 10 — confirm whether
# 20 is intentional here.
trigram_counter.most_common(20)

In [None]:
# nltk's ngrams yields each run of 4 consecutive symbols of the ciphertext as a tuple.
quadgram_cipher_list = list(ngrams(cipher_text, 4))
# Distinct quadgrams, kept available for inspection by later cells.
unique_quadgrams_set = set(quadgram_cipher_list)
# Count every quadgram in a single pass, instead of rescanning the whole list
# once per unique quadgram. Keys are the four symbols joined into one string.
quadgram_counter = Counter("".join(quadgram) for quadgram in quadgram_cipher_list)