In [1]:
import math
import nltk
import requests
import numpy as np
import pandas as pd
import custom_lemmatizer
from scipy.stats import binom

In [2]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gulce\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\gulce\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gulce\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gulce\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\gulce\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
with open('Fyodor Dostoyevski Processed.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
# Tokenize the text with the nltk library
tokens = nltk.word_tokenize(text)

In [5]:
print('Number of tokens:', len(tokens))

Number of tokens: 1425758


In [6]:
print(tokens[:100])

['part', 'i', 'chapter', 'i', 'on', 'an', 'exceptionally', 'hot', 'evening', 'early', 'in', 'july', 'a', 'young', 'man', 'came', 'out', 'of', 'the', 'garret', 'in', 'which', 'he', 'lodged', 'in', 's.', 'place', 'and', 'walked', 'slowly', ',', 'as', 'though', 'in', 'hesitation', ',', 'towards', 'k.', 'bridge', '.', 'he', 'had', 'successfully', 'avoided', 'meeting', 'his', 'landlady', 'on', 'the', 'staircase', '.', 'his', 'garret', 'was', 'under', 'the', 'roof', 'of', 'a', 'high', ',', 'five-storied', 'house', 'and', 'was', 'more', 'like', 'a', 'cupboard', 'than', 'a', 'room', '.', 'the', 'landlady', 'who', 'provided', 'him', 'with', 'garret', ',', 'dinners', ',', 'and', 'attendance', ',', 'lived', 'on', 'the', 'floor', 'below', ',', 'and', 'every', 'time', 'he', 'went', 'out', 'he', 'was']


In [7]:
# Find POS (part-of-speech) tags of the tokens with the nltk library
pos_tags = nltk.pos_tag(tokens, tagset='universal')

In [8]:
print(pos_tags[:100])

[('part', 'NOUN'), ('i', 'VERB'), ('chapter', 'NOUN'), ('i', 'NOUN'), ('on', 'ADP'), ('an', 'DET'), ('exceptionally', 'ADV'), ('hot', 'ADJ'), ('evening', 'VERB'), ('early', 'ADJ'), ('in', 'ADP'), ('july', 'NOUN'), ('a', 'DET'), ('young', 'ADJ'), ('man', 'NOUN'), ('came', 'VERB'), ('out', 'ADP'), ('of', 'ADP'), ('the', 'DET'), ('garret', 'NOUN'), ('in', 'ADP'), ('which', 'DET'), ('he', 'PRON'), ('lodged', 'VERB'), ('in', 'ADP'), ('s.', 'ADJ'), ('place', 'NOUN'), ('and', 'CONJ'), ('walked', 'VERB'), ('slowly', 'ADV'), (',', '.'), ('as', 'ADP'), ('though', 'ADP'), ('in', 'ADP'), ('hesitation', 'NOUN'), (',', '.'), ('towards', 'NOUN'), ('k.', 'VERB'), ('bridge', 'NOUN'), ('.', '.'), ('he', 'PRON'), ('had', 'VERB'), ('successfully', 'ADV'), ('avoided', 'VERB'), ('meeting', 'VERB'), ('his', 'PRON'), ('landlady', 'NOUN'), ('on', 'ADP'), ('the', 'DET'), ('staircase', 'NOUN'), ('.', '.'), ('his', 'PRON'), ('garret', 'NOUN'), ('was', 'VERB'), ('under', 'ADP'), ('the', 'DET'), ('roof', 'NOUN'), (

In [9]:
# Lemmatize the tokens using the WordNetLemmatizer in nltk with custom_lemmatizer class
cm = custom_lemmatizer.custom_lemmatizer()
lemmatized_tokens = [cm.lemmatize(pt) for pt in pos_tags]

In [10]:
print(lemmatized_tokens[:100])

['part', 'i', 'chapter', 'i', 'on', 'an', 'exceptionally', 'hot', 'evening', 'early', 'in', 'july', 'a', 'young', 'man', 'came', 'out', 'of', 'the', 'garret', 'in', 'which', 'he', 'lodged', 'in', 's.', 'place', 'and', 'walked', 'slowly', ',', 'as', 'though', 'in', 'hesitation', ',', 'towards', 'k.', 'bridge', '.', 'he', 'had', 'successfully', 'avoided', 'meeting', 'his', 'landlady', 'on', 'the', 'staircase', '.', 'his', 'garret', 'was', 'under', 'the', 'roof', 'of', 'a', 'high', ',', 'five-storied', 'house', 'and', 'was', 'more', 'like', 'a', 'cupboard', 'than', 'a', 'room', '.', 'the', 'landlady', 'who', 'provided', 'him', 'with', 'garret', ',', 'dinner', ',', 'and', 'attendance', ',', 'lived', 'on', 'the', 'floor', 'below', ',', 'and', 'every', 'time', 'he', 'went', 'out', 'he', 'was']


In [11]:
def calculate_unigram_frequencies(tokens):
    # Calculate the frequencies of all the unigrams
    unigram_frequencies = {}
    for t in tokens:
        if t in unigram_frequencies:
            unigram_frequencies[t] += 1
        else:
            unigram_frequencies[t] = 1
    return unigram_frequencies

In [12]:
unigram_frequencies = calculate_unigram_frequencies(lemmatized_tokens)
that_count = unigram_frequencies['that']
the_count = unigram_frequencies['the']
abject_count = unigram_frequencies['abject']
london_count = unigram_frequencies['london']
dot_count = unigram_frequencies['.']

In [13]:
print("Count of word 'that':", that_count)
print("Count of word 'the':", the_count)
print("Count of word 'abject':", abject_count)
print("Count of word 'london':", london_count)
print("Count of word '.':", dot_count)

Count of word 'that': 19429
Count of word 'the': 48392
Count of word 'abject': 21
Count of word 'london': 2
Count of word '.': 51738


In [14]:
# Get the stopwords list from https://gist.github.com/sebleier/554280
response = requests.get("https://gist.githubusercontent.com/sebleier/554280/raw/7e0e4a1ce04c2bb7bd41089c9821dbcf6d0c786c/NLTK's%2520list%2520of%2520english%2520stopwords")
stopwords_list = response.text.split('\n')

In [15]:
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [16]:
def calculate_bigram_frequencies(tokens, window_size):
    # Create all bigrams according to the given window size
    bigrams = []
    for i in range(len(tokens)):
        for j in range(1, window_size+1):
            if i+j < len(tokens):
                bigrams.append((tokens[i], tokens[i+j]))
    # Calculate the frequencies of all the bigrams
    bigram_frequencies = {}
    for b in bigrams:
        if b in bigram_frequencies:
            bigram_frequencies[b] += 1
        else:
            bigram_frequencies[b] = 1
    all_bigrams = []
    for b, f in bigram_frequencies.items():
        all_bigrams.append(b)
    return all_bigrams, bigram_frequencies

In [17]:
def find_collocation_candidates(bigram_frequencies, pos_tags, min_freq):
    # Find the collocation candidates according to the required conditions
    collocation_candidates = []
    for b, f in bigram_frequencies.items():
        # Check whether the bigram occurs more than the desired minimum frequency and ignore if it is less
        if f < min_freq:
            continue
        # Check whether the bigram contains punctuation marks and ignore if it contains
        word1, word2 = b
        if not (word1.isalpha() and word2.isalpha()):
            continue
        # Check whether the bigram contains stopwords and ignore if it contains
        if word1 in stopwords_list or word2 in stopwords_list:
            continue
        # Check whether the bigram is ADJ-NOUN or NOUN-NOUN form and ignore if it is not
        pos1 = [t for w, t in pos_tags if w == word1]
        pos2 = [t for w, t in pos_tags if w == word2]
        if not(("ADJ" in pos1 and "NOUN" in pos2) or ("NOUN" in pos1 and "NOUN" in pos2)):
            continue
        # Add the bigram to collocation candidates list if it satisfies all the conditions
        collocation_candidates.append(b)
    return collocation_candidates

In [18]:
unigram_frequencies = calculate_unigram_frequencies(lemmatized_tokens)

In [19]:
all_bigrams_1, bigram_frequencies_1 = calculate_bigram_frequencies(lemmatized_tokens, 1)
collocation_candidates_1 = find_collocation_candidates(bigram_frequencies_1, pos_tags, 10)
N_1 = sum(bigram_frequencies_1.values())

In [20]:
magnificent_capital_count = bigram_frequencies_1[('magnificent', 'capital')]
print("Count of the bigram 'magnificent capital' with windows of size 1:", magnificent_capital_count)

Count of the bigram 'magnificent capital' with windows of size 1: 1


In [21]:
if ("mr.", "skimpole") in collocation_candidates_1:
    print("Yes")
else:
    print("No")

No


In [22]:
all_bigrams_3, bigram_frequencies_3 = calculate_bigram_frequencies(lemmatized_tokens, 3)
collocation_candidates_3 = find_collocation_candidates(bigram_frequencies_3, pos_tags, 10)
N_3 = sum(bigram_frequencies_3.values())

In [23]:
bright_fire_count = bigram_frequencies_3[('bright', 'fire')]
print("Count of the bigram 'bright fire' with windows of size 3:", bright_fire_count)

Count of the bigram 'bright fire' with windows of size 3: 1


In [24]:
if ("spontaneous", "combustion") in collocation_candidates_3:
    print("Yes")
else:
    print("No")

No


In [25]:
def calculate_t_score(c_bigram, c_word1, c_word2, N):
    real_mean = c_bigram / N # p (MLE)
    expected_mean = (c_word1 / N) * (c_word2 / N) # H0
    variance = real_mean # for small p
    t_score = (real_mean - expected_mean) / math.sqrt(variance / N)
    return t_score

In [26]:
def calculate_chi_square(c_bigram, c_word1, c_word2, N):
    O11 = c_bigram # C(w1w2)
    O12 = c_word1 - O11 # C(~w1w2)
    O21 = c_word2 - O11 # C(w1~w2)
    O22 = N - (O11 + O12 + O21) # C(~w1~w2)
    chi_square = (N * (O11 * O22 - O12 * O21) ** 2) / ((O11 + O12) * (O11 + O21) * (O12 + O22) * (O21 + O22))
    return chi_square

In [27]:
def calculate_likelihood_ratio(c_bigram, c_word1, c_word2, N):
    p = c_word2 / N
    p1 = c_bigram / c_word1
    p2 = (c_word2 - c_bigram) / (N - c_word1)

    # Calculate the hypothesis likelihoods
    LH1 = binom.pmf(c_bigram, c_word1, p) * binom.pmf(c_word2 - c_bigram, N - c_word1, p)
    LH2 = binom.pmf(c_bigram, c_word1, p1) * binom.pmf(c_word2 - c_bigram, N - c_word1, p2)

    # Replace the zero with some very small ε>0 to avoid zero-division or log(0)
    epsilon = np.finfo(float).eps
    LH1 = max(LH1, epsilon)
    LH2 = max(LH2, epsilon)

    likelihood_ratio = -2 * math.log(LH1 / LH2)
    return likelihood_ratio

In [28]:
def calculate_scores(all_bigrams, bigram_frequencies, N):
    scores = []
    for bigram in all_bigrams:
        c_bigram = bigram_frequencies[bigram]
        c_word1 = sum([f for b, f in bigram_frequencies.items() if bigram[0] in b])//2
        c_word2 = sum([f for b, f in bigram_frequencies.items() if bigram[1] in b])//2
        t_score = calculate_t_score(c_bigram, c_word1, c_word2, N)
        chi_square = calculate_chi_square(c_bigram, c_word1, c_word2, N)
        likelihood_ratio = calculate_likelihood_ratio(c_bigram, c_word1, c_word2, N)
        scores.append((bigram, t_score, chi_square, likelihood_ratio, c_bigram, c_word1, c_word2))
    return scores

In [29]:
scores_1 = calculate_scores(collocation_candidates_1, bigram_frequencies_1, N_1)

In [30]:
# Sort by t-score (1) or chi-square (2) or likelihood ratio (3)
sorted_t_score_1 = sorted(scores_1, key=lambda x: x[1], reverse=True)
sorted_chi_square_1 = sorted(scores_1, key=lambda x: x[2], reverse=True)
sorted_likelihood_ratio_1 = sorted(scores_1, key=lambda x: x[3], reverse=True)

In [31]:
data_t_score_1 = []
for i, (bigram, t_score, chi_square, likelihood_ratio, c_bigram, c_word1, c_word2) in enumerate(sorted_t_score_1[:20]):
    data_t_score_1.append({
        "Rank": i+1,
        "Bigram": f"{bigram[0]} {bigram[1]}",
        "t-score": t_score,
        "chi-square": chi_square,
        "likelihood-ratio": likelihood_ratio,
        "c(w1w2)": c_bigram,
        "c(w1)": c_word1,
        "c(w2)": c_word2
    })

data_chi_square_1 = []
for i, (bigram, t_score, chi_square, likelihood_ratio, c_bigram, c_word1, c_word2) in enumerate(sorted_chi_square_1[:20]):
    data_chi_square_1.append({
        "Rank": i+1,
        "Bigram": f"{bigram[0]} {bigram[1]}",
        "t-score": t_score,
        "chi-square": chi_square,
        "likelihood-ratio": likelihood_ratio,
        "c(w1w2)": c_bigram,
        "c(w1)": c_word1,
        "c(w2)": c_word2
    })

data_likelihood_ratio_1 = []
for i, (bigram, t_score, chi_square, likelihood_ratio, c_bigram, c_word1, c_word2) in enumerate(sorted_likelihood_ratio_1[:20]):
    data_likelihood_ratio_1.append({
        "Rank": i+1,
        "Bigram": f"{bigram[0]} {bigram[1]}",
        "t-score": t_score,
        "chi-square": chi_square,
        "likelihood-ratio": likelihood_ratio,
        "c(w1w2)": c_bigram,
        "c(w1)": c_word1,
        "c(w2)": c_word2
    })

In [32]:
df_sorted_t_score_1 = pd.DataFrame(data_t_score_1)
df_sorted_chi_square_1 = pd.DataFrame(data_chi_square_1)
df_sorted_likelihood_ratio_1 = pd.DataFrame(data_likelihood_ratio_1)

In [33]:
pd.set_option('display.float_format', '{:.6f}'.format)
print(df_sorted_t_score_1[['Rank', 'Bigram', 't-score', 'c(w1w2)', 'c(w1)', 'c(w2)']])

    Rank                   Bigram   t-score  c(w1w2)  c(w1)  c(w2)
0      1      stepan trofimovitch 22.619069      512    525    513
1      2       pyotr stepanovitch 22.547831      509    834    509
2      3         varvara petrovna 20.534433      422    474    507
3      4        katerina ivanovna 20.239065      410    427    635
4      5  nikolay vsyevolodovitch 17.657104      312    518    312
5      6        fyodor pavlovitch 17.052922      291    306    461
6      7                  old man 16.857563      289   1356   2546
7      8      nastasia philipovna 15.583748      243    417    251
8      9                young man 15.140569      232    776   2546
9     10                old woman 14.353161      208   1356   1047
10    11         yulia mihailovna 14.175298      201    215    202
11    12         pyotr petrovitch 13.100114      172    834    331
12    13    lizabetha prokofievna 13.074941      171    185    177
13    14               great deal 12.790646      164   1202   

In [34]:
pd.set_option('display.float_format', '{:.2f}'.format)
print(df_sorted_chi_square_1[['Rank', 'Bigram', 'chi-square', 'c(w1w2)', 'c(w1)', 'c(w2)']])

    Rank                  Bigram  chi-square  c(w1w2)  c(w1)  c(w2)
0      1     stepan trofimovitch  1387728.43      512    525    513
1      2    ippolit kirillovitch  1359440.81       41     43     41
2      3       lef nicolaievitch  1359440.81       41     43     41
3      4       avdotya romanovna  1341882.35      112    119    112
4      5        yulia mihailovna  1326304.32      201    215    202
5      6         nikodim fomitch  1316081.54       24     24     26
6      7   lizabetha prokofievna  1273169.85      171    185    177
7      8    mavriky nikolaevitch  1263071.68      132    149    132
8      9     trifon borissovitch  1235650.87       39     45     39
9     10      rodion romanovitch  1205266.43       82     97     82
10    11      mihail makarovitch  1197632.52       21     25     21
11    12  gavrila ardalionovitch  1173526.56       58     61     67
12    13        arina prohorovna  1120123.57       39     44     44
13    14        varvara petrovna  1056418.47    

In [35]:
pd.set_option('display.float_format', '{:.6f}'.format)
print(df_sorted_likelihood_ratio_1[['Rank', 'Bigram', 'likelihood-ratio', 'c(w1w2)', 'c(w1)', 'c(w2)']])

    Rank                 Bigram  likelihood-ratio  c(w1w2)  c(w1)  c(w2)
0      1   ippolit kirillovitch         69.521040       41     43     41
1      2      lef nicolaievitch         69.521040       41     43     41
2      3        nikodim fomitch         69.473603       24     24     26
3      4     mihail makarovitch         68.994638       21     25     21
4      5    trifon borissovitch         68.572449       39     45     39
5      6         kuzma kuzmitch         68.402672       20     29     20
6      7      avdotya romanovna         68.340263      112    119    112
7      8     semyon zaharovitch         68.147640       10     51     10
8      9    semyon yakovlevitch         67.918141       37     51     37
9     10  mavriky mavrikyevitch         67.912991       11    149     11
10    11     rodion romanovitch         67.697947       82     97     82
11    12                    o u         67.662663       14    225     14
12    13               von sohn         67.592173  

In [36]:
scores_3 = calculate_scores(collocation_candidates_3, bigram_frequencies_3, N_3)

In [37]:
# Sort by t-score (1) or chi-square (2) or likelihood ratio (3)
sorted_t_score_3 = sorted(scores_3, key=lambda x: x[1], reverse=True)
sorted_chi_square_3 = sorted(scores_3, key=lambda x: x[2], reverse=True)
sorted_likelihood_ratio_3 = sorted(scores_3, key=lambda x: x[3], reverse=True)

In [38]:
data_t_score_3 = []
for i, (bigram, t_score, chi_square, likelihood_ratio, c_bigram, c_word1, c_word2) in enumerate(sorted_t_score_3[:20]):
    data_t_score_3.append({
        "Rank": i+1,
        "Bigram": f"{bigram[0]} {bigram[1]}",
        "t-score": t_score,
        "chi-square": chi_square,
        "likelihood-ratio": likelihood_ratio,
        "c(w1w2)": c_bigram,
        "c(w1)": c_word1,
        "c(w2)": c_word2
    })

data_chi_square_3 = []
for i, (bigram, t_score, chi_square, likelihood_ratio, c_bigram, c_word1, c_word2) in enumerate(sorted_chi_square_3[:20]):
    data_chi_square_3.append({
        "Rank": i+1,
        "Bigram": f"{bigram[0]} {bigram[1]}",
        "t-score": t_score,
        "chi-square": chi_square,
        "likelihood-ratio": likelihood_ratio,
        "c(w1w2)": c_bigram,
        "c(w1)": c_word1,
        "c(w2)": c_word2
    })

data_likelihood_ratio_3 = []
for i, (bigram, t_score, chi_square, likelihood_ratio, c_bigram, c_word1, c_word2) in enumerate(sorted_likelihood_ratio_3[:20]):
    data_likelihood_ratio_3.append({
        "Rank": i+1,
        "Bigram": f"{bigram[0]} {bigram[1]}",
        "t-score": t_score,
        "chi-square": chi_square,
        "likelihood-ratio": likelihood_ratio,
        "c(w1w2)": c_bigram,
        "c(w1)": c_word1,
        "c(w2)": c_word2
    })

In [39]:
df_sorted_t_score_3 = pd.DataFrame(data_t_score_3)
df_sorted_chi_square_3 = pd.DataFrame(data_chi_square_3)
df_sorted_likelihood_ratio_3 = pd.DataFrame(data_likelihood_ratio_3)

In [40]:
pd.set_option('display.float_format', '{:.6f}'.format)
print(df_sorted_t_score_3[['Rank', 'Bigram', 't-score', 'c(w1w2)', 'c(w1)', 'c(w2)']])

    Rank                   Bigram   t-score  c(w1w2)  c(w1)  c(w2)
0      1      stepan trofimovitch 22.602372      512   1575   1539
1      2       pyotr stepanovitch 22.521479      509   2501   1526
2      3         varvara petrovna 20.518057      422   1421   1520
3      4        katerina ivanovna 20.220295      410   1281   1904
4      5  nikolay vsyevolodovitch 17.644302      312   1553    935
5      6        fyodor pavlovitch 17.041366      291    917   1381
6      7                  old man 16.603300      290   4066   7633
7      8      nastasia philipovna 15.574371      243   1249    752
8      9                young man 15.025591      234   2327   7633
9     10                old woman 14.459309      215   4066   3140
10    11         yulia mihailovna 14.171001      201    645    606
11    12                  o clock 13.221849      175    675    579
12    13    lizabetha prokofievna 13.071447      171    554    530
13    14         pyotr petrovitch 13.070649      172   2501   

In [41]:
pd.set_option('display.float_format', '{:.2f}'.format)
print(df_sorted_chi_square_3[['Rank', 'Bigram', 'chi-square', 'c(w1w2)', 'c(w1)', 'c(w2)']])

    Rank                  Bigram  chi-square  c(w1w2)  c(w1)  c(w2)
0      1     stepan trofimovitch   461893.16      512   1575   1539
1      2    ippolit kirillovitch   453091.95       41    129    123
2      3       lef nicolaievitch   453091.95       41    129    123
3      4       avdotya romanovna   447144.47      112    357    336
4      5        yulia mihailovna   441833.13      201    645    606
5      6         nikodim fomitch   438661.54       24     72     78
6      7   lizabetha prokofievna   425730.11      171    554    530
7      8    mavriky nikolaevitch   420847.60      132    447    396
8      9     trifon borissovitch   411831.33       39    135    117
9     10      mihail makarovitch   411103.42       21     74     62
10    11      rodion romanovitch   404676.92       82    290    245
11    12  gavrila ardalionovitch   391097.91       58    183    201
12    13        arina prohorovna   373322.26       39    132    132
13    14        varvara petrovna   352056.50    

In [42]:
pd.set_option('display.float_format', '{:.6f}'.format)
print(df_sorted_likelihood_ratio_3[['Rank', 'Bigram', 'likelihood-ratio', 'c(w1w2)', 'c(w1)', 'c(w2)']])

    Rank                 Bigram  likelihood-ratio  c(w1w2)  c(w1)  c(w2)
0      1     semyon zaharovitch         63.155763       10    153     30
1      2  mavriky mavrikyevitch         62.924804       11    447     33
2      3                je suis         62.794400       10    138     39
3      4           thou wouldst         62.682332       11    506     39
4      5             dark brown         62.548382       10    623     45
5      6               wisp tow         62.528363       14     54     48
6      7                    o u         62.443396       14    675     42
7      8             full speed         62.244908       14    906     48
8      9             eye glowed         62.241415       10   3588     57
9     10              eye shone         62.236514       11   3588     54
10    11       thousand roubles         62.172252       11   1840     57
11    12       literary matinée         62.063738       10    198     69
12    13            de cominges         62.061037  

In [43]:
scores_head_clerk = calculate_scores([('head', 'clerk')], bigram_frequencies_1, N_1)

In [44]:
df_scores_head_clerk = pd.DataFrame(scores_head_clerk, columns=["Bigram", "t-score", "chi-square", "likelihood-ratio", "c(w1w2)", "c(w1)", "c(w2)"])

In [45]:
print(df_scores_head_clerk)

          Bigram  t-score  chi-square  likelihood-ratio  c(w1w2)  c(w1)  c(w2)
0  (head, clerk) 4.674126 6294.816683         60.603199       22    801    136


In [46]:
scores_great_man = calculate_scores([('great', 'man')], bigram_frequencies_1, N_1)

In [47]:
df_scores_great_man = pd.DataFrame(scores_great_man, columns=["Bigram", "t-score", "chi-square", "likelihood-ratio", "c(w1w2)", "c(w1)", "c(w2)"])

In [48]:
print(df_scores_great_man)

         Bigram  t-score  chi-square  likelihood-ratio  c(w1w2)  c(w1)  c(w2)
0  (great, man) 3.736722  117.402985         45.158766       18   1202   2546
