# **IMPORTS**

In [None]:
!pip -q install --upgrade nltk

import nltk
import time 
import numpy as np
import pandas as pd

from tqdm import tqdm
from scipy.stats import binom

from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords

nltk.download("punkt")
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('universal_tagset')
nltk.download('averaged_perceptron_tagger')

pd.set_option('display.float_format', lambda x: f'{x:.6f}')

# **HELPER FUNCTIONS**

In [2]:
############# HELPER #############
class CustomLemmatizer:
    tag_dict = {"ADJ": wordnet.ADJ,
                "NOUN": wordnet.NOUN}
    lemmatizer = WordNetLemmatizer()

    def lemmatize(self, word_pos_tuple):
        word = word_pos_tuple[0]
        pos_tag = word_pos_tuple[1]
        if pos_tag in self.tag_dict:
            return self.lemmatizer.lemmatize(word, self.tag_dict[pos_tag]).lower()
        else:
            return word.lower()

def bigram_counter(text, window_size):
    temp_list = []
    for i in tqdm(range(len(text) - 1)):
        for j in range(window_size):
            try:
                collacation = [text[i],text[i+j+1]]
                temp_list.append(collacation)
            except:
                pass
    bigram_list, bigram_counts = np.unique(temp_list, axis=0, return_counts=True)
    bigram_dict = {' '.join(key): value for key, value in zip(bigram_list.tolist(), bigram_counts.tolist())}
    return bigram_list.tolist(), bigram_counts.tolist(), bigram_dict

def filter_collacations(lemmatized_tokens, pos_tags, window_size, min_frequency=10):
    stop_words = set(stopwords.words('english'))
    collocation_candidates = []
    asd = []
    for i in tqdm(range(len(lemmatized_tokens)-1)):
        for j in range(window_size):
            try:
                bigram = (lemmatized_tokens[i], lemmatized_tokens[i+j+1])
                asd.append(bigram)
                pos_tag_coll = [pos_tags[i][1], pos_tags[i+j+1][1]]
                if pos_tag_coll == ['ADJ', 'NOUN'] or pos_tag_coll == ['NOUN', 'NOUN']:
                    # Eliminate stopwords
                    if bigram[0].lower() not in stop_words and bigram[1].lower() not in stop_words:
                        # Eliminate punctuations
                        if all(char.isalpha() for char in bigram[0]) and all(char.isalpha() for char in bigram[1]):
                            # Eliminate bigrams with frequency less than min_frequency
                            collocation_candidates.append(bigram)
            except:
                pass

    _,_, bigram_dict = bigram_counter(lemmatized_tokens, window_size)
    temp_col = np.unique(collocation_candidates, axis=0).tolist()
    filtered_col = []
    filtered_counts = []

    for j, col in enumerate(temp_col):
        if bigram_dict[" ".join(col)] >= min_frequency:
            filtered_col.append(col)
            filtered_counts.append(bigram_dict[" ".join(col)])

    return filtered_col, filtered_counts

def statistical_test_collacation(bigram_list_count, token_list, method, window_size=1):
    c_w1 = []
    c_w2 = []
    c_w1w2 = []
    collacation = []
    score_list = []
    total_token = len(token_list)
    
    eps = 5e-324 # math.ulp(0.0)
    N = window_size * total_token

    words, counts = np.unique(token_list, return_counts=True)
    word_freq_dict = dict(zip(words.tolist(), counts.tolist()))

    for bigram, count in bigram_list_count:
        w1w2 = count
        w1 = window_size * word_freq_dict[bigram[0]]
        w2 = window_size * word_freq_dict[bigram[1]]
        
        if method == "t-test":
            x_bar = w1w2 / N
            mu = w1*w2 / N ** 2
            s_2 = x_bar # actual value is x_bar * (1 - x_bar), since x_bar is very
            # close to 0, assumed that (1 - x_bar) = 1
            
            test_score = (x_bar - mu) / np.sqrt(s_2 / N)
        
        elif method == "chi-square test":
            O11 = w1w2
            O12 = w2 - O11
            O21 = w1 - O11
            O22 = N - O12 - O21
            
            num = N * (O11*O22 - O12*O21) ** 2
            denom = (O11 + O12) * (O11 + O21) * (O12 + O22) * (O21 + O22)
            
            test_score = num / denom
            
        elif method == "likelihood ratio test":
            p = w2 / N
            p1 = w1w2 / w1
            p2 = (w2 - w1w2) / (N - w1)
            
            L_H1 = binom.pmf(w1w2, w1, p) * binom.pmf(w2-w1w2, N-w1, p)
            L_H2 = binom.pmf(w1w2, w1, p1) * binom.pmf(w2-w1w2, N-w1, p2)

            if L_H1 == 0:
                L_H1 = eps
            if L_H2 == 0:
                L_H2 = eps
            
            test_score = -2 * np.log(L_H1 / L_H2)
        
        else:
            raise ValueError(f"There is no {method}")

        c_w1.append(int(w1/window_size))
        c_w2.append(int(w2/window_size))
        c_w1w2.append(w1w2)
        collacation.append(" ".join(bigram))
        score_list.append(test_score)

    data = {
        "Bi-gram": collacation,
        f"{method[:-4]}score": score_list,
        "c(w1w2)": c_w1w2,
        "c(w1)": c_w1,
        "c(w2)": c_w2
    }

    df = pd.DataFrame(data)
    df = df.sort_values(by=f"{method[:-4]}score", ascending=False)
    df.index = pd.RangeIndex(start=1, stop=len(df) + 1, name="rank")
    return df

# **PART 1**

In [10]:
############# Part1 a #############
t0 = time.time()
file_path = "Fyodor Dostoyevski Processed.txt"
with open(file_path, "r") as file:
    text = file.read()
elapsed_time = time.time() - t0
print(f"Part 1a elapsed time: {elapsed_time:.4f} seconds")

############# Part1 b #############
t0 = time.time()
tokenized_text = word_tokenize(text)
elapsed_time = time.time() - t0
print(f"Part 1b elapsed time: {elapsed_time:.4f} seconds")

############# Part1 c #############
t0 = time.time()
pos_tags = pos_tag(tokenized_text, tagset="universal")
elapsed_time = time.time() - t0
print(f"Part 1c elapsed time: {elapsed_time:.4f} seconds")

############# Part1 d #############
t0 = time.time()
lemmatizer = CustomLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token_pos) for token_pos in tqdm(pos_tags)]
elapsed_time = time.time() - t0
print(f"Part 1d elapsed time: {elapsed_time:.4f} seconds")

############# Part1 e #############
t0 = time.time()
bigrams_size_1, bigrams_size_1_counts, _ = bigram_counter(lemmatized_tokens, window_size=1)
bigrams_size_3, bigrams_size_3_counts, _  = bigram_counter(lemmatized_tokens, window_size=3)
elapsed_time = time.time() - t0
print(f"Part 1e elapsed time: {elapsed_time:.4f} seconds")

############# Part1 f #############
t0 = time.time()
bigrams_filtered_size_1, bigrams_filtered_size_1_counts = filter_collacations(lemmatized_tokens, pos_tags, window_size=1, min_frequency=10)
bigrams_filtered_size_3, bigrams_filtered_size_3_counts = filter_collacations(lemmatized_tokens, pos_tags, window_size=3, min_frequency=10)
elapsed_time = time.time() - t0
print(f"Part 1f elapsed time: {elapsed_time:.4f} seconds")

# Print the number of bi-grams
print(f"\nThe number of bi-grams with window_size=1 is\n\tbefore filtering: {len(bigrams_size_1)} and after filtering: {len(bigrams_filtered_size_1)}")
print(f"\nThe number of bi-grams with window_size=3 is\n\tbefore filtering: {len(bigrams_size_3)} and after filtering: {len(bigrams_filtered_size_3)}")

Part 1a elapsed time: 0.0561 seconds
Part 1b elapsed time: 15.9923 seconds
Part 1c elapsed time: 58.6910 seconds


100%|██████████| 1425758/1425758 [00:02<00:00, 508197.83it/s]


Part 1d elapsed time: 2.8147 seconds


100%|██████████| 1425757/1425757 [00:03<00:00, 471673.65it/s]
100%|██████████| 1425757/1425757 [00:06<00:00, 224608.17it/s]


Part 1e elapsed time: 36.7585 seconds


100%|██████████| 1425757/1425757 [00:01<00:00, 823034.60it/s]
100%|██████████| 1425757/1425757 [00:02<00:00, 613840.46it/s]
100%|██████████| 1425757/1425757 [00:04<00:00, 310450.53it/s]
100%|██████████| 1425757/1425757 [00:05<00:00, 243648.02it/s]


Part 1f elapsed time: 41.6350 seconds

The number of bi-grams with window_size=1 is
	before filtering: 280830 and after filtering: 438

The number of bi-grams with window_size=3 is
	before filtering: 897060 and after filtering: 790


**Part 1 Answers**

In [11]:
words, counts = np.unique(lemmatized_tokens, return_counts=True)
word_count_dict = dict(zip(words.tolist(), counts.tolist()))

idx1 = bigrams_size_1.index(["magnificent", "capital"])
idx2 = bigrams_size_3.index(["bright", "fire"])

In [12]:
print("Part1b")
print(f"What is the number of tokens in the corpus? N = {len(tokenized_text)}")
print("Part1d")
print(f"Counts of word [that]   = {word_count_dict['that']}")
print(f"Counts of word [the]    = {word_count_dict['the']}")
print(f"Counts of word [abject] = {word_count_dict['abject']}")
print(f"Counts of word [london] = {word_count_dict['london']}")
print(f"Counts of word [.]      = {word_count_dict['.']}")
print("Part1e")
print(f"'magnificent capital' occur in windows of size 1: {bigrams_size_1_counts[idx1]}")
print(f"'bright fire' occur in windows of size 1: {bigrams_size_3_counts[idx2]}")
print("Part1f")
try:
    idx3 = bigrams_filtered_size_1.index(["mr.", "skimpole"])
    print(f"'Mr. Skimpole' occur in windows of size 1: {bigrams_size_3_counts[idx3]}")
except:
    print("'Mr. Skimpole' doesnt exists after filtering")
try:
    idx4 = bigrams_filtered_size_3.index(["spontaneous", "combustion"])
    print(f"'spontaneous combustion' occur in windows of size 1: {bigrams_size_3_counts[idx4]}")
except:
    print("'spontaneous combustion' doesnt exists after filtering")

Part1b
What is the number of tokens in the corpus? N = 1425758
Part1d
Counts of word [that]   = 19429
Counts of word [the]    = 48392
Counts of word [abject] = 21
Counts of word [london] = 2
Counts of word [.]      = 51738
Part1e
'magnificent capital' occur in windows of size 1: 1
'bright fire' occur in windows of size 1: 1
Part1f
'Mr. Skimpole' doesnt exists after filtering
'spontaneous combustion' doesnt exists after filtering


# **PART 2**

In [13]:
# t-test, Chi-Sqaure test and Likelihood Ratio Test for Collacation Candidates with window_size=1
df_size_1_t_test   = statistical_test_collacation(zip(bigrams_filtered_size_1, bigrams_filtered_size_1_counts), lemmatized_tokens, method = "t-test", window_size = 1)
df_size_1_chi_test = statistical_test_collacation(zip(bigrams_filtered_size_1, bigrams_filtered_size_1_counts), lemmatized_tokens, method = "chi-square test", window_size = 1)
df_size_1_mle_test = statistical_test_collacation(zip(bigrams_filtered_size_1, bigrams_filtered_size_1_counts), lemmatized_tokens, method = "likelihood ratio test", window_size = 1)

# t-test, Chi-Sqaure test and Likelihood Ratio Test for Collacation Candidates with window_size=3
df_size_3_t_test   = statistical_test_collacation(zip(bigrams_filtered_size_3, bigrams_filtered_size_3_counts), lemmatized_tokens, method = "t-test", window_size = 3)
df_size_3_chi_test = statistical_test_collacation(zip(bigrams_filtered_size_3, bigrams_filtered_size_3_counts), lemmatized_tokens, method = "chi-square test", window_size = 3)
df_size_3_mle_test = statistical_test_collacation(zip(bigrams_filtered_size_3, bigrams_filtered_size_3_counts), lemmatized_tokens, method = "likelihood ratio test", window_size = 3)

In [14]:
print("Statistical Test Analysis for Bigrams with window_size=1")
print(df_size_1_t_test.head(20))
print("\n")
print(df_size_1_chi_test.head(20))
print("\n")
print(df_size_1_mle_test.head(20))
print("\n")
print("\n")
print("Statistical Test Analysis for Bigrams with window_size=3")
print(df_size_3_t_test.head(20))
print("\n")
print(df_size_3_chi_test.head(20))
print("\n")
print(df_size_3_mle_test.head(20))

Statistical Test Analysis for Bigrams with window_size=1
                      Bi-gram   t-score  c(w1w2)  c(w1)  c(w2)
rank                                                          
1         stepan trofimovitch 22.619069      512    525    513
2          pyotr stepanovitch 22.547831      509    834    509
3            varvara petrovna 20.534433      422    474    507
4           katerina ivanovna 20.239065      410    427    635
5     nikolay vsyevolodovitch 17.657104      312    518    312
6           fyodor pavlovitch 17.052922      291    306    461
7                     old man 16.857563      289   1356   2546
8         nastasia philipovna 15.583748      243    417    251
9                   young man 15.140569      232    776   2546
10                  old woman 14.353161      208   1356   1047
11           yulia mihailovna 14.175298      201    215    202
12           pyotr petrovitch 13.100114      172    834    331
13      lizabetha prokofievna 13.074941      171    185    17

# **PART 3**

In [15]:
idx1 = bigrams_filtered_size_1.index(["head", "clerk"])
idx2 = bigrams_filtered_size_1.index(["great", "man"])

count1 = bigrams_filtered_size_1_counts[idx1]
count2 = bigrams_filtered_size_1_counts[idx2]

bigrams_list = [["head", "clerk"], ["great", "man"]]
bigrams_count = [count1, count2]

# t-test, Chi-Sqaure test and Likelihood Ratio Test for 2 Specific Collacation with window_size=1
df_size_1_t_test_sub   = statistical_test_collacation(zip(bigrams_list, bigrams_count), lemmatized_tokens, method = "t-test", window_size = 1)
df_size_1_chi_test_sub = statistical_test_collacation(zip(bigrams_list, bigrams_count), lemmatized_tokens, method = "chi-square test", window_size = 1)
df_size_1_mle_test_sub = statistical_test_collacation(zip(bigrams_list, bigrams_count), lemmatized_tokens, method = "likelihood ratio test", window_size = 1)

In [16]:
print("Statistical Test Analysis for Subset of Bigrams with window_size=1")
print("\n")
print(df_size_1_t_test_sub)
print("\n")
print(df_size_1_chi_test_sub)
print("\n")
print(df_size_1_mle_test_sub)

Statistical Test Analysis for Subset of Bigrams with window_size=1


         Bi-gram  t-score  c(w1w2)  c(w1)  c(w2)
rank                                            
1     head clerk 4.674126       22    801    136
2      great man 3.736722       18   1202   2546


         Bi-gram  chi-square score  c(w1w2)  c(w1)  c(w2)
rank                                                     
1     head clerk       6294.821739       22    801    136
2      great man        117.403486       18   1202   2546


         Bi-gram  likelihood ratio score  c(w1w2)  c(w1)  c(w2)
rank                                                           
1     head clerk              209.662656       22    801    136
2      great man               45.158788       18   1202   2546
