# Laboratory work #2 (n-grams)

In [None]:
import os


import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import re
from nltk.util import ngrams
from nltk.corpus import stopwords
from collections import Counter

import math
from nltk.probability import FreqDist
from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures

import matplotlib.pyplot as plt
from pandas.errors import EmptyDataError

In [None]:
def read_files(root_dir, n=None):
    """Load annotated ``.tsv`` documents found anywhere under ``root_dir``.

    Every non-blank line of a file must consist of exactly three
    tab-separated fields (token, stem, lemma). A blank line closes the
    current sentence and advances the sentence counter.

    Args:
        root_dir: Directory searched recursively for files ending in ``.tsv``.
        n: Optional cap on how many files (in sorted path order) are
            considered. ``None`` means no cap.

    Returns:
        A pair ``(ids, all_data)`` of equal length. ``all_data[k]`` is a
        DataFrame with the columns ``Token``, ``Stem``, ``Lemma`` and
        ``Sentence_Index``; ``ids[k]`` is the base name (extension removed)
        of the file that frame was read from. Files that contain no rows or
        that cannot be read/parsed are reported and left out of both lists.
    """
    # os.walk yields entries in arbitrary order; sorting makes the subset
    # selected through ``n`` reproducible between runs and machines.
    file_paths = sorted(
        os.path.join(subdir, file)
        for subdir, _dirs, files in os.walk(root_dir)
        for file in files
        if file.endswith('.tsv')
    )

    ids = []
    all_data = []
    for i, file_path in enumerate(file_paths):
        if n is not None and i >= n:
            break

        rows = []
        sentence_index = 0
        try:
            # Explicit encoding so the result does not depend on the locale.
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    line = line.strip()
                    if not line:
                        sentence_index += 1
                        continue
                    token, stem, lemma = line.split('\t')
                    rows.append({'Token': token, 'Stem': stem, 'Lemma': lemma, 'Sentence_Index': sentence_index})
        except (OSError, ValueError) as e:
            # ValueError covers both a line with the wrong number of fields
            # and undecodable bytes; one bad file must not abort the load.
            print(i, file_path, e)
            continue

        if rows:
            # Record the id together with the frame so both lists stay aligned.
            ids.append(os.path.splitext(os.path.basename(file_path))[0])
            all_data.append(pd.DataFrame(rows))
    return ids, all_data


In [None]:
# Load the annotated train and test splits; the validation split is not
# loaded here. Pass a file count as the second argument to read_files to
# work on a smaller subset while experimenting.
train_ids, train = read_files('../assets/annotated-corpus/train')
test_ids, test = read_files('../assets/annotated-corpus/test')

In [None]:
# Preview the first 40 rows of the second loaded training document.
train[1].head(40)

In [None]:
# Number of training documents that were loaded.
len(train)

In [None]:
def get_n_gram_freq(df_list, language='english', n=3):
    """Count stem n-grams over a collection of annotated documents.

    For every document the ``Stem`` column is stripped of punctuation,
    lower-cased and filtered (stop words and empty strings are dropped);
    n-grams are then taken over the remaining stems of that document.
    Sentence boundaries inside a document are not taken into account.

    Args:
        df_list: Iterable of DataFrames that have a ``Stem`` column of strings.
        language: Name of the NLTK stop-word list to filter with.
        n: Order of the n-grams to count; defaults to 3 (trigrams).

    Returns:
        A ``Counter`` mapping each n-gram (a tuple of ``n`` stems) to the
        number of times it occurs across all documents.
    """
    stop_words = set(stopwords.words(language))
    punctuation = re.compile(r'[^\w\s]')

    n_gram_freq = Counter()
    for df in df_list:
        try:
            stems = df['Stem'].apply(lambda x: punctuation.sub('', x)).str.lower()
            kept = [stem for stem in stems if stem not in stop_words and stem.strip() != '']
        except (KeyError, TypeError, AttributeError) as exc:
            # A frame without a usable 'Stem' column is reported and skipped;
            # unlike a bare ``except`` this does not swallow KeyboardInterrupt.
            print('error with df:')
            print(df)
            print(repr(exc))
            continue
        # Update the counter per document instead of materialising one huge
        # list of every n-gram in the corpus.
        n_gram_freq.update(ngrams(kept, n))

    return n_gram_freq

In [None]:
# Count stem trigrams over all training documents.
train_freq = get_n_gram_freq(train)

In [None]:
# Display the trigram counter computed for the training documents.
train_freq

In [None]:
def calculate_MI(n_grams, total_words, word_freq):
    """Compute the (pointwise) mutual information of every n-gram.

    For an n-gram ``w1 .. wk`` with count ``c`` the score is::

        log2( c * N**(k - 1) / (f(w1) * ... * f(wk)) )

    which is ``log2(P(w1..wk) / (P(w1) * ... * P(wk)))`` with all
    probabilities estimated as ``count / N``.

    Args:
        n_grams: Mapping from n-gram (tuple of words) to its count.
        total_words: Corpus size ``N`` used to turn counts into probabilities.
        word_freq: Mapping from word to its frequency ``f(word)``.

    Returns:
        A dict mapping each n-gram to its MI score.
    """
    mi_scores = {}
    for n_gram, n_gram_count in n_grams.items():
        # Every word of the n-gram contributes its marginal frequency;
        # leaving one out inflates the score of n-grams ending in a
        # frequent word.
        marginal_product = 1
        for word in n_gram:
            marginal_product *= word_freq[word]

        joint = n_gram_count * pow(total_words, len(n_gram) - 1)
        mi_scores[n_gram] = math.log2(joint / marginal_product)
    return mi_scores


def get_mi_scores(freq):
    """Derive word statistics from n-gram counts and score every n-gram.

    Args:
        freq: Mapping from n-gram (tuple of words) to its count, e.g. the
            ``Counter`` of trigrams built from the corpus.

    Returns:
        The dict produced by ``calculate_MI``: n-gram -> MI score.
    """
    total_words = sum(freq.values())

    # Word frequencies must reflect how often each n-gram occurs, not merely
    # which distinct n-grams exist. Each occurrence spreads one unit over its
    # positions, so the word frequencies add up to ``total_words`` and the
    # implied word probabilities form a proper distribution.
    word_freq = FreqDist()
    for n_gram, count in freq.items():
        if not n_gram:
            continue
        share = count / len(n_gram)
        for word in n_gram:
            word_freq[word] += share

    mi_scores = calculate_MI(freq, total_words, word_freq)
    return mi_scores


def get_mi_scores_nltk(df_list, language='english'):
    """Score corpus trigrams with NLTK's PMI association measure.

    The ``Stem`` column of every document is stripped of punctuation,
    lower-cased and filtered (stop words and empty strings are dropped).
    The documents are then joined into one text, tokenized with
    ``nltk.word_tokenize`` and passed to ``TrigramCollocationFinder``.

    Args:
        df_list: Iterable of DataFrames that have a ``Stem`` column of strings.
        language: Language used for both the stop-word list and the tokenizer.

    Returns:
        The result of ``finder.score_ngrams(trigram_measures.pmi)``: a list
        of ``(trigram, score)`` pairs.
    """
    stop_words = set(stopwords.words(language))
    punctuation = re.compile(r'[^\w\s]')

    documents = []
    for df in df_list:
        try:
            words = df['Stem'].apply(lambda x: punctuation.sub('', x)).str.lower()
            words = [word for word in words if word not in stop_words and word.strip() != '']
        except (KeyError, TypeError, AttributeError) as exc:
            # Report and skip unusable frames without swallowing
            # KeyboardInterrupt the way a bare ``except`` would.
            print('error with df:')
            print(df)
            print(repr(exc))
            continue
        if words:
            documents.append(' '.join(words))

    # Join documents with a space: concatenating them directly would fuse the
    # last word of one document with the first word of the next into a single
    # bogus token. A single join also avoids quadratic string growth.
    full_text = ' '.join(documents)

    tokens = nltk.word_tokenize(full_text, language, True)
    text = nltk.Text(tokens)

    trigram_measures = TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(text)
    nltk_mi_scores = finder.score_ngrams(trigram_measures.pmi)
    return nltk_mi_scores

In [None]:
# Score every counted training trigram with the hand-written MI implementation.
train_mi_scores = get_mi_scores(train_freq)

In [None]:
# Score the training trigrams with NLTK's PMI measure for comparison.
train_mi_scores_nltk = get_mi_scores_nltk(train)

In [None]:
# Show the n trigrams with the highest hand-computed MI score.
n = 30
sorted_mi_scores = sorted(
    train_mi_scores.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:n]

print(f'Top {n} trigrams MI:')
for trigram, score in sorted_mi_scores:
    print(f'{trigram}: {score}')

In [None]:
# Show the n trigrams with the lowest hand-computed MI score. The complete
# ascending list is kept in sorted_mi_scores; only the printout is truncated.
n = 30
sorted_mi_scores = sorted(train_mi_scores.items(), key=lambda pair: pair[1])

print(f'Last top {n} trigrams MI:')
for trigram, score in sorted_mi_scores[:n]:
    print(f'{trigram}: {score}')

In [None]:
# Print the first n (trigram, score) pairs returned by the NLTK scorer.
print(f'Top {n} trigrams MI with nltk:')
for trigram, score in train_mi_scores_nltk[:n]:
    print(f'{trigram}: {score}')

In [None]:
# Index the NLTK scores by trigram so they can be looked up directly.
nltk_mi_dict = dict(train_mi_scores_nltk)

# Keep only trigrams scored by both implementations, in the order of
# sorted_mi_scores, as (trigram, own MI, NLTK MI) triples.
matched_mi_scores = [
    (trigram, mi_score, nltk_mi_dict[trigram])
    for trigram, mi_score in sorted_mi_scores
    if trigram in nltk_mi_dict
]

In [None]:
# Plot both MI estimates for the first 1000 matched trigrams.
df = pd.DataFrame(matched_mi_scores, columns=['Trigram', 'MI', 'NLTK_MI']).head(1000)
x_indexes = range(df.shape[0])

plt.figure(figsize=(15, 8))
plt.plot(x_indexes, df['MI'], marker='o', label='MI calculation')
plt.plot(x_indexes, df['NLTK_MI'], marker='x', label='NLTK MI calculation')

plt.title('Comparison of MI scores between own calculation and NLTK')
plt.xlabel('Trigram Index')
plt.ylabel('MI Score')

plt.legend()
plt.show()

In [None]:
# Plot both MI estimates for every matched trigram. The NLTK series is drawn
# first and almost transparent so the own calculation stays visible on top.
df = pd.DataFrame(matched_mi_scores, columns=['Trigram', 'MI', 'NLTK_MI'])
x_indexes = range(df.shape[0])

plt.figure(figsize=(15, 8))
plt.plot(x_indexes, df['NLTK_MI'], marker='x', alpha=0.1, label='NLTK MI calculation')
plt.plot(x_indexes, df['MI'], marker='o', alpha=0.9, label='MI calculation')

plt.title('Comparison of MI scores between own calculation and NLTK')
plt.xlabel('Trigram Index')
plt.ylabel('MI Score')

plt.legend()
plt.show()

In [None]:
# Show the first rows of the comparison table.
df.head()

In [None]:
# Show 10 randomly sampled rows of the comparison table.
df.sample(10)