In [111]:
# Directory with the annotated .tsv documents that the notebook processes.
dataset_path = "../../nechkasova-tokenizer/assets/annotated-corpus/train/alt.atheism/"
# One individual document from that directory (not referenced by the cells shown here).
dataset_path_file = "../../nechkasova-tokenizer/assets/annotated-corpus/train/alt.atheism/49960.tsv"

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus so that `stopwords.words` can load it.
nltk.download('stopwords')
# Kept as a set: process_file only needs membership tests against it.
stop_words = {word for word in stopwords.words('english')}

In [113]:
import re
import string

def read_data(file_path, encoding='utf-8'):
    """Read a text file and return its lines.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        list[str]: The lines of the file in order, each still carrying its
        line terminator (as produced by ``readlines``).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid for ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.readlines()

def clean_text(text):
    """Drop every ASCII punctuation character from ``text`` and lowercase the rest."""
    # Deleting via a translation table removes exactly the characters listed in
    # string.punctuation, one by one.
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    return without_punctuation.lower()

In [114]:
def process_file(file_path):
    """Collect the cleaned, non-stop-word stems from one annotated file.

    Every non-blank line is expected to hold exactly three tab-separated
    fields (token, stem, lemma) after surrounding whitespace is stripped.
    Lines with any other number of fields are skipped.

    Args:
        file_path: Path of the file, passed on to ``read_data``.

    Returns:
        list[str]: Stems in file order, after ``clean_text``, excluding
        empty results and entries found in ``stop_words``.
    """
    stems = []

    for line in read_data(file_path):
        stripped = line.strip()
        if not stripped:
            continue

        fields = stripped.split('\t')
        # Explicit field-count check instead of a bare ``except``: malformed
        # rows are still skipped, but unrelated errors (and KeyboardInterrupt)
        # are no longer silently swallowed.
        if len(fields) != 3:
            continue

        cleaned_stem = clean_text(fields[1])
        if cleaned_stem and cleaned_stem not in stop_words:
            stems.append(cleaned_stem)

    return stems

In [115]:
import os

def process_directory(directory_path):
    """Run ``process_file`` on every file below ``directory_path``.

    The tree is walked recursively. Directories and files are visited in
    sorted name order so that the order of the returned documents is the
    same on every run and every file system (``os.walk`` itself makes no
    ordering promise).

    Args:
        directory_path: Root directory to walk.

    Returns:
        list[list[str]]: One list of stems per file, in traversal order.
        Empty if the directory has no files or does not exist.
    """
    all_stems = []

    for root, dirs, files in os.walk(directory_path):
        # Sorting ``dirs`` in place steers the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            all_stems.append(process_file(os.path.join(root, file)))

    return all_stems

In [None]:
# One list of stems per document found under dataset_path.
stems = process_directory(dataset_path)
# Flattened view: the stems of all documents concatenated into a single list.
all_stems = [stem for document_stems in stems for stem in document_stems]
print(len(all_stems))
stems

In [117]:
# Write test on it
def generate_bigrams(tokens):
    """Return all pairs of adjacent items, built separately inside each group.

    ``tokens`` is a sequence of token groups; pairs never span two groups.
    The result is a flat list of 2-tuples in group order.
    """
    return [
        pair
        for group in tokens
        for pair in zip(group, group[1:])
    ]

# Write test on it
def generate_trigrams(tokens):
    """Return all triples of adjacent items, built separately inside each group.

    ``tokens`` is a sequence of token groups; triples never span two groups.
    The result is a flat list of 3-tuples in group order.
    """
    return [
        triple
        for group in tokens
        for triple in zip(group, group[1:], group[2:])
    ]

In [None]:
trigrams = generate_trigrams(stems)
bigrams = generate_bigrams(stems)
# A document with M stems yields max(M - N + 1, 0) N-grams. N-grams are built
# per document and never span a document boundary, so the expected total is
# the sum over documents, not one M - N + 1 over the flattened corpus.
print(sum(max(len(document) - 2 + 1, 0) for document in stems), len(bigrams))
print(sum(max(len(document) - 3 + 1, 0) for document in stems), len(trigrams))
trigrams

In [119]:
# Write test on it
def count_bigram_entry(bigram, bigrams):
    """Sort every entry of ``bigrams`` into one of four counters for ``bigram``.

    Matching ignores position: an entry's first word may equal either word of
    ``bigram``. The counters are returned as ``(O_11, O_12, O_21, O_22)``:

    * O_11 - the entry's first word is in ``bigram`` and its second word is
      the other word of ``bigram`` (so the reversed pair counts as well);
    * O_12 - the entry's first word is in ``bigram`` but the second word is
      not the other one;
    * O_21 - only the entry's second word is in ``bigram``;
    * O_22 - neither word of the entry is in ``bigram``.
    """
    both = first_only = second_only = neither = 0

    for candidate in bigrams:
        if candidate[0] in bigram:
            # The word of `bigram` that the candidate's first word did NOT match.
            partner = bigram[1 - bigram.index(candidate[0])]
            if candidate[1] == partner:
                both += 1
            else:
                first_only += 1
        elif candidate[1] in bigram:
            second_only += 1
        else:
            neither += 1

    return both, first_only, second_only, neither

In [None]:
# A list rather than a set: a set literal would silently drop repeated
# bigrams, and the counting functions need every occurrence.
bigram_list = [('foo', 'bar'),
               ('foo', 'baz'),
               ('qux', 'quux'),
               ('quux', 'foo'),
               ('corge', 'bar'),
               ('bar', 'grault')]
bigram = ('foo', 'bar')

print(count_bigram_entry(bigram, bigram_list))

In [121]:
from nltk import FreqDist
            
# Write test on it
def count_token_freq(tokens):
    """Build an NLTK ``FreqDist`` over ``tokens`` and return it."""
    return FreqDist(tokens)

In [122]:
# Write test on it
def count_trigram_entry(trigram, trigrams):
    """Classify each entry of ``trigrams`` by which words of ``trigram`` it contains.

    Matching ignores position ("with rearrangement"): the three words of
    ``trigram`` are looked up one after another anywhere in the entry, and
    an element of the entry that has matched one word is not available for
    the following words. Previously the third word was looked up in the
    whole entry when the first word was absent, so a single element could
    satisfy both the second and the third word when those two are equal.

    Args:
        trigram: The target trigram (indexable, three words).
        trigrams: Iterable of trigram entries to classify.

    Returns:
        tuple: ``(O_111, O_121, O_211, O_221, O_112, O_122, O_212, O_222)``.
        In ``O_abc``, ``a`` refers to the second word, ``b`` to the third
        word and ``c`` to the first word of ``trigram``; ``1`` means the
        word was found in the entry, ``2`` means it was not.
    """
    # Slot layout matches the returned tuple:
    # O_111, O_121, O_211, O_221, O_112, O_122, O_212, O_222
    cells = [0] * 8
    target_words = (trigram[0], trigram[1], trigram[2])

    for trigram_entity in trigrams:
        remaining = list(trigram_entity)
        found = []
        for word in target_words:
            if word in remaining:
                # Consume the matched element so it cannot match again.
                remaining.remove(word)
                found.append(True)
            else:
                found.append(False)

        has_first, has_second, has_third = found
        slot = (0 if has_first else 4) + (0 if has_second else 2) + (0 if has_third else 1)
        cells[slot] += 1

    return tuple(cells)

def _contingency(n_iii, n_iix, n_ixi, n_ixx, n_xii, n_xix, n_xxi, n_xxx):
    """Calculates values of a trigram contingency table (or cube) from
    marginal values.
    >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
    (1, 0, 0, 0, 0, 72, 0, 1927)
    """
    n_oii = n_xii - n_iii
    n_ioi = n_ixi - n_iii
    n_iio = n_iix - n_iii
    n_ooi = n_xxi - n_iii - n_oii - n_ioi
    n_oio = n_xix - n_iii - n_oii - n_iio
    n_ioo = n_ixx - n_iii - n_ioi - n_iio
    n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo

    return (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo)

def count_trigram_entry_without_rearrangement(trigram, trigrams):
    """Classify each entry of ``trigrams`` by position-wise equality with ``trigram``.

    Each of the three positions of an entry is compared with the same
    position of ``trigram``. The counts are returned as
    ``(O_111, O_121, O_211, O_221, O_112, O_122, O_212, O_222)``; in
    ``O_abc``, ``a`` refers to position 1, ``b`` to position 2 and ``c`` to
    position 0, with ``1`` meaning "equal" and ``2`` meaning "different".
    """
    # Slot layout matches the returned tuple. A mismatch at position 0 adds 4,
    # at position 1 adds 2 and at position 2 adds 1.
    cells = [0] * 8

    for candidate in trigrams:
        slot = (
            (0 if candidate[0] == trigram[0] else 4)
            + (0 if candidate[1] == trigram[1] else 2)
            + (0 if candidate[2] == trigram[2] else 1)
        )
        cells[slot] += 1

    return tuple(cells)

In [None]:
# Write test on it

# Note the comma between 'book' and 'magazines': without it Python joins the
# two adjacent string literals into the single token 'bookmagazines'.
tokens = ('summari', 'book', 'address', 'summer', 'book', 'magazines', 'address')
trigram_list = [('summari', 'book', 'address'),
                ('book', 'address', 'summer'),
                ('summer', 'book', 'address'),
                ('summer', 'book', 'magazines'),
                ('book', 'magazines', 'address')]
trigram = ('summer', 'book', 'address')
token = 'address'

print(count_trigram_entry(trigram, trigram_list))
print(count_trigram_entry_without_rearrangement(trigram, trigram_list))
print(count_token_freq(tokens)[token])

### t-score

In [124]:
import math

# Write test on it
def calculate_t_score_for_bigrams(bigram, bigrams, stems):
    """Compute the t-score of ``bigram``.

    The score is ``(O - E) / sqrt(O)``, where ``O`` is the number of times
    ``bigram`` occurs in ``bigrams`` and ``E = f(w1) * f(w2) / N`` is built
    from the word frequencies in ``stems`` and ``N = len(stems)``.

    Args:
        bigram: The bigram to score (indexable, two words).
        bigrams: Collection of all bigram occurrences.
        stems: Collection of all tokens. This argument is now actually used;
            the function previously read the notebook-global ``all_stems``
            and ignored it.

    Returns:
        The t-score, or ``0`` when ``bigram`` does not occur in ``bigrams``
        or ``stems`` is empty (the cases that would divide by zero).
    """
    total_tokens = len(stems)
    count_bigram = count_token_freq(bigrams)[bigram]

    # Explicit guard for the divide-by-zero cases instead of a bare ``except``
    # that also hid every unrelated error.
    if total_tokens == 0 or count_bigram == 0:
        return 0

    token_freq = count_token_freq(stems)
    expected = token_freq[bigram[0]] * token_freq[bigram[1]] / total_tokens

    return (count_bigram - expected) / math.sqrt(count_bigram)

In [None]:
tokens = ('foo', 'bar',
          'foo', 'baz',
          'qux', 'quux', 'foo',
          'corge', 'bar', 'grault')
# A list rather than a set: a set literal would silently drop repeated
# bigrams, and the score is based on occurrence counts.
bigram_list = [('foo', 'bar'),
               ('foo', 'baz'),
               ('qux', 'quux'),
               ('quux', 'foo'),
               ('corge', 'bar'),
               ('bar', 'grault')]
bigram = ('foo', 'bar')

calculate_t_score_for_bigrams(bigram, bigram_list, tokens)

In [126]:
# Write test on it
def calculate_t_score_for_trigrams(trigram, trigram_list, tokens):
    """Compute the t-score of ``trigram``.

    The score is ``(O - E) / sqrt(O)``, where ``O`` is the number of times
    ``trigram`` occurs in ``trigram_list`` and
    ``E = f(w1) * f(w2) * f(w3) / N**2`` is built from the word frequencies
    in ``tokens`` and ``N = len(tokens)``. The denominator is the square
    root of the observed count, as for bigrams; the cube root used before
    does not correspond to the t statistic.

    Args:
        trigram: The trigram to score (indexable, three words).
        trigram_list: Collection of all trigram occurrences.
        tokens: Collection of all tokens.

    Returns:
        The t-score, or ``None`` when ``trigram`` does not occur in
        ``trigram_list`` or ``tokens`` is empty.
    """
    total_tokens = len(tokens)
    count_trigram = count_token_freq(trigram_list)[trigram]

    if count_trigram == 0 or total_tokens == 0:
        return None

    token_freq = count_token_freq(tokens)
    count_element_0 = token_freq[trigram[0]]
    count_element_1 = token_freq[trigram[1]]
    count_element_2 = token_freq[trigram[2]]

    expected = count_element_0 * count_element_1 * count_element_2 / total_tokens**2

    return (count_trigram - expected) / math.sqrt(count_trigram)


In [None]:
# Note the comma between 'book' and 'magazines': without it Python joins the
# two adjacent string literals into the single token 'bookmagazines'.
tokens = ('summari', 'book', 'address', 'summer', 'book', 'magazines', 'address')
trigram_list = [('summari', 'book', 'address'),
                ('book', 'address', 'summer'),
                ('address', 'summer', 'book'),
                ('summer', 'book', 'magazines'),
                ('book', 'magazines', 'address')]
trigram = ('address', 'summer', 'book')

calculate_t_score_for_trigrams(trigram, trigram_list, tokens)

### log-likelihood

In [128]:
# Tiny offset added to the denominator and to the logarithm's argument.
_SMALL = 1e-20

def calculate_for_OE(O, E):
    """Return one ``O * log(O / E)`` term of the log-likelihood sum.

    Args:
        O: Observed count of a contingency cell.
        E: Expected count of the same cell.

    Returns:
        ``0`` when either ``O`` or ``E`` is zero, otherwise
        ``O * log(O / (E + _SMALL) + _SMALL)``.
    """
    if E == 0 or O == 0:
        return 0
    # No debug print here: this runs several times per scored n-gram and
    # would flood the notebook output.
    return O * math.log(O / (E + _SMALL) + _SMALL)

def calculate_log_likelihood_for_bigrams(bigram, bigrams, N):
    """Log-likelihood score of ``bigram`` from its 2x2 contingency table.

    The observed cells come from ``count_bigram_entry``. Each expected cell
    is ``row total * column total / N``, and the score is twice the sum of
    the ``calculate_for_OE`` terms over the four cells.
    """
    observed = count_bigram_entry(bigram, bigrams)
    O_11, O_12, O_21, O_22 = observed

    row_totals = (O_11 + O_12, O_21 + O_22)
    column_totals = (O_11 + O_21, O_12 + O_22)

    # Row-major order, i.e. E_11, E_12, E_21, E_22 - the same order as `observed`.
    expected = [row * column / N for row in row_totals for column in column_totals]

    term_11, term_12, term_21, term_22 = (
        calculate_for_OE(o_cell, e_cell) for o_cell, e_cell in zip(observed, expected)
    )

    return 2 * (term_11 + term_12 + term_21 + term_22)

In [None]:
# A list rather than a set: a set literal would silently drop repeated
# bigrams, and the contingency counts need every occurrence.
bigram_list = [('foo', 'bar'),
               ('foo', 'baz'),
               ('qux', 'quux'),
               ('quux', 'foo'),
               ('corge', 'bar'),
               ('bar', 'grault')]
bigram = ('foo', 'bar')
N = 10

calculate_log_likelihood_for_bigrams(bigram, bigram_list, N)

In [130]:
def calculate_log_likelihood_for_trigrams(trigram, trigrams, N):
    """Log-likelihood score of ``trigram`` from its 2x2x2 contingency table.

    The observed cells come from
    ``count_trigram_entry_without_rearrangement``. Under independence of
    the three positions the expected count of a cell is the product of its
    three marginal totals divided by ``N**2`` (each marginal contributes a
    probability ``total / N`` and the product is scaled back by ``N``).
    The expectation was previously divided by ``N`` only, which inflated
    every expected count by a factor of ``N``.

    Args:
        trigram: The trigram to score.
        trigrams: Collection of all trigram occurrences.
        N: Sample size used to normalise the expected counts.

    Returns:
        Twice the sum of the ``calculate_for_OE`` terms over the eight cells.

    Raises:
        ZeroDivisionError: If ``N`` is zero.
    """
    O_111, O_121, O_211, O_221, O_112, O_122, O_212, O_222 = count_trigram_entry_without_rearrangement(trigram, trigrams)

    # Marginal totals: R over the first index, C over the second, B over the third.
    R_1 = O_111 + O_121 + O_112 + O_122
    R_2 = O_211 + O_221 + O_212 + O_222
    C_1 = O_111 + O_211 + O_112 + O_212
    C_2 = O_121 + O_221 + O_122 + O_222
    B_1 = O_111 + O_121 + O_211 + O_221
    B_2 = O_112 + O_122 + O_212 + O_222

    scale = N ** 2

    E_111 = R_1 * C_1 * B_1 / scale
    E_121 = R_1 * C_2 * B_1 / scale
    E_211 = R_2 * C_1 * B_1 / scale
    E_221 = R_2 * C_2 * B_1 / scale

    E_112 = R_1 * C_1 * B_2 / scale
    E_122 = R_1 * C_2 * B_2 / scale
    E_212 = R_2 * C_1 * B_2 / scale
    E_222 = R_2 * C_2 * B_2 / scale

    log_likelihood = 2 * (calculate_for_OE(O_111, E_111) + calculate_for_OE(O_121, E_121) + calculate_for_OE(O_211, E_211) + calculate_for_OE(O_221, E_221) +
                          calculate_for_OE(O_112, E_112) + calculate_for_OE(O_122, E_122) + calculate_for_OE(O_212, E_212) + calculate_for_OE(O_222, E_222))

    return log_likelihood

In [None]:
# Two literal fixes: a comma between 'book' and 'magazines' (adjacent string
# literals were being joined into 'bookmagazines'), and 'summer' instead of
# 'summmer', which matches the ('magazines', 'address', 'summer') trigram below.
tokens = ('summari', 'book', 'address', 'summer', 'book', 'magazines', 'address', 'summer',
          'hot', 'water', 'summer',
          'address', 'line', 'water', 'sun', 'book')
trigram_list = [('summari', 'book', 'address'),
                ('book', 'address', 'summer'),
                ('summer', 'book', 'address'),
                ('summer', 'book', 'magazines'),
                ('book', 'magazines', 'address'),
                ('magazines', 'address', 'summer'),
                ('hot', 'water', 'summer'),
                ('address', 'line', 'water'),
                ('line', 'water', 'sun'),
                ('water', 'sun', 'book')]
trigram = ('book', 'address', 'summer')
N = len(tokens)

calculate_log_likelihood_for_trigrams(trigram, trigram_list, len(trigram_list))

In [None]:
from nltk.collocations import *

# Reference computation with NLTK on the demo `tokens` defined in the cell
# above, for comparison with the hand-written log-likelihood.
trigram_measures = nltk.collocations.TrigramAssocMeasures()

text = nltk.Text(tokens)

# TrigramCollocationFinder is brought into scope by the star import above.
finder_thr = TrigramCollocationFinder.from_words(text)

# First the score of one specific trigram, then the scores of all trigrams.
print(finder_thr.score_ngram(trigram_measures.likelihood_ratio, 'book', 'address', 'summer'))
print(finder_thr.score_ngrams(trigram_measures.likelihood_ratio))

### Bigrams

In [78]:
# Maps each distinct bigram to its two association scores.
bigrams_map = {}

for bigram in bigrams:
    # `bigrams` has one entry per occurrence, but for fixed `bigrams` and
    # `all_stems` both scores depend only on the bigram itself, so scoring a
    # repeated bigram again would only recompute identical values.
    if bigram in bigrams_map:
        continue
    t_score = calculate_t_score_for_bigrams(bigram, bigrams, all_stems)
    log_likelihood = calculate_log_likelihood_for_bigrams(bigram, bigrams, len(all_stems))
    bigrams_map[bigram] = {'t_score': t_score, 'log_likelihood': log_likelihood}

In [None]:
# Rank the scored bigrams by t-score, highest first, and print every entry.
sorted_bigrams = sorted(bigrams_map.items(), key=lambda item: item[1]['t_score'], reverse=True)

for bigram, metrics in sorted_bigrams:
    print(f"{bigram}: t_score={metrics['t_score']}, log_likelihood={metrics['log_likelihood']}")

In [None]:
# Rank the scored bigrams by log-likelihood, highest first, and print every
# entry. Rebinds `sorted_bigrams` from the t-score ranking.
sorted_bigrams = sorted(bigrams_map.items(), key=lambda item: item[1]['log_likelihood'], reverse=True)

for bigram, metrics in sorted_bigrams:
    print(f"{bigram}: t_score={metrics['t_score']}, log_likelihood={metrics['log_likelihood']}")

In [None]:
from nltk.collocations import *

# Reference ranking computed by NLTK, for comparison with the hand-written scores.
bigram_measures = nltk.collocations.BigramAssocMeasures()

# NOTE: the finder is built from the flattened `all_stems`, so it also sees
# pairs that straddle two documents, unlike generate_bigrams, which works
# per document.
text = nltk.Text(all_stems)

finder_bi = BigramCollocationFinder.from_words(text)

print('Results with nltk realization\n')
print(f'Bigrams by student_t:\n{finder_bi.nbest(bigram_measures.student_t, 30)}\n')
print(f'Bigrams by log-likelihood:\n{finder_bi.nbest(bigram_measures.likelihood_ratio, 30)}\n')

### Trigrams

In [37]:
# Maps each distinct trigram to its two association scores.
trigrams_map = {}

for trigram in trigrams:
    # `trigrams` has one entry per occurrence, but for fixed `trigrams` and
    # `all_stems` both scores depend only on the trigram itself, so scoring a
    # repeated trigram again would only recompute identical values.
    if trigram in trigrams_map:
        continue
    t_score = calculate_t_score_for_trigrams(trigram, trigrams, all_stems)
    log_likelihood = calculate_log_likelihood_for_trigrams(trigram, trigrams, len(all_stems))
    trigrams_map[trigram] = {'t_score': t_score, 'log_likelihood': log_likelihood}

In [None]:
# Rank the scored trigrams by t-score, highest first, and print every entry.
# NOTE(review): calculate_t_score_for_trigrams can return None; a None among
# the scores would make this sort raise TypeError - confirm it cannot occur here.
sorted_trigrams = sorted(trigrams_map.items(), key=lambda item: item[1]['t_score'], reverse=True)

for trigram, metrics in sorted_trigrams:
    print(f"{trigram}: t_score={metrics['t_score']}, log_likelihood={metrics['log_likelihood']}")

In [None]:
# Rank the scored trigrams by log-likelihood, highest first, and print every
# entry. Rebinds `sorted_trigrams` from the t-score ranking.
sorted_trigrams = sorted(trigrams_map.items(), key=lambda item: item[1]['log_likelihood'], reverse=True)

for trigram, metrics in sorted_trigrams:
    print(f"{trigram}: t_score={metrics['t_score']}, log_likelihood={metrics['log_likelihood']}")

In [None]:
from nltk.collocations import *

# Reference ranking computed by NLTK, for comparison with the hand-written scores.
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# NOTE: the finder is built from the flattened `all_stems`, so it also sees
# triples that straddle two documents, unlike generate_trigrams, which works
# per document.
text = nltk.Text(all_stems)

finder_thr = TrigramCollocationFinder.from_words(text)

print('Results with nltk realization\n')
print(f'Trigrams by student_t:\n{finder_thr.nbest(trigram_measures.student_t, 30)}\n')
print(f'Trigrams by log-likelihood:\n{finder_thr.nbest(trigram_measures.likelihood_ratio, 30)}')