# Lab 4

## Task 1-2

In [169]:
import numpy as np
import pandas as pd
import math
import nltk
from nltk.book import *
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
import warnings
warnings.filterwarnings('ignore')

In [170]:
from nltk.book import text1, text2, text3, text4

In [171]:
texts_raw = [" ".join(text1.tokens)[:1000], " ".join(text2.tokens)[:1000],  " ".join(text3.tokens)[:1000]]

In [172]:
def remove_punctuation(words):
    return [word for word in words if word not in string.punctuation]

def preprocess_text(text):
    sentences = sent_tokenize(text)
    corpus = [word_tokenize(sentence) for sentence in sentences]
    corpus = [remove_punctuation(sentence) for sentence in corpus]
    return corpus

corpus = [preprocess_text(text) for text in texts_raw]
corpus = [item for sublist in corpus for item in sublist] 

In [173]:
from collections import Counter, defaultdict
import numpy as np
from math import log2

def compute_co_occurrences(corpus, window_size=None):
    co_occurrences = defaultdict(Counter)
    total_co_occurrences = 0
    
    for paragraph in corpus:
        for i, word in enumerate(paragraph):
            if window_size is None:
                context_words = paragraph[:i] + paragraph[i+1:]
            else:
                start = max(0, i - window_size // 2)
                end = min(len(paragraph), i + window_size // 2 + 1)
                context_words = paragraph[start:i] + paragraph[i+1:end]
                
            for context_word in context_words:
                co_occurrences[word][context_word] += 1
                total_co_occurrences += 1
                
    return co_occurrences, total_co_occurrences

def compute_ppmi(co_occurrences, total_co_occurrences):
    word_frequencies = Counter({word: sum(contexts.values()) for word, contexts in co_occurrences.items()})
    total_words = sum(word_frequencies.values())
    
    ppmi_matrix = defaultdict(dict)
    for word, contexts in co_occurrences.items():
        for context_word, count in contexts.items():
            p_wc = count / total_co_occurrences
            p_w = word_frequencies[word] / total_co_occurrences
            p_c = word_frequencies[context_word] / total_co_occurrences
            ppmi = max(log2(p_wc / (p_w * p_c)), 0)
            ppmi_matrix[word][context_word] = ppmi
            
    return ppmi_matrix

In [174]:
co_occurrences_para, total_co_occurrences_para = compute_co_occurrences(corpus)
ppmi_matrix_para = compute_ppmi(co_occurrences_para, total_co_occurrences_para)
ppmi_matrix_para_df = pd.DataFrame.from_dict(ppmi_matrix_para,orient='index').fillna(0)

print(ppmi_matrix_para_df)

              Dick        by    Herman  Melville      1851  ETYMOLOGY  \
Moby      8.411511  4.411511  8.411511  8.411511  8.411511   8.411511   
by        4.411511  0.000000  4.411511  4.411511  4.411511   4.411511   
Herman    8.411511  4.411511  0.000000  8.411511  8.411511   8.411511   
Melville  8.411511  4.411511  8.411511  0.000000  8.411511   8.411511   
1851      8.411511  4.411511  8.411511  8.411511  0.000000   8.411511   
...            ...       ...       ...       ...       ...        ...   
moved     0.000000  0.000000  0.000000  0.000000  0.000000   0.000000   
...       0.000000  0.000000  0.000000  0.000000  0.000000   0.000000   
Sw        0.000000  0.000000  0.000000  0.000000  0.000000   0.000000   
WALLEN    0.000000  0.000000  0.000000  0.000000  0.000000   0.000000   
A         0.000000  0.000000  0.000000  0.000000  0.000000   0.000000   

              Moby  Supplied         a      Late  ...    Spirit  moved  \
Moby      0.000000  0.000000  0.000000  0.000000 

In [176]:
co_occurrences_window, total_co_occurrences_window = compute_co_occurrences(corpus, window_size= 5)
ppmi_matrix_window = compute_ppmi(co_occurrences_window, total_co_occurrences_window)
ppmi_matrix_window_df = pd.DataFrame.from_dict(ppmi_matrix_window, orient='index').fillna(0)

print(ppmi_matrix_window_df)

              Dick        by      Moby    Herman  Melville  Supplied  \
Moby      8.330917  6.008989  0.000000  0.000000  0.000000  0.000000   
by        5.424026  0.000000  6.008989  5.008989  5.008989  6.008989   
Herman    7.330917  5.008989  0.000000  0.000000  6.915879  0.000000   
Dick      0.000000  5.424026  8.330917  7.330917  0.000000  0.000000   
Melville  0.000000  5.008989  0.000000  6.915879  0.000000  0.000000   
...            ...       ...       ...       ...       ...       ...   
spent     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
...       0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
Sw        0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
WALLEN    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
A         0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

                 a      Late     teach      them  ...  1811  long  they  \
Moby      0.000000  0.000000  0.000000  0.000000  ...   0.0 

## Task 3

Щоб усунути проблему високої контрастності значень у матриці співвживань слів, одним із підходів є застосування сигмоїдальної функції до кожного значення матриці. Сигмоїда перетворює значення таким чином, що різниця між великими та малими значеннями стає менш вираженою. Такий метод може допомогти зменшити контраст між значеннями, зробивши розподіл більш гладким і зменшуючи вплив рідкісних слів, які мають високі значення PMI.

In [177]:
import numpy as np
import pandas as pd
def apply_sigmoid(matrix):
    sigmoid_matrix = 1 / (1 + np.exp(-matrix))
    return sigmoid_matrix

sigmoid_ppmi_matrix = apply_sigmoid(ppmi_matrix_window_df)
sigmoid_ppmi_matrix_df = pd.DataFrame(sigmoid_ppmi_matrix)
sigmoid_ppmi_matrix_df


Unnamed: 0,Dick,by,Moby,Herman,Melville,Supplied,a,Late,teach,them,...,1811,long,they,acquaintance,But,loss,days,comfortably,spent,unto
Moby,0.999759,0.997549,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.500000,0.5,0.5
by,0.995610,0.500000,0.997549,0.993367,0.993367,0.997549,0.843837,0.993367,0.993367,0.982172,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.500000,0.5,0.5
Herman,0.999345,0.993367,0.500000,0.500000,0.999009,0.500000,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.500000,0.5,0.5
Dick,0.500000,0.995610,0.999759,0.999345,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.500000,0.5,0.5
Melville,0.500000,0.993367,0.500000,0.999009,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.500000,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
spent,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.999759,0.5,0.5
...,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.500000,0.5,0.5
Sw,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.500000,0.5,0.5
WALLEN,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.500000,0.5,0.5


## Task 4

In [178]:
words_synonyms = {
    'skeleton': ['structure', 'framework', 'shell'],
    'good': ['reasonable', 'logical', 'well-founded'],
    'beautiful': ['lovely', 'elegant', 'pretty'],
    'mood': ['spirit', 'mode', 'cheer'],
    'breaking': ['disrupting', 'fracturing', 'fragmenting'],
    'record': ['chronology', 'version', 'tale'],
    'amazing': ['awesome', 'incredible', 'marvellous'],
    'unhappy': ['unhappy', 'worried', 'gloomy'],
    'try': ['push', 'tax', 'annoy'],
    'said': ['aforementioned', 'such', 'above'],
}

corpus = [[word] + synonyms for word, synonyms in words_synonyms.items()]

co_occurrences, total_co_occurrences = compute_co_occurrences(corpus, window_size=5)
ppmi_matrix = compute_ppmi(co_occurrences, total_co_occurrences)

ppmi_matrix_df = pd.DataFrame.from_dict(ppmi_matrix, orient='index').fillna(0)

print(ppmi_matrix_df)

                structure  framework  skeleton     shell  reasonable  \
skeleton         4.058894   4.058894  0.000000  0.000000    0.000000   
framework        3.473931   0.000000  4.058894  4.058894    0.000000   
shell            4.058894   4.058894  0.000000  0.000000    0.000000   
structure        0.000000   3.473931  4.058894  4.058894    0.000000   
good             0.000000   0.000000  0.000000  0.000000    4.058894   
logical          0.000000   0.000000  0.000000  0.000000    3.473931   
well-founded     0.000000   0.000000  0.000000  0.000000    4.058894   
reasonable       0.000000   0.000000  0.000000  0.000000    0.000000   
beautiful        0.000000   0.000000  0.000000  0.000000    0.000000   
elegant          0.000000   0.000000  0.000000  0.000000    0.000000   
pretty           0.000000   0.000000  0.000000  0.000000    0.000000   
lovely           0.000000   0.000000  0.000000  0.000000    0.000000   
mood             0.000000   0.000000  0.000000  0.000000    0.00

PPMI може певною мірою відображати семантичну подібність, особливо коли слова часто використовуються разом у подібних контекстах, що може свідчити про їх синонімічність або тісний семантичний зв'язок. Однак, PPMI не завжди може точно відтворити всі нюанси семантичних відносин, оскільки вона не враховує контекстуальні відмінності та інші фактори, такі як стилістичні варіації.