In [8]:
import spacy
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load the French spaCy pipeline ("fr_core_news_sm") once at module level;
# context_count() relies on this global for tokenisation, lemmas and the
# is_alpha / is_stop token flags.
nlp = spacy.load("fr_core_news_sm")

In [9]:
def context_count(window_size: int, file_path: str) -> pd.DataFrame:
    """Build a word/context co-occurrence table from a UTF-8 text file.

    Every line of the file is run through the module-level ``nlp`` pipeline.
    Only alphabetic, non-stop-word tokens are kept, each represented by its
    lower-cased lemma. For each kept token, the kept tokens located at most
    ``window_size // 2`` positions before or after it on the same line are
    counted as its context words.

    Args:
        window_size: Total width of the context window; half of it (integer
            division) is taken on each side of the target word.
        file_path: Path of the text file to read.

    Returns:
        A DataFrame with one column per target word and one row per context
        word, holding co-occurrence counts. Pairs never observed are 0.
    """
    cooccurrences = {}
    half_window = window_size // 2

    with open(file_path, "r", encoding="utf-8") as corpus:
        for raw_line in corpus:
            lemmas = [
                tok.lemma_.lower()
                for tok in nlp(raw_line)
                if tok.is_alpha and not tok.is_stop
            ]
            last = len(lemmas)
            for position, target in enumerate(lemmas):
                # Register the target even if it ends up with no neighbours.
                neighbour_counts = cooccurrences.setdefault(target, {})

                left = lemmas[max(0, position - half_window):position]
                right = lemmas[position + 1:min(last, position + half_window + 1)]
                for neighbour in left + right:
                    neighbour_counts[neighbour] = neighbour_counts.get(neighbour, 0) + 1

    return pd.DataFrame(cooccurrences).fillna(0)

In [10]:
def generate_candidate_words(df: pd.DataFrame, *args: str) -> list[list]:
    """Pair each requested word with its column of ``df`` as a plain list.

    Args:
        df: Table whose columns are indexed by word.
        *args: Words to look up; each must be a column label of ``df``.

    Returns:
        One ``[word, vector]`` entry per requested word, in the order given,
        where ``vector`` is the list of values of column ``word``.

    Raises:
        KeyError: If a requested word is not a column of ``df``.
    """
    return [[label, list(df[label])] for label in args]

In [11]:
def df_cosine_sims(candidate_words: list) -> pd.DataFrame:
    """Compute the pairwise cosine-similarity table of the candidate words.

    All vectors are stacked and passed to ``cosine_similarity`` in a single
    call, instead of one call per pair of words.

    Args:
        candidate_words: Sequence of entries whose item 0 is the word and
            item 1 is its vector. All vectors must have the same length.

    Returns:
        A square DataFrame indexed by word on both axes, in order of first
        appearance, holding the cosine similarity of every pair of words.
        An empty DataFrame is returned when ``candidate_words`` is empty.
    """
    # Key the vectors by word: a repeated word keeps its first position but
    # its last vector, which is what per-pair dict assignment would yield.
    vectors = {}
    for entry in candidate_words:
        vectors[entry[0]] = entry[1]

    if not vectors:
        # cosine_similarity rejects empty input; return an empty table instead.
        return pd.DataFrame({})

    words = list(vectors)
    # With a single argument, cosine_similarity compares every row of the
    # matrix with every other row, giving the full n x n table at once.
    sims = cosine_similarity(list(vectors.values()))
    return pd.DataFrame(sims, index=words, columns=words)


In [12]:
# Co-occurrence counts with window_size=4, i.e. up to 2 context tokens on each side of a word.
df_four = context_count(4, "./DEMOCRATIE_ET_CITOYENNETE.txt")
# Pull out the context-count vectors of the words to compare.
candidate_words_four = generate_candidate_words(df_four, "fiscal", "financier", "budget", "jardin", "démissionner")
# Pairwise cosine similarities; kept as the last expression so the notebook displays the table.
df_cosine_sims(candidate_words_four)

Unnamed: 0,fiscal,financier,budget,jardin,démissionner
fiscal,1.0,0.260011,0.151894,0.006588,0.03294
financier,0.260011,1.0,0.124309,0.0,0.082223
budget,0.151894,0.124309,1.0,0.062994,0.062994
jardin,0.006588,0.0,0.062994,1.0,0.133333
démissionner,0.03294,0.082223,0.062994,0.133333,1.0


In [13]:
# Same analysis with window_size=6, i.e. up to 3 context tokens on each side of a word.
df_six = context_count(6, "./DEMOCRATIE_ET_CITOYENNETE.txt")
# Pull out the context-count vectors of the same words for comparison with the narrower window.
candidate_words_six = generate_candidate_words(df_six, "fiscal", "financier", "budget", "jardin", "démissionner")
# Pairwise cosine similarities; kept as the last expression so the notebook displays the table.
df_cosine_sims(candidate_words_six)

Unnamed: 0,fiscal,financier,budget,jardin,démissionner
fiscal,1.0,0.347816,0.186287,0.008133,0.104638
financier,0.347816,1.0,0.220175,0.021574,0.093953
budget,0.186287,0.220175,1.0,0.054317,0.092161
jardin,0.008133,0.021574,0.054317,1.0,0.082479
démissionner,0.104638,0.093953,0.092161,0.082479,1.0
