# Analyse du mot 'personne' dans les avis du CCNE   

Ce notebook contient :

- le tirage d'une base totalement aléatoire de phrases, filtrée sur les mots d'intérêt, pour l'exploration et la rédaction de la guideline d'annotation ;
- l'export de cette base ;
- l'embedding des mots choisis ;
- le tirage d'un set représentatif de phrases.

In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
# The autoreload extension is already loaded. To reload it, use:
#%reload_ext autoreload
import sys
sys.path.append('./helpers')  # Add the ./helpers directory to the Python path

# Seed passed as random_state to the sampling step so the draw is reproducible
seed = 1968

# Project helpers (presumably the modules living in ./helpers — confirm)
from database_creation import open_avis, join_metadata, corpus_to_sentences_with_context, filter_sentences_with_words, has_words
from display import interactive_sentence_display, generate_sentences_pdf

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leopo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leopo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Base mise au niveau de la phrase, et exemples pour guideline d'annotation

In [2]:
# Steps 1-2: open the avis (rescrap_texte=False) and join their metadata
base_avis_ccne = join_metadata(open_avis(rescrap_texte=False))

# Step 3: transform the corpus to sentences with context
base_sentences = corpus_to_sentences_with_context(base=base_avis_ccne)

# Step 4: keep the sentences containing one of the 'personne' / 'humain' word forms
base_filtered = filter_sentences_with_words(
    base_sentences,
    ["personne", "personnes", "humain", "humains", "humaine", "humaines"],
)

# Step 5: draw 100 sentences, seeded so the draw can be repeated
# NOTE: the original draw was made with the same word list, but with a has_words
# function that only checked whether the string "personne" was present, so
# "personnel" also came out positive.
samples = base_filtered.sample(n=100, random_state=seed)


              corpus_to_sentences_with_context :
              is  the number of sentences counted corresponding to the number of rows by 'num' ?
              True


In [3]:
# Step 6: Browse the sampled sentences with the project's interactive display helper
# (the cell output shows a metadata toggle and Previous/Next buttons)
interactive_sentence_display(samples)



ToggleButtons(options=('Show Metadata', 'Hide Metadata'), value='Show Metadata')

Button(description='Previous', style=ButtonStyle())

Button(description='Next', style=ButtonStyle())

Output()

In [4]:
# Hand the same sample to the PDF helper; the target path is relative to the
# notebook's working directory (presumably the helper writes the file there — confirm)
generate_sentences_pdf(samples, "../output/exploration/phrase_personne_humain.pdf")

# Présence lexicale et export

## Présence lexicale

In [3]:
# Word forms tracked for each concept; each key is also used as the name of the
# column added to base_sentences below.
synonyms = {
    'person': ['personne', 'personnes'],
    'human': ['humain', 'humaine', 'humains', 'humaines'],
    'individu': ['individu', 'individus'],
    'societe': [
        'société', 'sociétés', 'societé', 'societés',
        'sociéte', 'sociétes', 'societe', 'societes',
    ],
    'nature': ['nature', 'natures'],
    'environment': ['environnement', 'environnements'],
}

print(f"Nombre de phrases total dans les avis : {len(base_sentences)}")

for key, words in synonyms.items():
    # Series.apply forwards the extra keyword argument to has_words on every sentence
    base_sentences[key] = base_sentences['sentence'].apply(has_words, words=words)
    n = sum(base_sentences[key])
    print(f"Le nombre de phrases qui contient {words} est {n}")

Nombre de phrases total dans les avis : 53591
Le nombre de phrases qui contient ['personne', 'personnes'] est 5077
Le nombre de phrases qui contient ['humain', 'humaine', 'humains', 'humaines'] est 2484
Le nombre de phrases qui contient ['individu', 'individus'] est 658
Le nombre de phrases qui contient ['société', 'sociétés', 'societé', 'societés', 'sociéte', 'sociétes', 'societe', 'societes'] est 1409
Le nombre de phrases qui contient ['nature', 'natures'] est 651
Le nombre de phrases qui contient ['environnement', 'environnements'] est 350


## Export

In [5]:
# Write the sentence table to a Feather file (path relative to the working directory).
# When the lexical-presence cell has been run first, the per-concept columns it
# adds to base_sentences are written as well.
base_sentences.to_feather("../data/intermediate/big/base_sentences.feather")

# Embedding

In [None]:
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

from transformers import CamembertModel, CamembertTokenizer

# Embed a handful of isolated words with CamemBERT, compare them pairwise with
# cosine similarity, and project them on two principal components for plotting.

# Load pre-trained CamemBERT model and tokenizer
model = CamembertModel.from_pretrained('camembert-base')
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
model.eval()  # inference only; made explicit so dropout is off

# Define the list of words
word_list = ["personne", "personnes", "humain", "humains", "individu", "individus"]

# Get one embedding per word
embeddings = []
with torch.no_grad():
    for word in word_list:
        # Calling the tokenizer (rather than tokenize + convert_tokens_to_ids)
        # wraps the word in the <s> ... </s> special tokens the model expects.
        encoded = tokenizer(word, return_tensors="pt")
        hidden_states = model(**encoded).last_hidden_state  # Last-layer hidden states
        # Average the sub-word vectors only: positions 0 and -1 hold <s> and </s>
        word_embedding = hidden_states[:, 1:-1, :].mean(dim=1).numpy()
        embeddings.append(word_embedding)

# One row per word, in word_list order
embedding_matrix = np.concatenate(embeddings, axis=0)

# Pairwise cosine similarity, computed in a single call. The values are
# similarities (1.0 on the diagonal), not distances.
similarity_matrix = cosine_similarity(embedding_matrix)
# Former (misleading) name, kept as an alias for code that still refers to it
distance_matrix = similarity_matrix

# Perform PCA for visualization
pca = PCA(n_components=2)
pca_result = pca.fit_transform(embedding_matrix)

# Create DataFrame for Seaborn plotting
df = pd.DataFrame(pca_result, columns=['PC1', 'PC2'])
df['Word'] = word_list

# Plot PCA result using Seaborn, labelling every point with its word
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Word', s=100)
for point in df.itertuples(index=False):
    plt.text(point.PC1, point.PC2, point.Word, fontsize=12)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('2D PCA Representation of CamemBERT Embeddings')
plt.legend(loc='upper left')
plt.show()

# Print the similarity matrix under its real name
print("Cosine Similarity Matrix:")
print(pd.DataFrame(similarity_matrix, index=word_list, columns=word_list))
