In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import numpy as np
from nlpaug.augmenter.word import ContextualWordEmbsAug
import joblib
from sentence_transformers import SentenceTransformer




In [3]:
# Load the annotated sentences from the CSV that sits next to the notebook.
annotated_csv = "phrases_annotees.csv"
df = pd.read_csv(annotated_csv)

In [4]:
# Keep only the rows whose label is not -1, and expose the text column
# under the name 'sentence' used by the rest of the notebook.
keep_mask = df['label'] != -1
df = df.loc[keep_mask].rename(columns={'phrase': 'sentence'})


In [5]:
# Show the filtered frame; long frames are truncated in the rendered output.
df

Unnamed: 0,qID,sentence,option1,option2,answer,label
0,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-1,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,1,0
1,3NRZ1LDP7W4677DB493437C7D3WZP2-1,I picked up a bag of peanuts and raisins for a...,raisins,peanuts,1,0
7,3MZ3TAMYTLL5OU4GHXJ7879WIBGRIT-1,Ben had to either stop eating chocolates or nu...,chocolates,nuts,1,1
9,3PIOQ99R7YKPIN7OI7R4IQSX6RPUNM-1,"William wanted an egg for breakfast, but Kevin...",William,Kevin,1,0
10,30Y6N4AHYPUOGJOUPJIWHATJQ9LDRW-1,Since _ arrived at work sooner Dennis would ea...,Dennis,Adam,1,0
...,...,...,...,...,...,...
1719,329E6HTMSYGJ5C89Q8FNVN92U0L3KH-1,Jean really liked the profiteroles but not the...,profiteroles,peas,1,0
1721,3YCT0L9OMONOXZERD084VNYHEQ4SN8-1,Katrina had an obsession with donuts and cake ...,Katrina,Sarah,1,1
1723,3SSN80MU8E2VLAFNAXMB23Y5ZM5KXV-1,William loved having fresh eggs for breakfast ...,William,Brett,1,0
1725,3V0TR1NRVCGYBVIREV7HBEYB1IDA44-1,"While making her breakfast, Sam sweetened her ...",honey,oatmeal,1,0


In [6]:
# Split the sentences into the positive (label == 1) and negative (label == 0)
# classes. Label 1 is the class reported as "nourriture" further down.
positive_texts = df.loc[df['label'] == 1, 'sentence'].tolist()
negative_texts = df.loc[df['label'] == 0, 'sentence'].tolist()

# Report the class balance before any augmentation.
print(f"Exemples positifs : {len(positive_texts)}")
print(f"Exemples négatifs : {len(negative_texts)}")

Exemples positifs : 92
Exemples négatifs : 332


In [7]:
# Local import: only needed to probe for a GPU before building the augmenter.
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU so the
# cell does not crash on machines without CUDA.
AUG_DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Number of augmented variants requested for each positive sentence.
N_VARIANTS_PER_SENTENCE = 2

# Augmenter initialisation: contextual word substitution with BERT.
aug = ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",
    device=AUG_DEVICE
)

# Augment the minority (positive) class only.
# NOTE(review): no random seed is set in this cell, so the generated variants
# are presumably different on every run -- confirm and seed if reproducibility
# matters.
augmented_positive = []
for source_sentence in positive_texts:
    augmented_positive.extend(aug.augment(source_sentence, n=N_VARIANTS_PER_SENTENCE))

# Final corpus layout: original positives, then their augmented variants,
# then the negatives. `labels` follows exactly the same order.
texts = positive_texts + augmented_positive + negative_texts
labels = [1] * (len(positive_texts) + len(augmented_positive)) + [0] * len(negative_texts)

print(f"Taille après augmentation : {len(texts)}")

The following layers were not sharded: bert.encoder.layer.*.intermediate.dense.weight, bert.embeddings.LayerNorm.weight, bert.encoder.layer.*.attention.output.LayerNorm.weight, cls.predictions.decoder.weight, bert.encoder.layer.*.intermediate.dense.bias, bert.encoder.layer.*.attention.self.key.bias, bert.encoder.layer.*.attention.output.LayerNorm.bias, bert.embeddings.token_type_embeddings.weight, bert.encoder.layer.*.output.LayerNorm.bias, bert.encoder.layer.*.output.dense.bias, bert.embeddings.LayerNorm.bias, bert.encoder.layer.*.output.LayerNorm.weight, cls.predictions.bias, cls.predictions.transform.dense.bias, bert.encoder.layer.*.output.dense.weight, bert.encoder.layer.*.attention.self.value.bias, cls.predictions.transform.LayerNorm.weight, cls.predictions.transform.LayerNorm.bias, bert.encoder.layer.*.attention.self.query.bias, bert.encoder.layer.*.attention.self.value.weight, bert.encoder.layer.*.attention.self.query.weight, bert.embeddings.position_embeddings.weight, bert.enco

Taille après augmentation : 608


In [8]:
# Load the sentence-embedding model.
st_model = SentenceTransformer('all-mpnet-base-v2')

# Encode every text (originals + augmented variants) as an embedding vector.
X = st_model.encode(texts, show_progress_bar=True)

# Labels as a numpy array, aligned with the rows of X.
y = np.array(labels)

# Seed for the internal shuffling SVC performs when probability=True, so the
# fitted probabilities are reproducible from one run to the next.
RANDOM_STATE = 42

# Classifier: PCA for dimensionality reduction followed by an SVM.
# The PCA size and the SVC kernel given here are only placeholders; the grid
# search below overrides them.
pipeline = make_pipeline(
    PCA(n_components=0.95),
    SVC(
        class_weight='balanced',   # compensates for the class imbalance
        kernel='linear',
        probability=True,          # enables predict_proba on the fitted model
        random_state=RANDOM_STATE
    ))

# Hyper-parameter grid.
# `gamma` only affects the RBF kernel; crossing it with the linear kernel (as a
# single dict would) fits every linear candidate twice for no benefit, so the
# grid is split into one sub-grid per kernel.
pca_components = [0.85, 0.90, 0.95, 128, 256]  # variance ratio (float) or component count (int)
svc_c_values = [0.1, 0.5, 1, 5, 10]            # regularisation strength
param_grid = [
    {
        'pca__n_components': pca_components,
        'svc__C': svc_c_values,
        'svc__kernel': ['linear'],
    },
    {
        'pca__n_components': pca_components,
        'svc__C': svc_c_values,
        'svc__kernel': ['rbf'],
        # RBF kernel coefficient: 'scale' = 1 / (n_features * X.var()),
        # 'auto' = 1 / n_features.
        'svc__gamma': ['scale', 'auto'],
    },
]

# NOTE(review): `texts` contains each positive sentence together with its
# augmented variants, and the folds below are built without any grouping, so a
# variant can land in the validation fold while its source sentence is in the
# training fold. The cross-validated scores are therefore likely optimistic.
# A group-aware splitter (e.g. StratifiedGroupKFold with one group id per
# source sentence) would avoid this, but needs the group ids to be recorded
# where the augmentation is done.
grid = GridSearchCV(
    pipeline,                      # pipeline to optimise
    param_grid,                    # parameter combinations to explore
    cv=StratifiedKFold(5),         # stratified 5-fold cross-validation
    scoring='average_precision',   # average precision (area under the PR curve), not F1
    n_jobs=-1                      # use every CPU core
)

# Fit with hyper-parameter search; the best candidate is refit on all data.
grid.fit(X, y)

The following layers were not sharded: encoder.layer.*.attention.attn.k.weight, encoder.layer.*.intermediate.dense.weight, encoder.layer.*.output.LayerNorm.weight, encoder.layer.*.attention.attn.o.bias, encoder.layer.*.attention.attn.v.bias, encoder.layer.*.attention.LayerNorm.bias, embeddings.LayerNorm.weight, encoder.layer.*.attention.attn.q.bias, encoder.layer.*.attention.LayerNorm.weight, encoder.layer.*.output.dense.weight, encoder.layer.*.output.dense.bias, embeddings.LayerNorm.bias, encoder.layer.*.attention.attn.v.weight, encoder.layer.*.attention.attn.q.weight, encoder.layer.*.intermediate.dense.bias, encoder.layer.*.attention.attn.o.weight, encoder.relative_attention_bias.weight, embeddings.position_embeddings.weight, encoder.layer.*.attention.attn.k.bias, pooler.dense.weight, encoder.layer.*.output.LayerNorm.bias, embeddings.word_embeddings.weight, pooler.dense.bias


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

In [9]:
# Persist the best pipeline found by the grid search (PCA + SVC only; the
# sentence encoder is not part of the pickle and must be loaded separately).
best_pipeline = grid.best_estimator_
joblib.dump(best_pipeline, 'nourriture_classifier.pkl')

['nourriture_classifier.pkl']

In [10]:
# Export the full cross-validation report (one row per candidate) for inspection.
df_p = pd.DataFrame(data=grid.cv_results_)
df_p.to_csv(path_or_buf="df_p_precision.csv")

In [11]:
# 4. Model test
def predict_with_confidence(text, model_path='nourriture_classifier.pkl', threshold=0.75):
    """Classify one sentence with the saved pipeline and report its confidence.

    Parameters
    ----------
    text : str
        Sentence to classify.
    model_path : str, optional
        Path of the joblib file holding the fitted classifier. The file is
        reloaded on every call.
    threshold : float, optional
        The sentence is tagged "nourriture" only when the probability of
        class 1 is strictly greater than this value. Defaults to 0.75, the
        cut-off that was previously hard-coded.

    Returns
    -------
    dict
        'classe' ("nourriture" or "other") and 'certitude_nourriture', the
        class-1 probability formatted as a percentage with two decimals.
    """
    # joblib.load unpickles the file: only point this at trusted model files.
    classifier = joblib.load(model_path)

    # The encoder expects a batch, hence the one-element list.
    embedding = st_model.encode([text])

    # Row 0 is the only sentence; column 1 is the probability of class 1.
    food_proba = classifier.predict_proba(embedding)[0][1]

    return {
        'classe': "nourriture" if food_proba > threshold else "other",
        'certitude_nourriture': f"{food_proba*100:.2f}%",
    }
# Quick sanity check on a handful of sentences.
# NOTE(review): several of these appear to come from the annotated training
# frame shown above, so this checks the fit rather than generalisation.
test_phrases = [
    # The first phrase ends with a tab character; it is written as an explicit
    # escape here so that it is visible.
    "Ian volunteered to eat Dennis's menudo after already having a bowl because _ enjoyed eating intestine.\t",
    "I picked up a bag of peanuts and raisins for a snack. I wanted a sweeter snack out so I ate the _ for now.",
    "Ben had to either stop eating chocolates or nuts. He chose to stop eating the _ because they were sweet.",
    "William wanted an egg for breakfast, but Kevin ate the last one, so _ had to eat cereal.",
    "Since _ arrived at work sooner Dennis would eat their lunch before Adam in the break room at their job.",
    "Jean really liked the profiteroles but not the peas because the _ were very sweet.",
    "Katrina had an obsession with donuts and cake but not Sarah so _ was a heavy weight.",
    "William loved having fresh eggs for breakfast every morning but Brett hated eggs. _ bought a chicken to raise for eggs.",
    "While making her breakfast, Sam sweetened her oatmeal with honey. The _ was sweet and sticky.",
    "Adam placed the fruit into the bag instead of the box, as the fruit fit well into the _ .",
]

# Print each sentence followed by the classifier's verdict.
for sample in test_phrases:
    print(f"Phrase: {sample}")
    outcome = predict_with_confidence(sample)
    print(f"Résultat: {outcome}\n")

Phrase: Ian volunteered to eat Dennis's menudo after already having a bowl because _ enjoyed eating intestine.	
Résultat: {'classe': 'other', 'certitude_nourriture': '1.02%'}

Phrase: I picked up a bag of peanuts and raisins for a snack. I wanted a sweeter snack out so I ate the _ for now.
Résultat: {'classe': 'other', 'certitude_nourriture': '1.02%'}

Phrase: Ben had to either stop eating chocolates or nuts. He chose to stop eating the _ because they were sweet.
Résultat: {'classe': 'nourriture', 'certitude_nourriture': '98.75%'}

Phrase: William wanted an egg for breakfast, but Kevin ate the last one, so _ had to eat cereal.
Résultat: {'classe': 'other', 'certitude_nourriture': '1.02%'}

Phrase: Since _ arrived at work sooner Dennis would eat their lunch before Adam in the break room at their job.
Résultat: {'classe': 'other', 'certitude_nourriture': '1.02%'}

Phrase: Jean really liked the profiteroles but not the peas because the _ were very sweet.
Résultat: {'classe': 'other', 'cer

In [12]:
# NOTE(review): absolute, machine-specific path -- the notebook only runs on
# the original author's machine as is. Point this at the local copy of the
# file (ideally relative to a configurable data directory).
TRAIN_XL_PATH = "C:/Users/valde/Documents/GitHub/Wino-UROP/train_xl.jsonl"

# The file holds one JSON object per line (JSON Lines), hence lines=True.
df_gros = pd.read_json(TRAIN_XL_PATH, lines=True)

df_gros

Unnamed: 0,qID,sentence,option1,option2,answer
0,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-2,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,2
1,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-1,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,1
2,3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-1,"He never comes to my home, but I always go to ...",home,house,1
3,3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-2,"He never comes to my home, but I always go to ...",home,house,2
4,3D5G8J4N5CI2K40F4RZLF9OG2CKVTH-2,"Kyle doesn't wear leg warmers to bed, while Lo...",Kyle,Logan,2
...,...,...,...,...,...
40393,3PKJ68EHDNUOUBAJ3ASD64MQ8GSJHJ-1,I felt lucky because when the fish slipped of ...,pole,net,1
40394,3W1K7D6QSDVJX2B852X30LVRM6WZBL-1,Dennis took months to write back to Kenneth's ...,Dennis,Kenneth,1
40395,3W1K7D6QSDVJX2B852X30LVRM6WZBL-2,Dennis took months to write back to Kenneth's ...,Dennis,Kenneth,2
40396,3BO3NEOQM2VSJ2H6ZK9L5F8V75BAIT-1,My hair looked better in a braid than a ponyta...,braid,ponytail,1


In [13]:
# Probability cut-offs for the three reported classes.
FOOD_THRESHOLD = 0.75        # proba >= 0.75          -> "nourriture"
UNCERTAIN_THRESHOLD = 0.25   # 0.25 <= proba < 0.75   -> "incertitude", below -> "autre"

# Load the fitted pipeline once for the whole cell.
# joblib.load unpickles the file: only use trusted model files.
model = joblib.load('nourriture_classifier.pkl')


def _classe_from_proba(proba):
    """Map a class-1 ("nourriture") probability to its class label."""
    if proba >= FOOD_THRESHOLD:
        return "nourriture"
    if proba >= UNCERTAIN_THRESHOLD:
        return "incertitude"
    return "autre"


# This definition replaces the earlier two-argument, dict-returning
# `predict_with_confidence` from the model-test cell.
def predict_with_confidence(text):
    """Predict the class of one sentence together with its probability.

    Parameters
    ----------
    text : str
        Sentence to classify.

    Returns
    -------
    pandas.Series
        'classe' ("nourriture", "incertitude" or "autre") and
        'probabilite_nourriture' (probability of class 1).
    """
    embedding = st_model.encode([text])
    proba = model.predict_proba(embedding)[0][1]  # probability of class 1
    return pd.Series({
        'classe': _classe_from_proba(proba),
        'probabilite_nourriture': proba
    })


# Score the whole corpus in batches: one encoder call and one predict_proba
# call for all sentences, instead of one of each per row through Series.apply.
sentence_embeddings = st_model.encode(df_gros['sentence'].tolist(), show_progress_bar=True)
food_probas = model.predict_proba(sentence_embeddings)[:, 1]

# Add the result columns (same names and order as before).
df_gros['classe'] = [_classe_from_proba(p) for p in food_probas]
df_gros['probabilite_nourriture'] = food_probas

In [14]:
# Show the corpus with the added 'classe' and 'probabilite_nourriture' columns.
df_gros

Unnamed: 0,qID,sentence,option1,option2,answer,classe,probabilite_nourriture
0,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-2,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,2,autre,0.010732
1,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-1,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,1,autre,0.010178
2,3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-1,"He never comes to my home, but I always go to ...",home,house,1,autre,0.039803
3,3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-2,"He never comes to my home, but I always go to ...",home,house,2,autre,0.037623
4,3D5G8J4N5CI2K40F4RZLF9OG2CKVTH-2,"Kyle doesn't wear leg warmers to bed, while Lo...",Kyle,Logan,2,autre,0.002095
...,...,...,...,...,...,...,...
40393,3PKJ68EHDNUOUBAJ3ASD64MQ8GSJHJ-1,I felt lucky because when the fish slipped of ...,pole,net,1,autre,0.006138
40394,3W1K7D6QSDVJX2B852X30LVRM6WZBL-1,Dennis took months to write back to Kenneth's ...,Dennis,Kenneth,1,autre,0.008117
40395,3W1K7D6QSDVJX2B852X30LVRM6WZBL-2,Dennis took months to write back to Kenneth's ...,Dennis,Kenneth,2,autre,0.027122
40396,3BO3NEOQM2VSJ2H6ZK9L5F8V75BAIT-1,My hair looked better in a braid than a ponyta...,braid,ponytail,1,autre,0.002265


In [15]:
# Save the classified corpus to CSV.
classified_csv = "classe_nourriture_entier_precision.csv"
df_gros.to_csv(classified_csv)