In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import numpy as np
from nlpaug.augmenter.word import ContextualWordEmbsAug
import joblib
from sentence_transformers import SentenceTransformer
import re

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Charger le fichier CSV
df = pd.read_csv("class_2_cut.csv")  # Remplace par le nom réel

def extract_sentence(text_string):
    # Check if the input is a string before applying regex
    if not isinstance(text_string, str):
        return None # Return None for non-string inputs (like NaN)

    # Regex pour trouver 'sentence ' suivi de n'importe quel caractère (.*?)
    # jusqu'à la prochaine virgule et l'espace ', ' (qui précède généralement 'option1' ou d'autres clés)
    # ou jusqu'à la fin de la chaîne si rien ne suit.
    match = re.search(r'sentence\s(.*?)(?:,\soption1|, answer|\})$', text_string)

    if match:
        return match.group(1).strip() # .strip() pour supprimer les espaces blancs en début/fin
    return None # Retourne None si la phrase n'est pas trouvée (cas d'erreur ou format inattendu)


sentiment_mapping = {
    'hesitation, question, advice': 0,
    'comparison of performance, road to sucess': 1,
    'other': 0,
    'Failure': 0,
    'relationship, helping' : 0,
    'performance, sucess, competence' : 0,
    'nan' : 0,
    # Ajoutez d'autres sentiments si vous en avez, et mappez-les à 0 ou 1
}

df['label'] = df['sentiment'].map(sentiment_mapping)
df['text'] = df['text'].apply(extract_sentence)
df['label'].fillna(0, inplace=True)

# Option B: Si vous voulez que SEULEMENT un sentiment spécifique (par exemple 'succes') soit 1, et tout le reste 0
# Décommentez et utilisez cette ligne à la place de l'Option A si c'est votre cas
# df_original['succes'] = (df_original['sentiment'] == 'succes').astype(int)

# --- Conversion en dictionnaire du format désiré ---
data = {
    'text': df['text'].tolist(),
    'label': df['label'].tolist()
}


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['label'].fillna(0, inplace=True)


In [13]:
# Extraction des phrases positives et négatives
positive_texts = df[df['label'] == 1]['text'].tolist()  # Phrases avec "nsp"
negative_texts = df[df['label'] == 0]['text'].tolist()  # Phrases sans "nsp"

print(f"Exemples positifs : {len(positive_texts)}")
print(f"Exemples négatifs : {len(negative_texts)}")

Exemples positifs : 41
Exemples négatifs : 157


In [14]:
# Initialisation de l'augmenteur
aug = ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",
    device='cuda'
)

# Augmentation seulement pour la classe minoritaire (positive)
augmented_positive = []
for text in positive_texts:
    augmented_positive += aug.augment(text, n=2)  # Génère 2 variantes par phrase

# Combinaison des données
texts = positive_texts + augmented_positive + negative_texts
labels = [1]*(len(positive_texts)+len(augmented_positive)) + [0]*len(negative_texts)

print(f"Taille après augmentation : {len(texts)}")

Taille après augmentation : 280


In [26]:
# Chargement du modèle de transformation de phrases
st_model = SentenceTransformer('all-mpnet-base-v2')

# Conversion des textes en embeddings
X = st_model.encode(texts, show_progress_bar=True)

# Conversion des labels en array numpy
y = np.array(labels)

# Création classifieur
pipeline = make_pipeline(
    PCA(n_components=0.95),  # Réduction de dimensionnalité
    SVC(  # Classificateur à Machines à Vecteurs de Support
        class_weight='balanced',  # Compense le déséquilibre des classes
        kernel='linear',          # Type de noyau par défaut
        probability=True          # Active l'estimation des probabilités 
    ))

# Recherche d'hyperparamètres
param_grid = {
    'pca__n_components': [0.85, 0.90, 0.95, 128, 256], # pourcentage de variance ou nombre de composantes
    'svc__C': [0.1, 0.5, 1, 5, 10], #régularisation
    'svc__kernel': ['linear', 'rbf'], # séparation linéaire ou non linéaire
    'svc__gamma': ['scale', 'auto'] # ?
}

# Configuration de la recherche d'hyperparamètres
grid = GridSearchCV(
    pipeline,         # Pipeline à optimiser
    param_grid,       # Grille de paramètres à explorer
    cv=StratifiedKFold(5),  # Validation croisée stratifiée
    scoring='precision',     # Métrique d'évaluation (F1-score)
    n_jobs=-1         # Utilisation de tous les coeurs CPU
)

# Entraînement du modèle avec recherche d'hyperparamètres
grid.fit(X, y)

The following layers were not sharded: encoder.layer.*.attention.attn.o.bias, encoder.relative_attention_bias.weight, pooler.dense.bias, encoder.layer.*.output.LayerNorm.bias, encoder.layer.*.attention.attn.q.weight, embeddings.word_embeddings.weight, encoder.layer.*.attention.attn.q.bias, encoder.layer.*.attention.attn.k.weight, encoder.layer.*.attention.LayerNorm.weight, encoder.layer.*.output.dense.bias, embeddings.LayerNorm.weight, encoder.layer.*.intermediate.dense.weight, embeddings.LayerNorm.bias, encoder.layer.*.attention.attn.o.weight, encoder.layer.*.attention.attn.v.weight, encoder.layer.*.intermediate.dense.bias, encoder.layer.*.attention.attn.v.bias, embeddings.position_embeddings.weight, encoder.layer.*.output.dense.weight, encoder.layer.*.attention.LayerNorm.bias, encoder.layer.*.attention.attn.k.bias, pooler.dense.weight, encoder.layer.*.output.LayerNorm.weight
Batches: 100%|██████████| 9/9 [00:00<00:00, 20.99it/s]
100 fits failed out of a total of 500.
The score on the

0,1,2
,estimator,Pipeline(step...ility=True))])
,param_grid,"{'pca__n_components': [0.85, 0.9, ...], 'svc__C': [0.1, 0.5, ...], 'svc__gamma': ['scale', 'auto'], 'svc__kernel': ['linear', 'rbf']}"
,scoring,'precision'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo...shuffle=False)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_components,0.95
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,C,5
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [19]:
joblib.dump(grid.best_estimator_, 'classifier_V2_comparison.pkl')

['classifier_V2_comparison.pkl']

In [21]:
df_p = pd.DataFrame(grid.cv_results_)
df_p.to_csv("param_classifier_V2_comparison.csv")

In [25]:
# 4. Test du modèle
def predict_with_confidence(text, model_path='classifier_V2_comparison.pkl'):
    # Charger le modèle et générer l'embedding
    model = joblib.load(model_path)
    embedding = st_model.encode([text])
    
    # Récupérer les probabilités
    probas = model.predict_proba(embedding)[0]
    
    return {
        
        'classe': "comparison" if probas[1] > 0.75 else "other",
        'certitude': f"{probas[1]*100:.2f}%",
    }
# Test avec des exemples
test_phrases = [
    "He never comes to my home, but I always go to his house because the _ is ",
    "I picked up a bag of peanuts and raisins for a snack. I wanted a sweeter snack out so I ate the _ for now.",
    "Ben had to either stop eating chocolates or nuts. He chose to stop eating the _ because they were sweet.",
    "William wanted an egg for breakfast, but Kevin ate the last one, so _ had to eat cereal.",
    "Since _ arrived at work sooner Dennis would eat their lunch before Adam in the break room at their job.",
    "Jean really liked the profiteroles but not the peas because the _ were very sweet.",
    "Katrina had an obsession with donuts and cake but not Sarah so _ was a heavy weight.",
    "William loved having fresh eggs for breakfast every morning but Brett hated eggs. _ bought a chicken to raise for eggs.",
    "While making her breakfast, Sam sweetened her oatmeal with honey. The _ was sweet and sticky.",
    "Adam placed the fruit into the bag instead of the box, as the fruit fit well into the _ ."

]

for phrase in test_phrases:
    print(f"Phrase: {phrase}")
    print(f"Résultat: {predict_with_confidence(phrase)}\n")

Phrase: He never comes to my home, but I always go to his house because the _ is 
Résultat: {'classe': 'comparison', 'certitude': '100.00%'}

Phrase: I picked up a bag of peanuts and raisins for a snack. I wanted a sweeter snack out so I ate the _ for now.
Résultat: {'classe': 'other', 'certitude': '0.35%'}

Phrase: Ben had to either stop eating chocolates or nuts. He chose to stop eating the _ because they were sweet.
Résultat: {'classe': 'other', 'certitude': '22.66%'}

Phrase: William wanted an egg for breakfast, but Kevin ate the last one, so _ had to eat cereal.
Résultat: {'classe': 'other', 'certitude': '8.52%'}

Phrase: Since _ arrived at work sooner Dennis would eat their lunch before Adam in the break room at their job.
Résultat: {'classe': 'other', 'certitude': '60.52%'}

Phrase: Jean really liked the profiteroles but not the peas because the _ were very sweet.
Résultat: {'classe': 'other', 'certitude': '16.63%'}

Phrase: Katrina had an obsession with donuts and cake but not 

In [11]:
df_gros = pd.read_json("C:/Users/valde/Documents/GitHub/Wino-UROP/train_xl.jsonl", 
                      lines=True)

df_gros = df_gros
df_gros

Unnamed: 0,qID,sentence,option1,option2,answer
0,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-2,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,2
1,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-1,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,1
2,3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-1,"He never comes to my home, but I always go to ...",home,house,1
3,3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-2,"He never comes to my home, but I always go to ...",home,house,2
4,3D5G8J4N5CI2K40F4RZLF9OG2CKVTH-2,"Kyle doesn't wear leg warmers to bed, while Lo...",Kyle,Logan,2
...,...,...,...,...,...
40393,3PKJ68EHDNUOUBAJ3ASD64MQ8GSJHJ-1,I felt lucky because when the fish slipped of ...,pole,net,1
40394,3W1K7D6QSDVJX2B852X30LVRM6WZBL-1,Dennis took months to write back to Kenneth's ...,Dennis,Kenneth,1
40395,3W1K7D6QSDVJX2B852X30LVRM6WZBL-2,Dennis took months to write back to Kenneth's ...,Dennis,Kenneth,2
40396,3BO3NEOQM2VSJ2H6ZK9L5F8V75BAIT-1,My hair looked better in a braid than a ponyta...,braid,ponytail,1


In [12]:
model = joblib.load('nourriture_classifier.pkl')
def predict_with_confidence(text):
    """Prédit la classe avec niveaux de certitude"""
    embedding = st_model.encode([text])
    proba = model.predict_proba(embedding)[0][1]  # Probabilité nourriture
    
    # Détermination de la classe
    if proba >= 0.75:
        classe = "nourriture"
    elif proba >= 0.25:
        classe = "incertitude"
    else:
        classe = "autre"
    
    return pd.Series({
        'classe': classe,
        'probabilite_nourriture': proba
    })

# Ajout des colonnes résultats
df_gros[['classe', 'probabilite_nourriture']] = df_gros['sentence'].apply(predict_with_confidence)

KeyboardInterrupt: 

In [None]:
df_gros

Unnamed: 0,qID,sentence,option1,option2,answer,classe,probabilite_nourriture
0,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-2,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,2,autre,0.013736
1,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-1,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,1,autre,0.011170
2,3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-1,"He never comes to my home, but I always go to ...",home,house,1,autre,0.023447
3,3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-2,"He never comes to my home, but I always go to ...",home,house,2,autre,0.019522
4,3D5G8J4N5CI2K40F4RZLF9OG2CKVTH-2,"Kyle doesn't wear leg warmers to bed, while Lo...",Kyle,Logan,2,autre,0.005026
...,...,...,...,...,...,...,...
40393,3PKJ68EHDNUOUBAJ3ASD64MQ8GSJHJ-1,I felt lucky because when the fish slipped of ...,pole,net,1,autre,0.006103
40394,3W1K7D6QSDVJX2B852X30LVRM6WZBL-1,Dennis took months to write back to Kenneth's ...,Dennis,Kenneth,1,autre,0.005914
40395,3W1K7D6QSDVJX2B852X30LVRM6WZBL-2,Dennis took months to write back to Kenneth's ...,Dennis,Kenneth,2,autre,0.016394
40396,3BO3NEOQM2VSJ2H6ZK9L5F8V75BAIT-1,My hair looked better in a braid than a ponyta...,braid,ponytail,1,autre,0.000490


In [None]:
df_gros.to_csv("classe_nourriture_entier.csv")