In [1]:
import os
from functools import lru_cache
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import torch
from nlpaug.augmenter.word import ContextualWordEmbsAug
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC




In [2]:
# Read the annotated sentences from the CSV file in the working directory.
annotated_csv = "phrases_annotees.csv"
df = pd.read_csv(annotated_csv)

In [3]:
# Keep only the rows whose label is not -1, then expose the text column
# under the name `sentence` (it is called `phrase` in the CSV).
has_usable_label = df['label'].ne(-1)
df = df.loc[has_usable_label].rename(columns={'phrase': 'sentence'})


In [4]:
# Display the filtered, renamed frame (the notebook truncates the rendering).
df

Unnamed: 0,qID,sentence,option1,option2,answer,label
3,3Y3CZJSZ9KRTC691AD0DJY5RDO45RP-1,Sue traditionally donated old clothes to her f...,checks,clothes,1,0
5,3CMIQF80GNOPJ9UF2HLI6E9M8HA6Q1-1,Mary had a better body and was more of a showo...,Mary,Emily,1,1
11,3UZUVSO3P7T2B9P2G6XZ1THUFBBEMB-1,Samantha was generally faster at running than ...,Samantha,Jessica,1,1
12,3KVQ0UJWPXJYK2QN97AJ6XD856EW5K-1,William showed Randy how to replace the paneli...,William,Randy,1,1
16,3PGQRAZX04YI9YI5K2HFC3WPJUFYSI-1,The device worked better than the phone becaus...,phone,device,1,0
...,...,...,...,...,...,...
1189,3HXCEECSQMR0GL5CVWZNGY5JCT8ZY6-1,Nick watched tapes of Jeffrey to get better at...,Nick,Jeffrey,1,1
1192,3E24UO25Q141SMG6725E972VO12O67-1,Victoria spent long hours at the library doing...,Victoria,Erin,1,0
1202,38XPGNCKHVEI3JLPPQYD71CPO1CV4A-1,Francine wanted to wears socks with her skirt ...,stockings,socks,1,0
1204,3QTFNPMJC8WLYB058S2CRGVK16WZNE-1,We did the dishes in the dishwasher instead of...,dishwasher,sink,1,0


In [5]:
# Split the sentences by label:
#   label 1 -> sentences annotated with "compétence"
#   label 0 -> sentences annotated without "compétence"
is_positive = df['label'] == 1
is_negative = df['label'] == 0
positive_texts = df.loc[is_positive, 'sentence'].tolist()
negative_texts = df.loc[is_negative, 'sentence'].tolist()

print(f"Exemples positifs : {len(positive_texts)}")
print(f"Exemples négatifs : {len(negative_texts)}")

Exemples positifs : 104
Exemples négatifs : 117


In [6]:
# Build the contextual augmenter (BERT-based word substitution).
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
aug_device = 'cuda' if torch.cuda.is_available() else 'cpu'
aug = ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",
    device=aug_device
)

# Augment only the positive class, which is the smaller one before augmentation.
# NOTE(review): with n=2 the positive class becomes the larger one
# (104 originals + 208 variants vs 117 negatives in the recorded run) — confirm
# this is intended; the classifier's class_weight='balanced' only re-weights it.
# NOTE(review): the variants are generated before any train/validation split, so
# a variant and its source sentence can end up in different CV folds downstream.
augmented_positive = []
for text in positive_texts:
    # Request 2 variants per sentence.
    augmented_positive.extend(aug.augment(text, n=2))

# Assemble the final corpus: originals and variants are labelled 1, negatives 0.
texts = positive_texts + augmented_positive + negative_texts
labels = [1]*(len(positive_texts)+len(augmented_positive)) + [0]*len(negative_texts)

print(f"Taille après augmentation : {len(texts)}")

Taille après augmentation : 429


In [7]:
# Load the sentence-embedding model.
st_model = SentenceTransformer('all-mpnet-base-v2')

# Turn every text into an embedding vector.
X = st_model.encode(texts, show_progress_bar=True)

# Labels as a numpy array.
y = np.array(labels)

# Fixed seed so that repeated runs of the search give the same scores.
SEED = 42

# Classifier: PCA for dimensionality reduction followed by an SVM.
pipeline = make_pipeline(
    # random_state only matters when PCA picks a randomized solver, which it
    # may do for the integer n_components values tried below.
    PCA(n_components=0.95, random_state=SEED),
    SVC(
        class_weight='balanced',  # re-weights classes inversely to their frequency
        kernel='linear',          # starting value; overridden by the grid below
        probability=True,         # enables predict_proba (fitted with an internal CV)
        random_state=SEED         # makes the probability calibration reproducible
    ))

# Hyper-parameter grid.
# gamma is only used by the non-linear kernels, so it is searched for 'rbf'
# only; crossing it with 'linear' would refit identical models twice.
shared_grid = {
    'pca__n_components': [0.85, 0.90, 0.95, 128, 256],  # variance ratio or component count
    'svc__C': [0.1, 0.5, 1, 5, 10],                     # regularisation strength
}
param_grid = [
    {**shared_grid, 'svc__kernel': ['linear']},
    {**shared_grid, 'svc__kernel': ['rbf'], 'svc__gamma': ['scale', 'auto']},  # RBF kernel coefficient
]

# Search configuration.
# NOTE(review): `texts` contains augmented variants of the positive sentences;
# a plain stratified split can place a variant and its source in different
# folds, which would make the cross-validated F1 optimistic.
grid = GridSearchCV(
    pipeline,               # pipeline to tune
    param_grid,             # candidate parameter sets
    cv=StratifiedKFold(5),  # 5-fold stratified cross-validation
    scoring='f1',           # F1 score of the positive class
    n_jobs=-1               # use every CPU core
)

# Run the search; the best pipeline is refitted on all the data.
grid.fit(X, y)

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

In [8]:
# Persist the best pipeline found by the grid search.
best_pipeline = grid.best_estimator_
joblib.dump(best_pipeline, 'expertise_classifier.pkl')

['expertise_classifier.pkl']

In [9]:
# Export the full cross-validation report of the grid search to CSV.
cv_report = grid.cv_results_
df_p = pd.DataFrame(cv_report)
df_p.to_csv("df_p.csv")

In [None]:
# 4. Smoke test of the saved model
@lru_cache(maxsize=None)
def _load_classifier(model_path):
    """Load the pickled pipeline once per path and reuse it on later calls.

    The cache lives as long as this function object: re-running this cell
    redefines the function and therefore reloads the file.
    """
    # joblib.load unpickles arbitrary objects: only point this at trusted files.
    return joblib.load(model_path)


def predict_with_confidence(text, model_path='expertise_classifier.pkl', threshold=0.75):
    """Classify one sentence with the saved pipeline.

    Parameters
    ----------
    text : str
        Sentence to classify.
    model_path : str
        Path of the pipeline saved with joblib.
    threshold : float
        The sentence is labelled "expertise" when the probability of class 1
        is strictly greater than this value, "other" otherwise.

    Returns
    -------
    dict
        'classe' (the label) and 'certitude_expertises' (the class-1
        probability formatted as a percentage with two decimals).
    """
    model = _load_classifier(model_path)
    embedding = st_model.encode([text])

    # Class probabilities of the single input row.
    probas = model.predict_proba(embedding)[0]

    return {
        'classe': "expertise" if probas[1] > threshold else "other",
        'certitude_expertises': f"{probas[1]*100:.2f}%",
    }


# Example sentences
test_phrases = [
    "Ian volunteered to eat Dennis's menudo after already having a bowl because _ enjoyed eating intestine.	",
    "I picked up a bag of peanuts and raisins for a snack. I wanted a sweeter snack out so I ate the _ for now.",
    "Ben had to either stop eating chocolates or nuts. He chose to stop eating the _ because they were sweet.",
    "William wanted an egg for breakfast, but Kevin ate the last one, so _ had to eat cereal.",
    "Since _ arrived at work sooner Dennis would eat their lunch before Adam in the break room at their job.",
    "Jean really liked the profiteroles but not the peas because the _ were very sweet.",
    "Katrina had an obsession with donuts and cake but not Sarah so _ was a heavy weight.",
    "William loved having fresh eggs for breakfast every morning but Brett hated eggs. _ bought a chicken to raise for eggs.",
    "While making her breakfast, Sam sweetened her oatmeal with honey. The _ was sweet and sticky.",
    "Adam placed the fruit into the bag instead  of the box, as the fruit fit well into the _ ."

]

for phrase in test_phrases:
    print(f"Phrase: {phrase}")
    print(f"Résultat: {predict_with_confidence(phrase)}\n")

Phrase: Ian volunteered to eat Dennis's menudo after already having a bowl because _ enjoyed eating intestine.	
Résultat: {'classe': 'other', 'certitude_expertises': '21.76%'}

Phrase: I picked up a bag of peanuts and raisins for a snack. I wanted a sweeter snack out so I ate the _ for now.
Résultat: {'classe': 'other', 'certitude_expertises': '0.01%'}

Phrase: Ben had to either stop eating chocolates or nuts. He chose to stop eating the _ because they were sweet.
Résultat: {'classe': 'other', 'certitude_expertises': '0.01%'}

Phrase: William wanted an egg for breakfast, but Kevin ate the last one, so _ had to eat cereal.
Résultat: {'classe': 'other', 'certitude_expertises': '14.25%'}

Phrase: Since _ arrived at work sooner Dennis would eat their lunch before Adam in the break room at their job.
Résultat: {'classe': 'other', 'certitude_expertises': '45.41%'}

Phrase: Jean really liked the profiteroles but not the peas because the _ were very sweet.
Résultat: {'classe': 'other', 'certit

In [11]:
# Directory that holds train_xl.jsonl. Set the WINO_DATA_DIR environment
# variable to run the notebook on another machine; the default is the
# original author's checkout.
WINO_DATA_DIR = Path(os.environ.get("WINO_DATA_DIR", "C:/Users/valde/Documents/GitHub/Wino-UROP"))

# One JSON record per line.
df_gros = pd.read_json(WINO_DATA_DIR / "train_xl.jsonl",
                      lines=True)

df_gros

Unnamed: 0,qID,sentence,option1,option2,answer
0,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-2,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,2
1,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-1,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,1
2,3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-1,"He never comes to my home, but I always go to ...",home,house,1
3,3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-2,"He never comes to my home, but I always go to ...",home,house,2
4,3D5G8J4N5CI2K40F4RZLF9OG2CKVTH-2,"Kyle doesn't wear leg warmers to bed, while Lo...",Kyle,Logan,2
...,...,...,...,...,...
40393,3PKJ68EHDNUOUBAJ3ASD64MQ8GSJHJ-1,I felt lucky because when the fish slipped of ...,pole,net,1
40394,3W1K7D6QSDVJX2B852X30LVRM6WZBL-1,Dennis took months to write back to Kenneth's ...,Dennis,Kenneth,1
40395,3W1K7D6QSDVJX2B852X30LVRM6WZBL-2,Dennis took months to write back to Kenneth's ...,Dennis,Kenneth,2
40396,3BO3NEOQM2VSJ2H6ZK9L5F8V75BAIT-1,My hair looked better in a braid than a ponyta...,braid,ponytail,1


In [12]:
# Reload the persisted pipeline.
# joblib.load unpickles arbitrary objects: only load files you trust.
model = joblib.load('expertise_classifier.pkl')


def _classe_from_proba(proba):
    """Map a class-1 probability to one of the three labels.

    >= 0.75 -> "expertise", >= 0.25 -> "incertitude", otherwise "autre".
    """
    if proba >= 0.75:
        return "expertise"
    if proba >= 0.25:
        return "incertitude"
    return "autre"


def predict_with_confidence(text):
    """Predict the class of one sentence together with its class-1 probability.

    Returns a pandas Series with the entries 'classe' and
    'probabilite_expertise'.
    """
    embedding = st_model.encode([text])
    proba = model.predict_proba(embedding)[0][1]  # probability of class 1

    return pd.Series({
        'classe': _classe_from_proba(proba),
        'probabilite_expertise': proba
    })


# Score the whole frame in one batched pass instead of calling the encoder and
# the classifier once per row: same labelling rule, far fewer model calls.
# Batched encoding may differ from per-sentence encoding by floating-point
# rounding only.
sentence_embeddings = st_model.encode(df_gros['sentence'].tolist(), show_progress_bar=True)
expertise_probas = model.predict_proba(sentence_embeddings)[:, 1]

# Add the result columns (same names and order as the per-row version).
df_gros['classe'] = [_classe_from_proba(p) for p in expertise_probas]
df_gros['probabilite_expertise'] = expertise_probas

In [13]:
# Display the frame with the added `classe` and `probabilite_expertise` columns.
df_gros

Unnamed: 0,qID,sentence,option1,option2,answer,classe,probabilite_expertise
0,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-2,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,2,autre,0.103162
1,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-1,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,1,autre,0.217559
2,3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-1,"He never comes to my home, but I always go to ...",home,house,1,autre,0.035881
3,3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-2,"He never comes to my home, but I always go to ...",home,house,2,autre,0.020670
4,3D5G8J4N5CI2K40F4RZLF9OG2CKVTH-2,"Kyle doesn't wear leg warmers to bed, while Lo...",Kyle,Logan,2,autre,0.085055
...,...,...,...,...,...,...,...
40393,3PKJ68EHDNUOUBAJ3ASD64MQ8GSJHJ-1,I felt lucky because when the fish slipped of ...,pole,net,1,incertitude,0.696617
40394,3W1K7D6QSDVJX2B852X30LVRM6WZBL-1,Dennis took months to write back to Kenneth's ...,Dennis,Kenneth,1,incertitude,0.274686
40395,3W1K7D6QSDVJX2B852X30LVRM6WZBL-2,Dennis took months to write back to Kenneth's ...,Dennis,Kenneth,2,autre,0.126779
40396,3BO3NEOQM2VSJ2H6ZK9L5F8V75BAIT-1,My hair looked better in a braid than a ponyta...,braid,ponytail,1,autre,0.001409


In [14]:
# Write the scored frame (index included) to a CSV file in the working directory.
scored_csv = "classe_expertise_entier.csv"
df_gros.to_csv(scored_csv)