In [68]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score, precision_score, make_scorer

from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import numpy as np
from nlpaug.augmenter.word import ContextualWordEmbsAug
import joblib
from sentence_transformers import SentenceTransformer
import re

In [94]:
# Charger le fichier CSV
df = pd.read_csv("class_2_cut.csv")  # Remplace par le nom réel

def extract_sentence(text_string):
    # Check if the input is a string before applying regex
    if not isinstance(text_string, str):
        return None # Return None for non-string inputs (like NaN)

    # Regex pour trouver 'sentence ' suivi de n'importe quel caractère (.*?)
    # jusqu'à la prochaine virgule et l'espace ', ' (qui précède généralement 'option1' ou d'autres clés)
    # ou jusqu'à la fin de la chaîne si rien ne suit.
    match = re.search(r'sentence\s(.*?)(?:,\soption1|, answer|\})$', text_string)

    if match:
        return match.group(1).strip() # .strip() pour supprimer les espaces blancs en début/fin
    return None # Retourne None si la phrase n'est pas trouvée (cas d'erreur ou format inattendu)


sentiment_mapping = {
    'hesitation, question, advice': 0,
    'comparison of performance, road to sucess': 0,
    'other': 0,
    'Failure': 0,
    'relationship, helping' : 0,
    'performance, sucess, competence' : 1,
    'nan' : 0,
    # Ajoutez d'autres sentiments si vous en avez, et mappez-les à 0 ou 1
}

df['label'] = df['sentiment'].map(sentiment_mapping)
df['text'] = df['text'].apply(extract_sentence)
df['label'].fillna(0, inplace=True)

# Option B: Si vous voulez que SEULEMENT un sentiment spécifique (par exemple 'succes') soit 1, et tout le reste 0
# Décommentez et utilisez cette ligne à la place de l'Option A si c'est votre cas
# df_original['succes'] = (df_original['sentiment'] == 'succes').astype(int)

# --- Conversion en dictionnaire du format désiré ---
data = {
    'text': df['text'].tolist(),
    'label': df['label'].tolist()
}


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['label'].fillna(0, inplace=True)


In [95]:
# Extraction des phrases positives et négatives
positive_texts = df[df['label'] == 1]['text'].tolist()  # Phrases avec "nsp"
negative_texts = df[df['label'] == 0]['text'].tolist()  # Phrases sans "nsp"

print(f"Exemples positifs : {len(positive_texts)}")
print(f"Exemples négatifs : {len(negative_texts)}")

Exemples positifs : 11
Exemples négatifs : 187


In [96]:
# Initialisation de l'augmenteur
aug = ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",
    device='cuda'
)

# Augmentation seulement pour la classe minoritaire (positive)
augmented_positive = []
for text in positive_texts:
    augmented_positive += aug.augment(text, n=2)  # Génère 2 variantes par phrase

# Combinaison des données
texts = positive_texts + augmented_positive + negative_texts
labels = [1]*(len(positive_texts)+len(augmented_positive)) + [0]*len(negative_texts)

print(f"Taille après augmentation : {len(texts)}")

Taille après augmentation : 220


In [97]:
# Chargement du modèle de transformation de phrases
st_model = SentenceTransformer('all-mpnet-base-v2', device='cuda')  # Utilisation de GPU si disponible

# Conversion des textes en embeddings
X = st_model.encode(texts, show_progress_bar=True)

# Conversion des labels en array numpy
y = np.array(labels)

# Création classifieur
pipeline = make_pipeline(
    PCA(n_components=0.95),  # Réduction de dimensionnalité
    SVC(  # Classificateur à Machines à Vecteurs de Support
        class_weight='balanced',  # Compense le déséquilibre des classes
        kernel='linear',          # Type de noyau par défaut
        probability=True          # Active l'estimation des probabilités 
    ))

# Recherche d'hyperparamètres
param_grid = {
    'pca__n_components': [0.85, 0.90, 0.95, 128, 256], # pourcentage de variance ou nombre de composantes
    'svc__C': [0.1, 0.5, 1, 5, 10], #régularisation
    'svc__kernel': ['linear', 'rbf'], # séparation linéaire ou non linéaire
    'svc__gamma': ['scale', 'auto'] # ?
}

scorers = {
    'f1_score': make_scorer(f1_score),
    'precision_score': make_scorer(precision_score)
}
# Configuration de la recherche d'hyperparamètres
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), # Ajout de shuffle et random_state pour reproductibilité
    scoring=scorers,      # Maintenant un dictionnaire de métriques
    refit='f1_score',     # Indique quelle métrique doit être utilisée pour sélectionner le meilleur modèle final
    n_jobs=-1,
    return_train_score=True # Pour voir les scores d'entraînement également si désiré
)


# Entraînement du modèle avec recherche d'hyperparamètres
grid.fit(X, y)

The following layers were not sharded: encoder.layer.*.attention.attn.o.bias, encoder.relative_attention_bias.weight, pooler.dense.bias, encoder.layer.*.output.LayerNorm.bias, encoder.layer.*.attention.attn.q.weight, embeddings.word_embeddings.weight, encoder.layer.*.attention.attn.q.bias, encoder.layer.*.attention.attn.k.weight, encoder.layer.*.attention.LayerNorm.weight, encoder.layer.*.output.dense.bias, embeddings.LayerNorm.weight, encoder.layer.*.intermediate.dense.weight, embeddings.LayerNorm.bias, encoder.layer.*.attention.attn.o.weight, encoder.layer.*.attention.attn.v.weight, encoder.layer.*.intermediate.dense.bias, encoder.layer.*.attention.attn.v.bias, embeddings.position_embeddings.weight, encoder.layer.*.output.dense.weight, encoder.layer.*.attention.LayerNorm.bias, encoder.layer.*.attention.attn.k.bias, pooler.dense.weight, encoder.layer.*.output.LayerNorm.weight
Batches: 100%|██████████| 7/7 [00:00<00:00, 18.46it/s]
100 fits failed out of a total of 500.
The score on the

0,1,2
,estimator,Pipeline(step...ility=True))])
,param_grid,"{'pca__n_components': [0.85, 0.9, ...], 'svc__C': [0.1, 0.5, ...], 'svc__gamma': ['scale', 'auto'], 'svc__kernel': ['linear', 'rbf']}"
,scoring,"{'f1_score': make_scorer(f...hod='predict'), 'precision_score': make_scorer(p...hod='predict')}"
,n_jobs,-1
,refit,'f1_score'
,cv,StratifiedKFo... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,n_components,0.85
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,C,1
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [98]:
joblib.dump(grid.best_estimator_, 'classifier_V2_performance.pkl')

['classifier_V2_performance.pkl']

In [100]:
df_p = pd.DataFrame(grid.cv_results_)
df_p.to_csv("param_classifier_V2_performance.csv")

In [None]:
# 4. Test du modèle
def predict_with_confidence(text, model_path='classifier_V2_.pkl'):
    # Charger le modèle et générer l'embedding
    model = joblib.load(model_path)
    embedding = st_model.encode([text])
    
    # Récupérer les probabilités
    probas = model.predict_proba(embedding)[0]
    
    return {
        
        'classe': "relationship" if probas[1] > 0.75 else "other",
        'certitude': f"{probas[1]*100:.2f}%",
    }
# Test avec des exemples
test_phrases = [
    "He never comes to my home, but I always go to his house because the _ is ",
    "I picked up a bag of peanuts and raisins for a snack. I wanted a sweeter snack out so I ate the _ for now.",
    "Ben had to either stop eating chocolates or nuts. He chose to stop eating the _ because they were sweet.",
    "William wanted an egg for breakfast, but Kevin ate the last one, so _ had to eat cereal.",
    "Since _ arrived at work sooner Dennis would eat their lunch before Adam in the break room at their job.",
    "Jean really liked the profiteroles but not the peas because the _ were very sweet.",
    "Katrina had an obsession with donuts and cake but not Sarah so _ was a heavy weight.",
    "William loved having fresh eggs for breakfast every morning but Brett hated eggs. _ bought a chicken to raise for eggs.",
    "While making her breakfast, Sam sweetened her oatmeal with honey. The _ was sweet and sticky.",
    "Adam placed the fruit into the bag instead of the box, as the fruit fit well into the _ ."

]

for phrase in test_phrases:
    print(f"Phrase: {phrase}")
    print(f"Résultat: {predict_with_confidence(phrase)}\n")

Phrase: He never comes to my home, but I always go to his house because the _ is 
Résultat: {'classe': 'other', 'certitude': '6.88%'}

Phrase: I picked up a bag of peanuts and raisins for a snack. I wanted a sweeter snack out so I ate the _ for now.
Résultat: {'classe': 'other', 'certitude': '1.76%'}

Phrase: Ben had to either stop eating chocolates or nuts. He chose to stop eating the _ because they were sweet.
Résultat: {'classe': 'other', 'certitude': '2.87%'}

Phrase: William wanted an egg for breakfast, but Kevin ate the last one, so _ had to eat cereal.
Résultat: {'classe': 'other', 'certitude': '0.81%'}

Phrase: Since _ arrived at work sooner Dennis would eat their lunch before Adam in the break room at their job.
Résultat: {'classe': 'other', 'certitude': '3.50%'}

Phrase: Jean really liked the profiteroles but not the peas because the _ were very sweet.
Résultat: {'classe': 'other', 'certitude': '18.96%'}

Phrase: Katrina had an obsession with donuts and cake but not Sarah so 

In [101]:
df_pred_comp = pd.read_json(r"C:\Users\User\Documents\GitHub\Wino-UROP\winogrande_1.1\train_xl.jsonl", 
                      lines=True)


In [102]:
import joblib
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch

classe = 'performance'

# 1. Charger le modèle Sentence Transformer et s'assurer qu'il utilise le GPU
# Spécifiez 'cuda' comme device si un GPU est disponible.
# Si vous avez plusieurs GPUs, vous pouvez spécifier un index comme 'cuda:0', 'cuda:1', etc.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Le modèle Sentence Transformer utilisera : {device}")


# 2. Charger votre classifieur pré-entraîné (celui-ci restera sur CPU)
model = joblib.load('classifier_V2_performance.pkl')

# Votre DataFrame avec les phrases à prédire
# Exemple:
# data = {'sentence': ["Ceci est un plat délicieux", "J'aime regarder des films", "Le dessert était succulent", "Le climat est doux"]}
# df_pred_comp = pd.DataFrame(data)

# --- Processus optimisé pour le GPU ---

# 3. Encoder toutes les phrases en une seule fois (ou par lots si le DataFrame est immense)
# Cela enverra toutes les phrases au GPU pour l'encodage.
print("Encodage des phrases avec le modèle Sentence Transformer (sur GPU si disponible)...")
embeddings = st_model.encode(df_pred_comp['sentence'].tolist(), convert_to_tensor=True, show_progress_bar=True)
# Si vous utilisez convert_to_tensor=True, les embeddings seront déjà des tenseurs PyTorch sur le bon device.
# Si votre classifieur scikit-learn attend un tableau NumPy, vous devrez le reconvertir:
# embeddings_np = embeddings.cpu().numpy() # Convertit en NumPy et le déplace du GPU vers le CPU si nécessaire

print("Encodage terminé. Début de la prédiction du classifieur...")

# 4. Effectuer la prédiction avec le classifieur
# Le classifieur (probablement scikit-learn) travaillera sur le CPU
proba = model.predict_proba(embeddings.cpu().numpy())[:, 1] # Assurez-vous que les embeddings sont sur CPU et en NumPy

print("Prédiction du classifieur terminée. Détermination des classes...")

# 5. Appliquer la logique de détermination de classe
# Vectorisation de la logique pour éviter un .apply() Python-lourd sur les probabilités
def determine_class(p):
    if p >= 0.75:
        return classe
    elif p >= 0.25:
        return "incertitude"
    else:
        return "autre"

# Appliquer la fonction vectorisée aux probabilités
df_pred_comp['probabilite_nourriture'] = proba
df_pred_comp['classe'] = df_pred_comp['probabilite_nourriture'].apply(determine_class)

print("Processus terminé. Votre DataFrame est mis à jour.")
print(df_pred_comp.head())

Le modèle Sentence Transformer utilisera : cuda
Encodage des phrases avec le modèle Sentence Transformer (sur GPU si disponible)...


Batches: 100%|██████████| 1263/1263 [00:17<00:00, 70.95it/s]


Encodage terminé. Début de la prédiction du classifieur...
Prédiction du classifieur terminée. Détermination des classes...
Processus terminé. Votre DataFrame est mis à jour.
                                qID  \
0  3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-2   
1  3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-1   
2  3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-1   
3  3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-2   
4  3D5G8J4N5CI2K40F4RZLF9OG2CKVTH-2   

                                            sentence option1 option2  answer  \
0  Ian volunteered to eat Dennis's menudo after a...     Ian  Dennis       2   
1  Ian volunteered to eat Dennis's menudo after a...     Ian  Dennis       1   
2  He never comes to my home, but I always go to ...    home   house       1   
3  He never comes to my home, but I always go to ...    home   house       2   
4  Kyle doesn't wear leg warmers to bed, while Lo...    Kyle   Logan       2   

   probabilite_nourriture classe  
0                0.005383  autre  
1                0.012649  autre  
2   

In [90]:
df_pred_comp

Unnamed: 0,qID,sentence,option1,option2,answer,probabilite_nourriture,classe
0,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-2,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,2,0.054461,autre
1,3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-1,Ian volunteered to eat Dennis's menudo after a...,Ian,Dennis,1,0.042823,autre
2,3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-1,"He never comes to my home, but I always go to ...",home,house,1,0.069884,autre
3,3XWUWJ18TLO2DDRXF83QWLKRJ29UU4-2,"He never comes to my home, but I always go to ...",home,house,2,0.091231,autre
4,3D5G8J4N5CI2K40F4RZLF9OG2CKVTH-2,"Kyle doesn't wear leg warmers to bed, while Lo...",Kyle,Logan,2,0.007734,autre
...,...,...,...,...,...,...,...
40393,3PKJ68EHDNUOUBAJ3ASD64MQ8GSJHJ-1,I felt lucky because when the fish slipped of ...,pole,net,1,0.858046,failure
40394,3W1K7D6QSDVJX2B852X30LVRM6WZBL-1,Dennis took months to write back to Kenneth's ...,Dennis,Kenneth,1,0.734849,incertitude
40395,3W1K7D6QSDVJX2B852X30LVRM6WZBL-2,Dennis took months to write back to Kenneth's ...,Dennis,Kenneth,2,0.845434,failure
40396,3BO3NEOQM2VSJ2H6ZK9L5F8V75BAIT-1,My hair looked better in a braid than a ponyta...,braid,ponytail,1,0.065517,autre


In [103]:
df_pred_comp.to_csv("df_pred_performance.csv")