In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as TorchDataset  # aliased so it never shadows datasets.Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, TrainingArguments, Trainer

# Load the pre-processed Glassdoor reviews
df = pd.read_csv("glassdoor_reviews_proceeded.csv")  # Adjust the path as needed
pd.set_option('display.max_columns', None)

# Firms whose reviews are used for fine-tuning.
# NOTE(review): the variable is called "top_10" but head(1) keeps only the single
# most-reviewed firm — confirm this reduction is intended.
noms_firmes_top_10 = df['firm'].value_counts().head(1).index.tolist()

# Boolean mask selecting the reviews of those firms
mask = df['firm'].isin(noms_firmes_top_10)

# Explicit copy so the column assignments below never write into a view of `df`
df_filtre = df[mask].copy()

# Merge headline, pros and cons into a single text field (missing parts become '')
df_filtre["full_text_raw_2"] = df_filtre["headline"].fillna('') + " " + df_filtre["pros"].fillna('') + " " + df_filtre["cons"].fillna('')

# Keep only reviews with a known, non-neutral rating.
# A missing rating must be dropped here: it would otherwise pass the `!= 3` filter
# and be labelled 1 below. The trailing .copy() avoids the SettingWithCopyWarning
# on the `label` assignment.
rating_is_usable = df_filtre['overall_rating'].notna() & (df_filtre['overall_rating'] != 3)
df_filtre = df_filtre[rating_is_usable].copy()

# Binary target: 0 = rating above 3, 1 = rating below 3.
# NOTE(review): the SST-2 checkpoint loaded below is published with 0 = NEGATIVE /
# 1 = POSITIVE, i.e. the opposite orientation — confirm the inversion is intended.
df_filtre['label'] = (df_filtre['overall_rating'] <= 3).astype(int)

# Defensive clean-up; neither column can be NaN at this point
df_clean = df_filtre.dropna(subset=['full_text_raw_2', 'label'])

# Model inputs (texts) and targets (labels)
X = df_clean["full_text_raw_2"].tolist()
y = df_clean["label"].tolist()

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

train_data = Dataset.from_dict({'text': X_train, 'label': y_train})
test_data = Dataset.from_dict({'text': X_test, 'label': y_test})


# 3. Load the tokenizer of the checkpoint (DistilBERT already fine-tuned for classification)
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize the ``'text'`` entries of a batch with the module-level tokenizer.

    Every example is truncated and padded to exactly 512 tokens, whatever its
    actual length.

    Args:
        examples: mapping with a ``'text'`` entry, as passed by ``Dataset.map``.

    Returns:
        The tokenizer output for the batch.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 512,
    }
    return tokenizer(examples['text'], **tokenizer_options)

# Tokenize both splits, feeding whole batches of examples to preprocess_function
train_data = train_data.map(function=preprocess_function, batched=True)
test_data = test_data.map(function=preprocess_function, batched=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtre['label'] = df_filtre['overall_rating'].apply(lambda x: 0 if x > 3 else 1)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/34680 [00:00<?, ? examples/s]

Map:   0%|          | 0/8670 [00:00<?, ? examples/s]

In [None]:
# 5. Load the sequence-classification model with two output classes
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# 6. Training arguments
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Schedule: a single epoch, evaluated once the epoch is finished
    num_train_epochs=1,
    eval_strategy="epoch",
    # Per-device batch sizes
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Log every 10 steps, without reporting to any tracking integration
    logging_steps=10,
    report_to='none',
)

# 7. Metric function
def compute_metrics(eval_pred):
    """Compute the accuracy reported during evaluation.

    Args:
        eval_pred: object that unpacks into ``(logits, labels)``.

    Returns:
        dict with a single ``'accuracy'`` entry.
    """
    raw_scores, true_labels = eval_pred
    # The predicted class is the index of the highest logit on the last axis
    predicted_classes = raw_scores.argmax(axis=-1)
    accuracy = accuracy_score(true_labels, predicted_classes)
    return {'accuracy': accuracy}

# 8. Trainer (shows a progress bar automatically)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    # `tokenizer=` is deprecated on Trainer in recent transformers releases;
    # `processing_class=` is its replacement (requires transformers >= 4.46).
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()  # Displays the progress bar with the completion percentage

# 9. Final evaluation on the held-out split
eval_results = trainer.evaluate()
print(f"Test accuracy: {eval_results['eval_accuracy']:.2f}")


  trainer = Trainer(


In [None]:
# Prediction and per-firm analysis.
# The torch Dataset base class is imported at the top of the notebook under the alias
# `TorchDataset`, so the name `Dataset` keeps referring to `datasets.Dataset` and the
# first cell stays re-runnable after this one.
print("load")

# Names of the 10 firms with the most reviews
noms_firmes_top_10 = df['firm'].value_counts().head(10).index.tolist()

# NOTE(review): the model was fine-tuned on reviews of the single most-reviewed firm,
# which is also part of this selection, so some rows scored here were seen during
# training — confirm this overlap is acceptable for the reported metrics.

# Boolean mask selecting the reviews of those firms
mask = df['firm'].isin(noms_firmes_top_10)

# Explicit copy so the column assignments below never write into a view of `df`
df_filtre = df[mask].copy()

# Merge headline, pros and cons into a single text field (missing parts become '')
df_filtre["full_text_raw_2"] = df_filtre["headline"].fillna('') + " " + df_filtre["pros"].fillna('') + " " + df_filtre["cons"].fillna('')

# Keep only reviews with a known, non-neutral rating.
# A missing rating must be dropped here: it would otherwise pass the `!= 3` filter
# and be labelled 1 below. The trailing .copy() avoids the SettingWithCopyWarning
# on the `label` assignment.
rating_is_usable = df_filtre['overall_rating'].notna() & (df_filtre['overall_rating'] != 3)
df_filtre = df_filtre[rating_is_usable].copy()

# Binary target: 0 = rating above 3, 1 = rating below 3
df_filtre['label'] = (df_filtre['overall_rating'] <= 3).astype(int)

# Defensive clean-up; neither column can be NaN at this point
df_clean = df_filtre.dropna(subset=['full_text_raw_2', 'label'])


class TextDataset(TorchDataset):
    """Minimal map-style dataset that serves raw texts by position."""

    def __init__(self, texts):
        # texts: indexable collection of texts (a list in this notebook)
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]


print("load1")

def collate_fn(batch):
    """Tokenize one batch of raw texts into PyTorch tensors.

    Args:
        batch: the items gathered by the DataLoader for one batch (raw texts
            when used with ``TextDataset``).

    Returns:
        The tokenizer output with truncation applied, padding enabled and
        ``"pt"`` tensors.
    """
    tokenizer_options = dict(truncation=True, padding=True, return_tensors="pt")
    return tokenizer(batch, **tokenizer_options)

# Run on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# Force inference mode (dropout disabled) regardless of which cells ran before this one
model.eval()

dataset = TextDataset(df_clean['full_text_raw_2'].fillna('').tolist())
loader = DataLoader(
    dataset,
    batch_size=32,
    collate_fn=collate_fn,
    num_workers=2,
    # Pinned host memory is only useful when batches are copied to a GPU
    pin_memory=(device.type == 'cuda'),
)

all_predictions = []
with torch.no_grad():
    for encodings in loader:
        # Move all input tensors to the same device as the model
        encodings = {k: v.to(device) for k, v in encodings.items()}
        outputs = model(**encodings)
        # Predicted class = index of the highest logit for each example
        preds = torch.argmax(outputs.logits, dim=1).cpu().tolist()
        all_predictions.extend(preds)

# The loader does not shuffle, so predictions line up with the rows of df_clean
assert len(all_predictions) == len(df_clean), "one prediction per row expected"
# assign() builds a new frame instead of mutating df_clean in place, so re-running is safe
df_clean = df_clean.assign(predicted_sentiment=all_predictions)
print("load2")

load


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtre['label'] = df_filtre['overall_rating'].apply(lambda x: 0 if x > 3 else 1)


load1
load2


In [None]:
# Flag every row whose predicted class equals the rating-derived label
df_clean["egal"] = df_clean["predicted_sentiment"] == df_clean["label"]

total_true = df_clean["egal"].sum()      # number of matching rows
total_false = (~df_clean["egal"]).sum()  # number of non-matching rows

# Print matches, mismatches and the share of matches, one value per line
print(total_true, total_false, total_true / (total_true + total_false), sep="\n")


160677
17069
0.9039697095855884


In [None]:
from sklearn.metrics import classification_report

# Classification report for the BERT model:
# y_true is the rating-derived label, y_pred the predicted class.
print(
    classification_report(
        y_true=df_clean["label"],
        y_pred=df_clean['predicted_sentiment'],
        digits=4,
    )
)

              precision    recall  f1-score   support

           0     0.9365    0.9456    0.9410    144056
           1     0.7573    0.7260    0.7413     33690

    accuracy                         0.9040    177746
   macro avg     0.8469    0.8358    0.8412    177746
weighted avg     0.9026    0.9040    0.9032    177746

