In [1]:
# !pip install transformers datasets scikit-learn dill

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd
import re

### Chargement & nettoyage du texte + concaténation avec keyword

In [2]:
# Chargement
data = pd.read_csv(r'C:\Users\HP\Desktop\ISEP2\Semestre2\Machine Learning\Projet\Codes\Notebooks\tweets.csv')  


In [3]:
data

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0
...,...,...,...,...,...
11365,11365,wrecked,Blue State in a red sea,Media should have warned us well in advance. T...,0
11366,11366,wrecked,arohaonces,i feel directly attacked 💀 i consider moonbin ...,0
11367,11367,wrecked,🇵🇭,i feel directly attacked 💀 i consider moonbin ...,0
11368,11368,wrecked,auroraborealis,"ok who remember ""outcast"" nd the ""dora"" au?? T...",0


In [4]:

# Nettoyage
def clean_tweet(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"@\w+", '', text)
    text = re.sub(r"#", '', text)
    text = re.sub(r"[^\w\s]", '', text)
    text = re.sub(r"\d+", '', text)
    text = re.sub(r"\s+", ' ', text).strip()
    return text

data["clean_text"] = data["text"].apply(clean_tweet)
data["keyword"] = data["keyword"].fillna("unknown").str.lower()

# Fusion keyword + texte avec un séparateur [SEP]
data["bert_input"] = data["keyword"] + " " + data["clean_text"]


In [5]:
data

Unnamed: 0,id,keyword,location,text,target,clean_text,bert_input
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1,communal violence in bhainsa telangana stones ...,ablaze communal violence in bhainsa telangana ...
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1,telangana section has been imposed in bhainsa ...,ablaze telangana section has been imposed in b...
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1,arsonist sets cars ablaze at dealership,ablaze arsonist sets cars ablaze at dealership
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1,arsonist sets cars ablaze at dealership,ablaze arsonist sets cars ablaze at dealership
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0,lord jesus your love brings freedom and pardon...,ablaze lord jesus your love brings freedom and...
...,...,...,...,...,...,...,...
11365,11365,wrecked,Blue State in a red sea,Media should have warned us well in advance. T...,0,media should have warned us well in advance th...,wrecked media should have warned us well in ad...
11366,11366,wrecked,arohaonces,i feel directly attacked 💀 i consider moonbin ...,0,i feel directly attacked i consider moonbin am...,wrecked i feel directly attacked i consider mo...
11367,11367,wrecked,🇵🇭,i feel directly attacked 💀 i consider moonbin ...,0,i feel directly attacked i consider moonbin am...,wrecked i feel directly attacked i consider mo...
11368,11368,wrecked,auroraborealis,"ok who remember ""outcast"" nd the ""dora"" au?? T...",0,ok who remember outcast nd the dora au those a...,wrecked ok who remember outcast nd the dora au...


### Tokenisation et préparation du dataset

In [10]:
#pip install ipywidgets

In [6]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    data["clean_text"].tolist(), 
    data["target"].tolist(), 
    test_size=0.2, 
    stratify=data["target"],
    random_state=42
)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=64, return_tensors="pt")

train_encodings = tokenize(train_texts)
val_encodings = tokenize(val_texts)


### Création du Dataset PyTorch

In [8]:
class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)


 ### Initialisation du modèle BERT

In [11]:
#!pip install huggingface_hub[hf_xet]

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Entraînement du modèle

In [12]:
optimizer = AdamW(model.parameters(), lr=2e-5)

model.train()
for epoch in range(3):  # augmente si besoin
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f"✅ Epoch {epoch + 1} — Loss moyenne : {total_loss / len(train_loader):.4f}")


✅ Epoch 1 — Loss moyenne : 0.2766
✅ Epoch 2 — Loss moyenne : 0.1549
✅ Epoch 3 — Loss moyenne : 0.0681


### Évaluation : AUC + toutes les métriques

In [13]:
model.eval()
all_preds = []
all_probs = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        probs = torch.softmax(logits, dim=1)[:, 1]
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_probs.extend(probs.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

acc = accuracy_score(all_labels, all_preds)
prec = precision_score(all_labels, all_preds)
rec = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)
auc = roc_auc_score(all_labels, all_probs)

print("\n📈 Résultats :")
print(f"Accuracy  : {acc:.4f}")
print(f"Precision : {prec:.4f}")
print(f"Recall    : {rec:.4f}")
print(f"F1-score  : {f1:.4f}")
print(f"AUC       : {auc:.4f}")



📈 Résultats :
Accuracy  : 0.9063
Precision : 0.7134
Recall    : 0.8298
F1-score  : 0.7672
AUC       : 0.9431


### Sauvegarde du modèle avec dill

In [17]:
import dill
with open("bert_model.dill", "wb") as f:
    dill.dump(model, f)

In [18]:
from pathlib import Path
import dill

# Définir le répertoire et le nom du fichier
MODEL_DIR = r'C:\Users\HP\Desktop\ISEP2\Semestre2\Machine Learning\Projet\Models'
model_path_name = Path(MODEL_DIR, "bert_full_model_optimized.dill")

# Vérifier que le dossier existe, sinon le créer
Path(MODEL_DIR).mkdir(parents=True, exist_ok=True)

# Créer un objet qui contient les deux (modèle + tokenizer)
bert_complete_model = {
    "model": model,
    "tokenizer": tokenizer
}

# Sauvegarde dans un seul fichier
print(f" Sauvegarde du modèle complet (BERT + Tokenizer) dans : {model_path_name}")
with open(model_path_name, "wb") as fp:
    dill.dump(bert_complete_model, fp)

print(f" Modèle complet sauvegardé avec succès !")


 Sauvegarde du modèle complet (BERT + Tokenizer) dans : C:\Users\HP\Desktop\ISEP2\Semestre2\Machine Learning\Projet\Models\bert_full_model_optimized.dill
 Modèle complet sauvegardé avec succès !
