## Lecture CSV

In [99]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback

In [None]:
data_train = pd.read_csv("../datas/train_clean.csv")

data_train.head()

Unnamed: 0,keyword,location,text,labels,text_keyword_location
0,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...
1,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
4,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...


---

## cardiffnlp/twitter-roberta-base

In [83]:
# Charger le tokenizer
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base")

# Charger le modèle avec une tête de classification binaire
model = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base",
    num_labels=2
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [84]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding=True)

Preprocess text

In [85]:
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [86]:
data_train["text"] = data_train["text"].apply(preprocess)

In [87]:
data_train.head()

Unnamed: 0,keyword,location,text,labels,text_keyword_location
0,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...
1,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
4,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...


Split train / val

In [88]:
from datasets import Dataset

# Conversion pandas -> Hugging Face Dataset
hf_dataset_train = Dataset.from_pandas(data_train)

# Split 80% train, 20% test (ou val)
split_dataset = hf_dataset_train.train_test_split(test_size=0.2)

# Accès aux sous-datasets
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

# Appliquer la fonction de tokenization avec batching
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

# Après tokenization, préciser les colonnes à utiliser et convertir en PyTorch tensors
tokenized_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [90]:
# Auto Padding : toutes les séquences sont de la même longueur
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [91]:
# On vérifie la taille des séquences
samples = tokenized_train_dataset[:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]

[70, 70, 70, 70, 70, 70, 70, 70]

---

## Fine Tuning

In [93]:
# paramètres d'entrainement

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
    report_to="none"
)

In [95]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

In [97]:
labels = data_train["labels"].values
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float)

In [98]:
class FocalLoss(torch.nn.Module):
    def __init__(self, alpha=None, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, labels):
        ce_loss = torch.nn.functional.cross_entropy(
            logits, labels, weight=self.alpha, reduction="none"
        )
        pt = torch.exp(-ce_loss)  # probabilité correcte
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss
        return focal_loss.mean()

class FocalTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = FocalLoss(alpha=class_weights.to(model.device), gamma=2)
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [100]:
trainer = FocalTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

---

## Train

In [101]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.106409,0.799737,0.778182,0.73895,0.821813
2,0.124000,0.106135,0.83388,0.793132,0.847902,0.745008
3,0.088900,0.136046,0.812869,0.785875,0.769118,0.803379
4,0.060500,0.198625,0.80696,0.763285,0.80203,0.728111


TrainOutput(global_step=1524, training_loss=0.09054075826810101, metrics={'train_runtime': 538.8068, 'train_samples_per_second': 90.422, 'train_steps_per_second': 5.657, 'total_flos': 874137148110720.0, 'train_loss': 0.09054075826810101, 'epoch': 4.0})

---

## Predictions

In [102]:
predictions_val = trainer.predict(tokenized_val_dataset)

In [103]:
preds_val = np.argmax(predictions_val.predictions, axis=-1)

---

## Classification report

In [104]:
print(classification_report(tokenized_val_dataset["labels"], preds_val))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86       872
           1       0.85      0.75      0.79       651

    accuracy                           0.83      1523
   macro avg       0.84      0.82      0.83      1523
weighted avg       0.84      0.83      0.83      1523



---

## Confusion Matrix

In [105]:
# produce the confusion matrix for your predictions, what comments can you make ?
mat = confusion_matrix(tokenized_val_dataset["labels"], preds_val)

labels = data_train["labels"].unique()
df_mat = pd.DataFrame(mat, index=labels, columns=labels)

fig = px.imshow(df_mat, text_auto=True, color_continuous_scale=px.colors.sequential.Aggrnyl,
    labels=dict(x="Prédiction", y="Réalité", color="Nombre"))
fig.update_coloraxes(showscale=False)
fig.show()

---

## Submission


In [None]:
data_test = pd.read_csv("../datas/test.csv")

data_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [107]:
# Keyword
data_test["keyword"] = data_test["keyword"].fillna("")

# Location
data_test["location"] = data_test["location"].str.split(",", n=1).str[0]
data_test["location"] = data_test["location"].fillna("")

# Concaténation
data_test["text_keyword_location"] = data_test["text"] + " " + data_test["keyword"] + " " + data_test["location"]

# Preprocess
data_test["text"] = data_test["text"].apply(preprocess)

In [108]:
data_test.head()

Unnamed: 0,id,keyword,location,text,text_keyword_location
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...","Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan


In [109]:
# Conversion pandas -> Hugging Face Dataset
hf_dataset_test = Dataset.from_pandas(data_test)

# Appliquer la fonction de tokenization avec batching
tokenized_test_dataset = hf_dataset_test.map(tokenize_function, batched=True)

# Après tokenization, préciser les colonnes à utiliser et convertir en PyTorch tensors
tokenized_test_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [110]:
predictions_test = trainer.predict(tokenized_test_dataset)

In [111]:
preds_test = np.argmax(predictions_test.predictions, axis=-1)

In [None]:
submission = pd.DataFrame({
    "id": data_test["id"],
    "target": preds_test
})

# Sauvegarde en CSV
submission.to_csv("../datas/submission_02.csv", index=False)