In [1]:
import pandas as pd
import re
import emoji

# Loading CSV data in DataFrame
df = pd.read_csv('../../jigsaw-toxic-comment-classification-challenge/train.csv/train.csv', encoding='UTF-8', on_bad_lines='warn')

# Show the first 5 rows of the DataFrame to check
df.head(5)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [2]:
# V√©rifier les valeurs manquantes et doublons
print(df.isnull().sum())

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64


In [3]:
print("Nombre de doublons :", df.duplicated(subset=['comment_text']).sum())

Nombre de doublons : 0


In [4]:
label_cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
print(df[label_cols].sum().sort_values(ascending=False))

toxic            15294
obscene           8449
insult            7877
severe_toxic      1595
identity_hate     1405
threat             478
dtype: int64


In [5]:
# Fonction de nettoyage avanc√©e
def clean_text_for_transformer(text):
    if not isinstance(text, str):
        return ""
    
    # Normaliser les apostrophes et guillemets
    text = text.replace("‚Äô", "'").replace("‚Äú", '"').replace("‚Äù", '"').replace("`", "'")
    
    # Extraire les mots contenus dans les URLs
    #    Exemple: "https://grostroudu***.com" -> "grostroudu*** com"
    def url_to_words(match):
        # match.group(1) = domaine de l'URL
        return match.group(1).replace('.', ' ')
    
    # Remplacer les URLs par les mots qu'elles contiennent
    text = re.sub(r"http[s]?://([A-Za-z0-9.-]+)", url_to_words, text)
        
    # Transformer les √©mojis en texte
    text = emoji.demojize(text)  # üò°-> "angry_face"
    
    # Supprimer ponctuation inutile (garder lettres, chiffres, espaces et apostrophes)
    text = re.sub(r"[^a-zA-Z0-9\s':]", " ", text)
    
    # Remplacer les multiples espaces par un espace unique
    text = re.sub(r"\s+", " ", text).strip()
    
    # Tout passer en minuscules
    text = text.lower()
    
    return text

In [None]:

#Appliquer le nettoyage 
df['clean_text'] = df['comment_text'].apply(clean_text_for_transformer)
print(df[['comment_text','clean_text']].head(10))

                                        comment_text  \
0  Explanation\nWhy the edits made under my usern...   
1  D'aww! He matches this background colour I'm s...   
2  Hey man, I'm really not trying to edit war. It...   
3  "\nMore\nI can't make any real suggestions on ...   
4  You, sir, are my hero. Any chance you remember...   
5  "\n\nCongratulations from me as well, use the ...   
6       COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK   
7  Your vandalism to the Matt Shirvington article...   
8  Sorry if the word 'nonsense' was offensive to ...   
9  alignment on this subject and which are contra...   

                                          clean_text  
0  explanation why the edits made under my userna...  
1  d'aww he matches this background colour i'm se...  
2  hey man i'm really not trying to edit war it's...  
3  more i can't make any real suggestions on impr...  
4  you sir are my hero any chance you remember wh...  
5  congratulations from me as well use the tools ... 

In [8]:
df.head(5)
from datasets import Dataset

# On prend seulement le texte nettoy√© et les labels
train_dataset = Dataset.from_pandas(df[['clean_text'] + label_cols])

In [25]:
import torch
# Cr√©er la colonne 'labels' pour le multi-label
# Convertir les colonnes de labels en tensor float
def add_labels(example):
    example['labels'] = torch.tensor([example[c] for c in label_cols], dtype=torch.float)
    return example

train_dataset = train_dataset.map(add_labels)

# Tokenization
def tokenize_fn(batch):
    return tokenizer(batch['clean_text'], truncation=True, padding='max_length', max_length=100)

train_dataset = train_dataset.map(tokenize_fn, batched=True)

# On met au format PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/159571 [00:00<?, ? examples/s]

Map:   0%|          | 0/159571 [00:00<?, ? examples/s]

In [26]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
    problem_type="multi_label_classification"
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    learning_rate=2e-5,
    logging_steps=100,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset
)

trainer.train()

Step,Training Loss
100,0.0259
200,0.0345
300,0.0259
400,0.0287
500,0.0341
600,0.0313
700,0.0352
800,0.0276
900,0.0292
1000,0.0316


KeyboardInterrupt: 