In [1]:
# Step 1: Fine-Tune XLM-RoBERTa and Test All Models on YouTube Comments

import torch
import pandas as pd
from transformers import (XLMRobertaTokenizer, XLMRobertaForSequenceClassification, 
                          BertForSequenceClassification, BertTokenizer,
                          RobertaForSequenceClassification, RobertaTokenizer,
                          DistilBertForSequenceClassification, DistilBertTokenizer,
                          Trainer, TrainingArguments)
from datasets import Dataset
from sklearn.model_selection import train_test_split

# --- Part 1: Fine-Tune XLM-RoBERTa ---

# Read the Jigsaw training CSV, keep one row per distinct comment, and retain
# only the text and the binary `toxic` column with no missing values.
data_path = "/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv"
raw_df = pd.read_csv(data_path)
df = raw_df.drop_duplicates(subset=["comment_text"])[["comment_text", "toxic"]].dropna()

# Undersample the non-toxic rows to the toxic count, then shuffle the union
toxic_df = df[df["toxic"] == 1]
non_toxic_pool = df[df["toxic"] == 0]
non_toxic_df = non_toxic_pool.sample(n=len(toxic_df), random_state=42)
balanced_df = pd.concat([toxic_df, non_toxic_df]).sample(frac=1, random_state=42)

# 70/15/15 split: carve off 30 %, then halve that remainder into val and test
train_df, temp_df = train_test_split(balanced_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Trainer reads the target from a column called 'labels'
train_df, val_df, test_df = (
    split.rename(columns={"toxic": "labels"}) for split in (train_df, val_df, test_df)
)

# Wrap each pandas split as a Hugging Face Dataset
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_pandas, (train_df, val_df, test_df)
)

# Pretrained XLM-RoBERTa tokenizer used for every split below
xlm_tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
def tokenize_dataset(dataset, tokenizer, max_length=64, text_column="comment_text"):
    """Tokenize one text column of ``dataset`` into fixed-length encodings.

    Args:
        dataset: Object exposing ``map(function, batched=True)``; the mapped
            function receives a batch that is indexed by column name.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding="max_length", truncation=True, max_length=...)``.
        max_length: Length every sequence is padded or truncated to.
        text_column: Name of the column holding the raw text. Defaults to
            ``"comment_text"`` so existing calls behave as before.

    Returns:
        Whatever ``dataset.map`` returns for the tokenizing function.
    """
    def tokenize_function(examples):
        # `examples` is one batch; the tokenizer receives the whole text column
        return tokenizer(
            examples[text_column],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    return dataset.map(tokenize_function, batched=True)

# Encode the three splits with the XLM-RoBERTa tokenizer
train_xlm, val_xlm, test_xlm = (
    tokenize_dataset(split, xlm_tokenizer)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Persist the encoded splits under /kaggle/working/preprocessed
for target_dir, encoded_split in (
    ("/kaggle/working/preprocessed/train_xlm", train_xlm),
    ("/kaggle/working/preprocessed/val_xlm", val_xlm),
    ("/kaggle/working/preprocessed/test_xlm", test_xlm),
):
    encoded_split.save_to_disk(target_dir)

# Binary classifier on top of the pretrained XLM-RoBERTa encoder
xlm_model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)

# Evaluate and checkpoint once per epoch; at the end reload the best
# checkpoint as ranked by eval_loss. No external experiment tracker is used.
training_config = dict(
    output_dir="/kaggle/working/models/xlm_finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",
)
training_args = TrainingArguments(**training_config)

trainer = Trainer(
    model=xlm_model,
    args=training_args,
    train_dataset=train_xlm,
    eval_dataset=val_xlm,
)
trainer.train()

# Write model and tokenizer to the same directory so later cells can reload both
trainer.save_model("/kaggle/working/models/xlm_finetuned")
xlm_tokenizer.save_pretrained("/kaggle/working/models/xlm_finetuned")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Map:   0%|          | 0/21411 [00:00<?, ? examples/s]

Map:   0%|          | 0/4588 [00:00<?, ? examples/s]

Map:   0%|          | 0/4589 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/21411 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4588 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4589 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.282,0.249782
2,0.2202,0.338507
3,0.1723,0.349726


('/kaggle/working/models/xlm_finetuned/tokenizer_config.json',
 '/kaggle/working/models/xlm_finetuned/special_tokens_map.json',
 '/kaggle/working/models/xlm_finetuned/sentencepiece.bpe.model',
 '/kaggle/working/models/xlm_finetuned/added_tokens.json')

In [None]:

# Try the fine-tuned checkpoint on multilingual examples:
# use the GPU when one is visible, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
xlm_model = XLMRobertaForSequenceClassification.from_pretrained("/kaggle/working/models/xlm_finetuned")
xlm_model = xlm_model.to(device)

def predict_toxicity(model, tokenizer, text, device):
    """Classify a single text and report its toxic-class probability.

    Args:
        model: Sequence classifier; called with the tokenizer output as
            keyword arguments and expected to return an object with ``.logits``.
        tokenizer: Callable returning a mapping of tensors for ``text``.
        text: The comment to score (one string).
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        ``(pred, toxic_prob)``: the argmax class index over the softmaxed
        logits, and the probability at class index 1 for the first row.
    """
    # Put the model in inference mode so train-only layers (e.g. dropout) do
    # not randomise the prediction when a freshly trained model is passed in.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=-1)
    pred = probs.argmax(-1).item()
    return pred, probs[0][1].item()

# Hand-labelled probes mixing languages and emoji: (text, expected label)
examples = [
    {"text": sample_text, "label": sample_label}
    for sample_text, sample_label in (
        ("¡Eres un idiota! 😂", 1),  # Spanish
        ("Tu es génial! 👍", 0),     # French
        ("आप मूर्ख हैं! 😡", 1),     # Hindi
        ("Great work 😊", 0),        # English with emoji
    )
]

# Score each probe and print prediction, toxic probability and expected label
for text, true_label in ((item["text"], item["label"]) for item in examples):
    pred, prob = predict_toxicity(xlm_model, xlm_tokenizer, text, device)
    print(f"\nText: {text}")
    print(f"XLM-RoBERTa: Predicted={pred}, Toxic Prob={prob:.4f}, True={true_label}")

In [None]:
# Step 6: Test XLM-RoBERTa on Pre-Collected YouTube Comments

import torch
import pandas as pd
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer

# Restore the fine-tuned classifier and its tokenizer from the saved directory,
# placing the model on the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
xlm_model = XLMRobertaForSequenceClassification.from_pretrained("/kaggle/working/models/xlm_finetuned")
xlm_model = xlm_model.to(device)
xlm_tokenizer = XLMRobertaTokenizer.from_pretrained("/kaggle/working/models/xlm_finetuned")

# Prediction function
def predict_toxicity(model, tokenizer, text, device):
    """Score one comment with the classifier.

    Args:
        model: Sequence classifier; called with the tokenizer output as
            keyword arguments and expected to return an object with ``.logits``.
        tokenizer: Callable returning a mapping of tensors for ``text``.
        text: The comment to score (one string).
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        ``(pred, toxic_prob)``: the argmax class index over the softmaxed
        logits, and the probability at class index 1 for the first row.
    """
    # Inference mode: without this, a model left in train mode would apply
    # train-only layers such as dropout and give unstable predictions.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=-1)
    pred = probs.argmax(-1).item()
    toxic_prob = probs[0][1].item()
    return pred, toxic_prob

# Pre-collected YouTube comments, grouped by language mix and emoji use
english_plain = [
    "This video is terrible",
    "Really well made content",
    "You should quit making videos",
    "Thanks for the great upload",
]
english_with_emoji = [
    "Worst thing ever 😂👎",
    "Love this so much! 😍✨",
    "Go jump off a cliff ☠️",
    "Awesome work! 😊👍",
]
multilingual_plain = [
    "¡Qué video tan malo!",
    "Bon contenu, merci!",
    "आपका वीडियो शानदार है",
    "このビデオはひどい",
]
multilingual_with_emoji = [
    "¡Eres un genio! ⭐",
    "Tu es nul 😡👊",
    "बहुत बेकार! 😤💢",
    "素晴らしいね 😊👍",
]
youtube_comments = [
    *english_plain,
    *english_with_emoji,
    *multilingual_plain,
    *multilingual_with_emoji,
]

print(f"Testing {len(youtube_comments)} YouTube comments with XLM-RoBERTa")

# Score every comment; the true label is recorded as -1 because none of these
# comments has been annotated.
results = []
for comment in youtube_comments:
    xlm_pred, xlm_prob = predict_toxicity(xlm_model, xlm_tokenizer, comment, device)
    true_label = -1
    row = {
        "Text": comment,
        "True_Label": true_label,
        "XLM_RoBERTa_Pred": xlm_pred,
        "XLM_RoBERTa_Toxic_Prob": xlm_prob,
    }
    results.append(row)

# One row per comment, written to CSV and echoed in full
results_df = pd.DataFrame(results)
results_df.to_csv("/kaggle/working/youtube_comment_test_results_xlm_only.csv", index=False)
print("Results saved to /kaggle/working/youtube_comment_test_results_xlm_only.csv")
print(results_df)

In [3]:
import pandas as pd
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from datasets import Dataset

# Step 1: Load the scraped comments from Excel and export the text column as CSV
excel_file_path = "/kaggle/input/random-multi-youtube-comments-scrapped/Youtube comments scrapper_Testing File.xlsx"  # Replace with your actual file path
comments_df = pd.read_excel(excel_file_path)

# Fail fast when the expected column is missing: every later step indexes
# comments_df["comment"], so carrying on would only postpone the KeyError and
# hide the list of columns that are actually available.
if "comment" not in comments_df.columns:
    raise KeyError(
        f"Column 'comment' not found. Available columns: {list(comments_df.columns)}. "
        "Rename the text column first, e.g. "
        "comments_df = comments_df.rename(columns={'Comments': 'comment'})"
    )

# Save as CSV
csv_file_path = "/kaggle/working/comments.csv"
comments_df[["comment"]].to_csv(csv_file_path, index=False)
print(f"Converted Excel to CSV at {csv_file_path} with {len(comments_df)} comments.")

# Step 2: Convert to Hugging Face Dataset
comments_dataset = Dataset.from_pandas(comments_df[["comment"]])

# Step 3: Tokenize with your fine-tuned XLM-RoBERTa tokenizer
xlm_tokenizer = XLMRobertaTokenizer.from_pretrained("/kaggle/working/models/xlm_finetuned")  # Load from your fine-tuned path

def tokenize_dataset(dataset, tokenizer, max_length=64, text_column="comment"):
    """Tokenize one text column of ``dataset`` into fixed-length encodings.

    Args:
        dataset: Object exposing ``map(function, batched=True)``; the mapped
            function receives a batch that is indexed by column name.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding="max_length", truncation=True, max_length=...)``.
        max_length: Length every sequence is padded or truncated to.
        text_column: Name of the column holding the raw text. Defaults to
            ``"comment"`` so existing calls behave as before.

    Returns:
        Whatever ``dataset.map`` returns for the tokenizing function.
    """
    def tokenize_function(examples):
        # `examples` is one batch; the tokenizer receives the whole text column
        return tokenizer(
            examples[text_column],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    return dataset.map(tokenize_function, batched=True)

# Encoded copy of the comments (not referenced again in the visible part of this cell)
comments_xlm = tokenize_dataset(comments_dataset, xlm_tokenizer)

# Step 4: Choose the device, then load the fine-tuned classifier onto it
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
xlm_model = XLMRobertaForSequenceClassification.from_pretrained("/kaggle/working/models/xlm_finetuned")
xlm_model = xlm_model.to(device)

# Step 5: Predict toxicity
def predict_toxicity(model, tokenizer, dataset, device, text_column="comment",
                     max_length=64, batch_size=32):
    """Score every comment in ``dataset[text_column]`` with the classifier.

    Comments are sent through the model in mini-batches instead of one forward
    pass per comment. Pass ``batch_size=1`` to reproduce the original
    one-comment-at-a-time behaviour exactly.

    Args:
        model: Sequence classifier; called with the tokenizer output as
            keyword arguments and expected to return an object with ``.logits``.
        tokenizer: Callable accepting a list of strings and returning a
            mapping of tensors.
        dataset: Mapping-like object whose ``text_column`` entry yields the texts.
        device: Device the input tensors are moved to.
        text_column: Column holding the raw text (default ``"comment"``).
        max_length: Truncation length passed to the tokenizer (default 64).
        batch_size: Number of comments per forward pass (default 32).

    Returns:
        ``(predictions, probabilities)``: two lists in input order, holding the
        argmax class index and the class-1 probability for each comment.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model.eval()
    comments = list(dataset[text_column])
    predictions = []
    probabilities = []
    for start in range(0, len(comments), batch_size):
        batch = comments[start:start + batch_size]
        # padding=True so the sequences of one batch can be stacked into a tensor
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)
        predictions.extend(probs.argmax(dim=-1).tolist())
        probabilities.extend(probs[:, 1].tolist())
    return predictions, probabilities

# Run inference over every comment
inference_output = predict_toxicity(xlm_model, xlm_tokenizer, comments_dataset, device)
xlmr_preds, xlmr_probs = inference_output

# Step 6: Put the text next to its prediction and probability, then write to disk
results_csv_path = "/kaggle/working/xlmr_comments_predictions.csv"
result_columns = {
    "Comment": comments_df["comment"],
    "XLMR_Pred": xlmr_preds,
    "XLMR_Toxic_Prob": xlmr_probs,
}
results_df = pd.DataFrame(result_columns)
results_df.to_csv(results_csv_path, index=False)
print(f"Predictions saved to {results_csv_path}")
print(results_df.head(10))  # First 10 rows for a quick sanity check

Converted Excel to CSV at /kaggle/working/comments.csv with 563 comments.


Map:   0%|          | 0/563 [00:00<?, ? examples/s]

Predictions saved to /kaggle/working/xlmr_comments_predictions.csv
                                             Comment  XLMR_Pred  \
0         Learning to speak is always useful. Great!          0   
1  Vocal Exercises begin at 7:50 :   1. Raise arm...          1   
2  Imagine someone being late and walking in at 8:49          0   
3  4:01 "Tempered with love, honesty is a great t...          0   
4  "If you wish people with love its really hard ...          1   
5  i feel like youtube recommendations are person...          0   
6                      I like his head it is shining          1   
7  Very fantastic video.❤❤  I came across this vi...          0   
8  0:13 Intro 0:33 7 deadly sins of speaking 2:45...          0   
9  The trick is, whenever you're talking to peopl...          1   

   XLMR_Toxic_Prob  
0         0.016863  
1         0.915897  
2         0.020808  
3         0.016115  
4         0.520468  
5         0.018492  
6         0.668092  
7         0.015165  
8     

In [10]:
import pandas as pd
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from datasets import Dataset

# Step 1: Load CSV file (instead of Excel) from Kaggle environment
csv_file_path = "/kaggle/input/youtube-toxicity-data/youtoxic_english_1000.csv"  # Replace with your actual CSV file path
comments_df = pd.read_csv(csv_file_path)

# Fail fast when the expected column is missing: every later step indexes
# comments_df["Text"], so carrying on would only postpone the KeyError. The
# message names the column that is actually checked.
if "Text" not in comments_df.columns:
    raise KeyError(
        f"Column 'Text' not found. Available columns: {list(comments_df.columns)}. "
        "Rename the text column first, e.g. "
        "comments_df = comments_df.rename(columns={'Comments': 'Text'})"
    )
print(f"Loaded CSV from {csv_file_path} with {len(comments_df)} comments.")

# Step 2: Convert to Hugging Face Dataset
comments_dataset = Dataset.from_pandas(comments_df[["Text"]])

# Step 3: Tokenize with your fine-tuned XLM-RoBERTa tokenizer
xlm_tokenizer = XLMRobertaTokenizer.from_pretrained("/kaggle/working/models/xlm_finetuned")  # Load from your fine-tuned path

def tokenize_dataset(dataset, tokenizer, max_length=64, text_column="Text"):
    """Tokenize one text column of ``dataset`` into fixed-length encodings.

    Args:
        dataset: Object exposing ``map(function, batched=True)``; the mapped
            function receives a batch that is indexed by column name.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding="max_length", truncation=True, max_length=...)``.
        max_length: Length every sequence is padded or truncated to.
        text_column: Name of the column holding the raw text. Defaults to
            ``"Text"`` so existing calls behave as before.

    Returns:
        Whatever ``dataset.map`` returns for the tokenizing function.
    """
    def tokenize_function(examples):
        # `examples` is one batch; the tokenizer receives the whole text column
        return tokenizer(
            examples[text_column],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    return dataset.map(tokenize_function, batched=True)

# Encoded copy of the comments (not referenced again in the visible part of this cell)
comments_xlm = tokenize_dataset(comments_dataset, xlm_tokenizer)

# Step 4: Choose the device, then load the fine-tuned classifier onto it
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
xlm_model = XLMRobertaForSequenceClassification.from_pretrained("/kaggle/working/models/xlm_finetuned")
xlm_model = xlm_model.to(device)

# Step 5: Predict toxicity
def predict_toxicity(model, tokenizer, dataset, device, text_column="Text",
                     max_length=64, batch_size=32):
    """Score every comment in ``dataset[text_column]`` with the classifier.

    Comments are sent through the model in mini-batches instead of one forward
    pass per comment. Pass ``batch_size=1`` to reproduce the original
    one-comment-at-a-time behaviour exactly.

    Args:
        model: Sequence classifier; called with the tokenizer output as
            keyword arguments and expected to return an object with ``.logits``.
        tokenizer: Callable accepting a list of strings and returning a
            mapping of tensors.
        dataset: Mapping-like object whose ``text_column`` entry yields the texts.
        device: Device the input tensors are moved to.
        text_column: Column holding the raw text (default ``"Text"``).
        max_length: Truncation length passed to the tokenizer (default 64).
        batch_size: Number of comments per forward pass (default 32).

    Returns:
        ``(predictions, probabilities)``: two lists in input order, holding the
        argmax class index and the class-1 probability for each comment.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model.eval()
    comments = list(dataset[text_column])
    predictions = []
    probabilities = []
    for start in range(0, len(comments), batch_size):
        batch = comments[start:start + batch_size]
        # padding=True so the sequences of one batch can be stacked into a tensor
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)
        predictions.extend(probs.argmax(dim=-1).tolist())
        probabilities.extend(probs[:, 1].tolist())
    return predictions, probabilities

# Run inference over every comment
inference_output = predict_toxicity(xlm_model, xlm_tokenizer, comments_dataset, device)
xlmr_preds, xlmr_probs = inference_output

# Step 6: Put the text next to its prediction and probability, then write to disk
results_csv_path = "/kaggle/working/xlmr_YTD_comments_predictions.csv"
result_columns = {
    "Comment": comments_df["Text"],
    "XLMR_Pred": xlmr_preds,
    "XLMR_Toxic_Prob": xlmr_probs,
}
results_df = pd.DataFrame(result_columns)
results_df.to_csv(results_csv_path, index=False)
print(f"Predictions saved to {results_csv_path}")
print(results_df.head(10))  # First 10 rows for a quick sanity check


Loaded CSV from /kaggle/input/youtube-toxicity-data/youtoxic_english_1000.csv with 1000 comments.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Predictions saved to /kaggle/working/xlmr_YTD_comments_predictions.csv
                                             Comment  XLMR_Pred  \
0  If only people would just take a step back and...          0   
1  Law enforcement is not trained to shoot to app...          1   
2  \nDont you reckon them 'black lives matter' ba...          1   
3  There are a very large number of people who do...          0   
4  The Arab dude is absolutely right, he should h...          1   
5  here people his facebook is https://www.facebo...          1   
6  Check out this you tube post. "Black man goes ...          0   
7  I would LOVE to see this pussy go to Staten Is...          1   
8                        I agree with the protestor.          0   
9   mike browns father was made to say that boooshit          1   

   XLMR_Toxic_Prob  
0         0.147595  
1         0.997017  
2         0.989839  
3         0.057511  
4         0.951912  
5         0.989444  
6         0.183976  
7         0.997221  
8 

In [13]:
import pandas as pd
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from datasets import Dataset

# Step 1: Load CSV file (instead of Excel) from Kaggle environment
csv_file_path = "/kaggle/input/most-liked-comments-on-youtube/youtube_dataset.csv"  # Replace with your actual CSV file path
comments_df = pd.read_csv(csv_file_path)

# Fail fast when the expected column is missing: every later step indexes
# comments_df["Comment"], so carrying on would only postpone the KeyError. The
# message names the column that is actually checked.
if "Comment" not in comments_df.columns:
    raise KeyError(
        f"Column 'Comment' not found. Available columns: {list(comments_df.columns)}. "
        "Rename the text column first, e.g. "
        "comments_df = comments_df.rename(columns={'Comments': 'Comment'})"
    )
print(f"Loaded CSV from {csv_file_path} with {len(comments_df)} comments.")

# Step 2: Convert to Hugging Face Dataset
comments_dataset = Dataset.from_pandas(comments_df[["Comment"]])

# Step 3: Tokenize with your fine-tuned XLM-RoBERTa tokenizer
xlm_tokenizer = XLMRobertaTokenizer.from_pretrained("/kaggle/working/models/xlm_finetuned")  # Load from your fine-tuned path

def tokenize_dataset(dataset, tokenizer, max_length=64, text_column="Comment"):
    """Tokenize one text column of ``dataset`` into fixed-length encodings.

    Args:
        dataset: Object exposing ``map(function, batched=True)``; the mapped
            function receives a batch that is indexed by column name.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding="max_length", truncation=True, max_length=...)``.
        max_length: Length every sequence is padded or truncated to.
        text_column: Name of the column holding the raw text. Defaults to
            ``"Comment"`` so existing calls behave as before.

    Returns:
        Whatever ``dataset.map`` returns for the tokenizing function.
    """
    def tokenize_function(examples):
        # `examples` is one batch; the tokenizer receives the whole text column
        return tokenizer(
            examples[text_column],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    return dataset.map(tokenize_function, batched=True)

# Encoded copy of the comments (not referenced again in the visible part of this cell)
comments_xlm = tokenize_dataset(comments_dataset, xlm_tokenizer)

# Step 4: Choose the device, then load the fine-tuned classifier onto it
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
xlm_model = XLMRobertaForSequenceClassification.from_pretrained("/kaggle/working/models/xlm_finetuned")
xlm_model = xlm_model.to(device)

# Step 5: Predict toxicity
def predict_toxicity(model, tokenizer, dataset, device, text_column="Comment",
                     max_length=64, batch_size=32):
    """Score every comment in ``dataset[text_column]`` with the classifier.

    Comments are sent through the model in mini-batches instead of one forward
    pass per comment. Pass ``batch_size=1`` to reproduce the original
    one-comment-at-a-time behaviour exactly.

    Args:
        model: Sequence classifier; called with the tokenizer output as
            keyword arguments and expected to return an object with ``.logits``.
        tokenizer: Callable accepting a list of strings and returning a
            mapping of tensors.
        dataset: Mapping-like object whose ``text_column`` entry yields the texts.
        device: Device the input tensors are moved to.
        text_column: Column holding the raw text (default ``"Comment"``).
        max_length: Truncation length passed to the tokenizer (default 64).
        batch_size: Number of comments per forward pass (default 32).

    Returns:
        ``(predictions, probabilities)``: two lists in input order, holding the
        argmax class index and the class-1 probability for each comment.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model.eval()
    comments = list(dataset[text_column])
    predictions = []
    probabilities = []
    for start in range(0, len(comments), batch_size):
        batch = comments[start:start + batch_size]
        # padding=True so the sequences of one batch can be stacked into a tensor
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)
        predictions.extend(probs.argmax(dim=-1).tolist())
        probabilities.extend(probs[:, 1].tolist())
    return predictions, probabilities

# Run inference over every comment
inference_output = predict_toxicity(xlm_model, xlm_tokenizer, comments_dataset, device)
xlmr_preds, xlmr_probs = inference_output

# Step 6: Put the text next to its prediction and probability, then write to disk
results_csv_path = "/kaggle/working/xlmr_MLCOY_comments_predictions.csv"
result_columns = {
    "Comment": comments_df["Comment"],
    "XLMR_Pred": xlmr_preds,
    "XLMR_Toxic_Prob": xlmr_probs,
}
results_df = pd.DataFrame(result_columns)
results_df.to_csv(results_csv_path, index=False)
print(f"Predictions saved to {results_csv_path}")
print(results_df.head(10))  # First 10 rows for a quick sanity check


Loaded CSV from /kaggle/input/most-liked-comments-on-youtube/youtube_dataset.csv with 19300 comments.


Map:   0%|          | 0/19300 [00:00<?, ? examples/s]

Predictions saved to /kaggle/working/xlmr_MLCOY_comments_predictions.csv
                                             Comment  XLMR_Pred  \
0  The people who liked this comment is officiall...          0   
1           - Wait, it's 7B views\n- Always has been          0   
2  *Teacher: What is the population of the Earth?...          0   
3  Let's be honest this wasn't your recommendatio...          0   
4  Types Of People:\n10% Enjoying Song \n90% Chec...          0   
5  3.2 Million comments if you find mine your a l...          0   
6  claim your “here before 7 billion” tickets her...          0   
7  The ones who are NOT from Tik-Tok can like thi...          0   
8   Song: spanish\nComments: English\nHotel: trivago          0   
9                                    Kimler burda😂🥰🌹          0   

   XLMR_Toxic_Prob  
0         0.015564  
1         0.016622  
2         0.016170  
3         0.016580  
4         0.016175  
5         0.456017  
6         0.020753  
7         0.017485  
