In [1]:
import pandas as pd
from google.colab import drive  # only used by the (currently commented-out) Drive mount below
# drive.mount('/content/drive')
# Read the labelled CSV from an absolute path under /content into a DataFrame.
# NOTE(review): a later cell re-reads the same file name via a relative path and
# rebinds `df`, so this frame is not used by the training code in this notebook.
df = pd.read_csv('/content/labeled_data.csv')

In [2]:
# Use the %pip magic (not `!pip`) so the package is installed into the
# environment of the running kernel rather than whatever `pip` the subshell finds.
%pip install --upgrade transformers



In [3]:
# Explicit %pip magic instead of relying on IPython automagic for a bare `pip`.
%pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [7]:
# Step 1: Install dependencies (uncomment if needed)
# !pip install transformers datasets scikit-learn

# Step 2: Imports
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 3: Load the CSV, keep only the tweet text and its integer class,
# drop incomplete rows, and attach a string label column for T5.
label_map = {0: "hate", 1: "offensive", 2: "neither"}
df = (
    pd.read_csv("labeled_data.csv")[['tweet', 'class']]
    .dropna()
    .assign(label=lambda frame: frame['class'].map(label_map))
    .rename(columns={"tweet": "text"})
)

# Step 4: Hold out 10% of the rows for validation, stratified on the string
# label, with a fixed seed so the split is repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df['label'],
)

# Step 5: Wrap both splits as Hugging Face Datasets (indices reset so the old
# pandas index is not carried along as an extra column).
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in (('train', train_df), ('test', val_df))
})

# Step 6: Tokenization
# Load the pretrained "t5-small" tokenizer. `preprocess` and `compute_metrics`
# below, as well as the later inference cells, read it as a global.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess(example):
    """Turn one {"text", "label"} record into model-ready features.

    The text is prefixed with "classify: " and tokenized to a fixed length of
    128; the label string is tokenized to a fixed length of 5. Padding
    positions in the label ids are rewritten to -100 so the loss skips them.

    Args:
        example: mapping with a "text" string and a "label" string.

    Returns:
        The tokenizer output for the prompt, with an added "labels" entry
        holding the masked target ids.
    """
    prompt = "classify: " + example["text"]
    features = tokenizer(
        prompt, max_length=128, padding="max_length", truncation=True
    )

    target_ids = tokenizer(
        example["label"], max_length=5, padding="max_length", truncation=True
    )["input_ids"]

    # -100 marks positions that must not contribute to the loss.
    masked_targets = []
    for token_id in target_ids:
        if token_id == tokenizer.pad_token_id:
            masked_targets.append(-100)
        else:
            masked_targets.append(token_id)

    features["labels"] = masked_targets
    return features

# Run `preprocess` over every example (one record per call) in both splits,
# then drop the raw string columns that the model cannot consume.
tokenized_datasets = (
    dataset
    .map(preprocess, batched=False)
    .remove_columns(['text', 'label'])
)

# Have the datasets hand back torch tensors when indexed.
tokenized_datasets.set_format(type="torch")

# Step 7: Load the pretrained T5 model that will be fine-tuned
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Step 8: Training Arguments
# No evaluation or checkpoint strategy is set here, so the library defaults apply.
training_args = TrainingArguments(
    # Output locations
    output_dir="./t5_results",
    logging_dir="./logs",
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=3e-4,
    weight_decay=0.01,
    # Per-device batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
)

# Step 9: Compute Metrics
def compute_metrics(eval_pred):
    """Exact-match accuracy between decoded predictions and decoded labels.

    Accepts either token-id predictions (batch, seq) or raw model outputs as a
    plain `Trainer` delivers them: logits of shape (batch, seq, vocab),
    optionally wrapped in a tuple together with other model outputs. Logits
    are reduced with argmax; because those are teacher-forced predictions,
    positions where the label is -100 are blanked out so that tokens predicted
    past the end of the target do not pollute the decoded string.

    Args:
        eval_pred: a (predictions, labels) pair.

    Returns:
        {"accuracy": float}
    """
    import numpy as np  # local import so this cell does not depend on a new top-level import

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # First element is taken to be the logits; extra model outputs are ignored.
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    pad_id = tokenizer.pad_token_id

    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)
        predictions = np.where(labels == -100, pad_id, predictions)

    # -100 is a loss-masking sentinel, not a vocabulary id; the tokenizer
    # cannot decode it, so swap it for the pad id (dropped as a special token).
    predictions = np.where(predictions < 0, pad_id, predictions)
    labels = np.where(labels < 0, pad_id, labels)

    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]
    acc = accuracy_score(decoded_labels, decoded_preds)
    return {"accuracy": acc}

# Step 10: Trainer Setup
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in recent transformers releases, and this notebook installs the
# latest release unpinned, so the old spelling is at risk of breaking.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Step 11: Train the Model
trainer.train()

Map:   0%|          | 0/22304 [00:00<?, ? examples/s]

Map:   0%|          | 0/2479 [00:00<?, ? examples/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Classify a single hand-written example with the current model.
input_text = "classify: You're a nasty loser, nobody likes you."
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# The Trainer may have moved the model to an accelerator; the inputs must live
# on the same device or generation fails with a device-mismatch error.
inputs = inputs.to(model.device)

# Switch off dropout so the prediction is deterministic.
model.eval()

# Generate prediction (targets were tokenized to at most 5 tokens)
outputs = model.generate(**inputs, max_new_tokens=5)
prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Predicted class: {prediction}")

In [None]:
# Get predictions
import numpy as np  # kept local to this cell so it runs without editing the import cell

outputs = trainer.predict(tokenized_datasets["test"])

# A plain `Trainer` returns raw model outputs rather than token ids: logits of
# shape (batch, seq, vocab), possibly wrapped in a tuple with other outputs.
raw_predictions = outputs.predictions
if isinstance(raw_predictions, tuple):
    raw_predictions = raw_predictions[0]
pred_ids = np.asarray(raw_predictions)
label_ids = np.asarray(outputs.label_ids)
pad_id = tokenizer.pad_token_id

if pred_ids.ndim == 3:
    # Teacher-forced argmax; blank out positions the labels mask with -100 so
    # tokens predicted past the end of the target are not decoded.
    pred_ids = pred_ids.argmax(axis=-1)
    pred_ids = np.where(label_ids == -100, pad_id, pred_ids)

# -100 is a loss-masking sentinel, not a vocabulary id; replace it before decoding.
pred_ids = np.where(pred_ids < 0, pad_id, pred_ids)
label_ids = np.where(label_ids < 0, pad_id, label_ids)

decoded_preds = [
    text.strip() for text in tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
]
decoded_labels = [
    text.strip() for text in tokenizer.batch_decode(label_ids, skip_special_tokens=True)
]

# Accuracy
print("Accuracy:", accuracy_score(decoded_labels, decoded_preds))

# Classification report
# `labels` pins the row order. Without it sklearn sorts the label strings
# alphabetically, which would not line up with `target_names`, and any
# unexpected decoded string would change the number of classes.
class_names = ["hate", "offensive", "neither"]
print("\nClassification Report:")
print(
    classification_report(
        decoded_labels,
        decoded_preds,
        labels=class_names,
        target_names=class_names,
        zero_division=0,
    )
)