In [None]:

# Install the Hugging Face libraries used below (IPython shell escape; only
# works inside a notebook).
!pip install transformers datasets


# Mount Google Drive; the trained model is copied there at the end of the run.
from google.colab import drive
drive.mount('/content/drive')

# Disable Weights & Biases logging. The assignment below is active as written
# and runs before `transformers` is imported; comment it out to leave W&B at
# its default behaviour.
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable W&B logging


import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset


# The source CSV has no header row, so the column names are supplied explicitly.
url = "https://raw.githubusercontent.com/GuviMentor88/Training-Datasets/refs/heads/main/twitter_training.csv"
column_names = ["Tweet_Id", "Entity", "label", "text"]

df = pd.read_csv(url, names=column_names, header=None)

# Preprocess the data on an explicit copy so `df` keeps the raw rows.
df_cleaned = df.copy()
label_mapping = {"Positive": 0, "Neutral": 1, "Negative": 2}

# Fold "Irrelevant" into "Neutral", then encode labels as integers.
# Any label outside the mapping becomes NaN and is dropped below.
df_cleaned["label"] = (
    df_cleaned["label"].replace({"Irrelevant": "Neutral"}).map(label_mapping)
)

# Rows without tweet text or without a usable label cannot be trained on.
# If they were kept, a missing text would stay NaN through the cleaning steps
# and end up as the literal string "nan" once converted with str().
df_cleaned = df_cleaned.dropna(subset=["text", "label"]).copy()
df_cleaned["label"] = df_cleaned["label"].astype(int)

# Text cleaning, applied in order: URLs, @mentions, #hashtags, then every
# character that is neither an ASCII letter nor whitespace.
cleaned_text = df_cleaned["text"].astype(str)
for removal_pattern in (
    r'http\S+|www\S+|pic\.twitter\.com/\S+',
    r'@\S+',
    r'#\S+',
    r'[^a-zA-Z\s]',
):
    cleaned_text = cleaned_text.str.replace(removal_pattern, '', regex=True)

# Lowercase, collapse runs of whitespace to one space, trim both ends.
df_cleaned["text"] = (
    cleaned_text.str.lower().str.replace(r'\s+', ' ', regex=True).str.strip()
)

# Tweets that consisted only of links/mentions/symbols are now empty and
# carry no signal for the classifier.
df_cleaned = df_cleaned[df_cleaned["text"] != ""].reset_index(drop=True)

# Split the data (80/20, fixed seed for reproducibility).
# The text column was cast to str above, so both text lists contain plain
# Python strings as the tokenizer expects.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_cleaned["text"].tolist(),
    df_cleaned["label"].tolist(),
    test_size=0.2,
    random_state=42,
)

# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# The corpus is tokenized once, by `Dataset.map(tokenize_function, ...)`
# further down. A separate eager tokenizer pass over `train_texts` /
# `val_texts` would only duplicate that work and hold a second, fully padded
# copy of the data in memory, so none is done here.

# Convert to Hugging Face datasets format
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})

# Tokenize the datasets
def tokenize_function(examples):
    """Tokenize one batch of examples with the module-level ``tokenizer``.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(..., batched=True)``;
            only ``examples["text"]`` is read.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.
    """
    # No padding here: padding inside a batched map pads every example to the
    # longest text of its whole map chunk. The Trainer receives the tokenizer
    # and therefore pads each training batch dynamically when collating.
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Tokenize each split in batches, drop the raw text column, and rename
# "label" to "labels" (the column name passed on to the model by the Trainer).
train_dataset = (
    train_dataset.map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
val_dataset = (
    val_dataset.map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)

# Switch both splits to the "torch" output format (in place).
for _split in (train_dataset, val_dataset):
    _split.set_format("torch")

import inspect

# Load the model with a 3-way classification head (one output per entry in
# `label_mapping`).
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword is rejected by recent ones. Because the
# notebook installs an unpinned version, pick whichever name the installed
# `TrainingArguments` actually accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Define training arguments: evaluate, log and checkpoint once per epoch, and
# reload the checkpoint with the best validation accuracy when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    logging_dir="./logs",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    greater_is_better=True,
    **{_eval_strategy_key: "epoch"},
)

# Define metrics
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; it is unpacked into exactly
            two values.

    Returns:
        A dict with ``"accuracy"`` and the weighted-average ``"f1"`` of the
        arg-max predictions against the labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1": f1_score(references, predicted, average="weighted"),
    }

import inspect

# Train the model
# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older ones only know `tokenizer`. Use
# whichever the installed Trainer accepts. Passing the tokenizer also makes
# the Trainer pad each batch dynamically when collating.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

trainer.train()

# Save the model and tokenizer locally, side by side, so the directory can be
# reloaded with `from_pretrained`.
local_model_path = "./model"
model.save_pretrained(local_model_path)
tokenizer.save_pretrained(local_model_path)

# Save the model to Google Drive.
# copytree creates the destination if needed and raises if the copy fails, so
# the confirmation below is only printed after a successful copy (a shell
# `cp` escape would be ignored on failure and only works inside a notebook).
from shutil import copytree, make_archive

drive_model_path = "/content/drive/My Drive/distilbert_model"
copytree(local_model_path, drive_model_path, dirs_exist_ok=True)
print(f"Model saved to Google Drive at {drive_model_path}")


# Bundle the saved model directory as distilbert_model.zip and hand it to the
# browser for download (Colab only).
make_archive("distilbert_model", 'zip', local_model_path)

from google.colab import files
files.download("distilbert_model.zip")

print("Model and tokenizer saved successfully!")