In [None]:
# Colab only: mount Google Drive at /content/drive so the CSV files under
# MyDrive can be read by the data-loading cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install datasets


In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from huggingface_hub import HfApi
from transformers import AutoTokenizer,AutoModelForSequenceClassification,Trainer,TrainingArguments


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


import os
import torch
# Switch off the Weights & Biases integration so training does not try to
# log runs to wandb.
os.environ["WANDB_DISABLED"] = "true"


In [None]:
# Train on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

# **Loading & Understanding The Dataset**

In [None]:
# Both splits are stored as CSV files in the mounted Drive folder.
train_csv_path = '/content/drive/MyDrive/Sentiment140/train_data.csv'
test_csv_path = '/content/drive/MyDrive/Sentiment140/test_data.csv'

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# Keep the run affordable: take ROWS_PER_END rows from the top of the file and
# the same number from the bottom, then stack the two slices.
# NOTE(review): presumably the CSV is ordered by sentiment, so the two ends
# contribute the two classes in equal numbers - confirm with the count plot.
ROWS_PER_END = 50_000

if len(train) > 2 * ROWS_PER_END:
    head_rows = train.iloc[:ROWS_PER_END]
    tail_rows = train.iloc[-ROWS_PER_END:]
    train = pd.concat([head_rows, tail_rows])
# Otherwise the two slices would overlap and duplicate rows, so the frame is
# kept whole. This also makes the cell safe to re-run.


In [None]:
# Preview the first rows to see the column names and a few sample values.
train.head()

In [None]:
# Report (rows, columns) for the subsampled training frame and the test frame.
print(f"shape of training data {train.shape}")
print(f"shape of testing data {test.shape}")

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info() # author's observation: no missing values - re-check when the data changes

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info() # author's observation: no missing values - re-check when the data changes

In [None]:
# Class-balance check: one bar per sentiment value in the training frame.
# The bars should be of similar height if the classes are balanced.
fig, ax = plt.subplots()
sns.countplot(x="sentiment", data=train, ax=ax)
ax.set(
    title="Sentiment class counts (training data)",
    xlabel="sentiment",
    ylabel="count",
)
plt.show()

In [None]:
import re

def clean_text(text):
    """Normalize one tweet before tokenization.

    For string input the steps run in this order: drop URLs, drop @mentions,
    drop #hashtags (the tag text is removed too), strip every character that
    is neither a word character nor whitespace, collapse whitespace runs and
    trim, then lower-case.

    Args:
        text: The raw cell value. Usually a ``str``; may be ``None``/NaN or
            another scalar when the CSV cell was empty or parsed as a number.

    Returns:
        str: The cleaned text. Missing values (``None`` or float NaN) give
        ``""`` rather than the literal text ``"None"``/``"nan"``; any other
        non-string value is returned as ``str(text)`` without cleaning.
    """
    # Missing cells show up as None or float NaN; NaN is the only float that
    # is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        # e.g. a tweet that was parsed as a number
        return str(text)

    text = re.sub(r'http\S+', '', text)            # remove URLs
    text = re.sub(r'@\w+', '', text)               # remove mentions
    text = re.sub(r'#\w+', '', text)               # remove hashtags
    text = re.sub(r'[^\w\s]', '', text)            # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()       # remove extra spaces
    return text.lower()

# Clean the tweet text of both splits with the same function so that training
# and test inputs are normalized identically.
for frame in (train, test):
    frame['sentence'] = frame['sentence'].apply(clean_text)

In [None]:
# Hold out 30% of the training frame for evaluation. The split is shuffled,
# seeded, and stratified on the sentiment column so both parts keep the same
# class proportions.
split_frames = train_test_split(
    train,
    test_size=0.3,
    shuffle=True,
    stratify=train["sentiment"],
    random_state=2001,
)

# Give each part a fresh 0..n-1 index and discard the pre-split one.
traindf, evaldf = (frame.reset_index(drop=True) for frame in split_frames)

# Show the resulting sizes
traindf.shape, evaldf.shape


# **Fine Tuning**

## **Model**

In [None]:
# Checkpoint to fine-tune. The commented-out ids are alternative checkpoints
# kept for quick switching; only one assignment should be active.
#model_id="huawei-noah/TinyBERT_General_4L_312D"
#model_id = "bert-base-uncased"
model_id = "vinai/bertweet-base"


In [None]:
# Load the pretrained encoder with a 2-way classification head and place it on
# the selected device. The trailing expression displays the architecture.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=2,
).to(device)
model

## **Tokenizer**

In [None]:
# Load the tokenizer published with the same checkpoint as the model, then
# display its configuration.
tokenizer=AutoTokenizer.from_pretrained(model_id)
tokenizer

## **Evaluation**

In [None]:
# `compute_metrics` (defined below) receives the evaluation output as a pair
# that unpacks into (logits, labels):
#   - logits: raw, unnormalized class scores, one row per evaluated sample and
#     one column per label, e.g. shape (N, 2) for this two-class task.
#   - labels: the true class id of each sample, shape (N,).
#
# Taking argmax over the last axis picks the highest-scoring class per row, so
# the predictions have the same shape as the labels and can be compared
# element-wise.
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``. ``logits``
            holds one row of class scores per sample; ``labels`` holds the
            true class ids.

    Returns:
        dict: The keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"`` mapped to their scores. Precision, recall and F1 are
        computed with ``average="weighted"``.
    """
    logits, labels = eval_pred
    # Highest-scoring class per sample.
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 reports 0 for a class that is never predicted (the same
    # value scikit-learn falls back to) without emitting a warning on every
    # evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


## **Trainer**

In [None]:
# Hyper-parameters for the fine-tuning run.
# - The effective train batch per device is 32 * 2 = 64 samples (batch size
#   times gradient accumulation steps).
# - Evaluation and checkpointing both run once per epoch, so every saved
#   checkpoint has matching metrics for load_best_model_at_end to compare.
# - metric_for_best_model is not set, so the library default decides which
#   checkpoint counts as "best" (the evaluation loss, per the
#   TrainingArguments documentation).
training_args = TrainingArguments(
    output_dir="./model_results",
    eval_strategy="epoch",
    num_train_epochs=15,  # upper bound; the best checkpoint is restored at the end
    learning_rate=1e-5,  # Fine-tuned LR
    lr_scheduler_type="linear",  # Prevents drastic changes
    adam_beta2=0.98,  # Helps stabilize training
    logging_steps=50,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_ratio=0.2,
    save_total_limit=2,
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_accumulation_steps=2,
    # fp16 mixed precision needs a CUDA device; a hard-coded True raises on a
    # CPU-only runtime even though the notebook otherwise falls back to CPU.
    fp16=torch.cuda.is_available(),
)


# **Train The Model**

## **Tokenize Dataset**

In [None]:
from datasets import Dataset
# The model takes its targets under the name "labels", so rename the sentiment
# column on both frames before converting them.
# NOTE(review): num_labels=2 assumes the label values are 0/1 - confirm
# against the CSV.
label_rename = {"sentiment": "labels"}
traindf = traindf.rename(columns=label_rename)
evaldf = evaldf.rename(columns=label_rename)

# Wrap each pandas frame as a Hugging Face Dataset.
trainds, evalds = (Dataset.from_pandas(frame) for frame in (traindf, evaldf))

trainds

In [None]:
def tokenization(batch):
    """Tokenize the ``"sentence"`` entry of a batch with the notebook tokenizer.

    Sequences are truncated and padded to ``max_length=128``, so every
    encoded row has the same length.

    Args:
        batch: Mapping with a ``"sentence"`` entry holding the texts to
            encode (a list of strings under ``Dataset.map(batched=True)``).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    sentences = batch["sentence"]
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(sentences, **encode_options)

In [None]:
# Tokenize both splits in batches, then display the encoded evaluation split.
encoded_train, encoded_eval = (
    split.map(tokenization, batched=True) for split in (trainds, evalds)
)
encoded_eval

## **Train**

In [None]:
# Build the Trainer used by the training and evaluation cells.
# Trainer and TrainingArguments are already imported at the top of the
# notebook; this repeat is a harmless duplicate.
from transformers import Trainer, TrainingArguments

# Wire together the model, the hyper-parameters, the metric function and the
# tokenized train / validation splits.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=encoded_train,
    eval_dataset=encoded_eval,
)

In [None]:
# Run fine-tuning with the configuration in training_args, which evaluates and
# saves a checkpoint once per epoch.
trainer.train()

In [None]:
# Score the model held by the trainer on the validation split (encoded_eval).
# training_args sets load_best_model_at_end=True, so after training this
# should be the best checkpoint rather than the last one.
eval_results = trainer.evaluate()
print("Validation Results:", eval_results)