In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# local modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Imports

In [3]:
import torch
import kagglehub
import pandas as pd
from data_loader import CustomDataLoader
from train import TrainingLoop
from eval import Evaluate
import torch.nn.functional as F

## Download Datasets

In [26]:
# Fetch the latest version of the Kaggle dataset used as the test set
# and report where it was stored.
test_dataset_handle = "mdismielhossenabir/sentiment-analysis"
test_path = kagglehub.dataset_download(test_dataset_handle)
print("Path to dataset files:", test_path)

Path to dataset files: /Users/jaylodha/.cache/kagglehub/datasets/mdismielhossenabir/sentiment-analysis/versions/1


In [21]:
# Fetch the Kaggle dataset that contains both the train and the validation
# split, and report where it was stored.
train_val_dataset_handle = "jp797498e/twitter-entity-sentiment-analysis"
path = kagglehub.dataset_download(train_val_dataset_handle)
print("Path to dataset files:", path)

Path to dataset files: /Users/jaylodha/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2


## Read CSV files into DataFrames

#### load the train and val dfs first

In [22]:
# Both files are read with header=None, so their columns get integer labels.
train_csv_path = f"{path}/twitter_training.csv"
val_csv_path = f"{path}/twitter_validation.csv"
train_df = pd.read_csv(train_csv_path, header=None)
val_df = pd.read_csv(val_csv_path, header=None)

In [23]:
# Preview the first rows; because of header=None the columns are labelled 0-3.
train_df.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


#### Load the test df and post-process it to ensure consistent column names

In [27]:
# Unlike the train/val files, this CSV is read with its own header row.
test_df = pd.read_csv(f"{test_path}/sentiment_analysis.csv")

In [28]:
# Capitalize the sentiment labels (first letter upper-case, the rest lower-case)
# so they use the same casing as the labels in the training data, e.g. "Positive".
test_df['sentiment'] = test_df['sentiment'].str.capitalize()

In [29]:
# Give the label and text columns the integer labels 2 and 3, i.e. the values
# later used as label_col and text_col for all three frames.
test_df.rename(columns={'sentiment': 2, 'text': 3}, inplace=True)

In [31]:
# Check the result of the renaming: the text is now column 3, the label column 2.
test_df.head()

Unnamed: 0,Year,Month,Day,Time of Tweet,3,2,Platform
0,2018,8,18,morning,What a great day!!! Looks like dream.,Positive,Twitter
1,2018,8,18,noon,"I feel sorry, I miss you here in the sea beach",Positive,Facebook
2,2017,8,18,night,Don't angry me,Negative,Facebook
3,2022,6,8,morning,We attend in the class just for listening teac...,Negative,Facebook
4,2022,6,8,noon,"Those who want to go, let them go",Negative,Instagram


## Load data in CustomDataLoader

In [32]:
text_col = 3    # Column label of the input text in all three frames
label_col = 2   # Column label of the sentiment label in all three frames
batch_size = 8 # Batch size handed to CustomDataLoader

# Build one CustomDataLoader over the train, validation and test frames.
# NOTE(review): in test_df the labels 2 and 3 sit at different positions than in
# train_df/val_df, so this presumably relies on label-based column lookup - confirm.
custom_loader = CustomDataLoader(train_df, val_df, test_df, text_col, label_col, batch_size)

# Get train, validation and test loaders; only the test loader is requested
# without shuffling here (val_loader is re-created unshuffled before evaluation).
train_loader = custom_loader.get_train_loader(shuffle=True)
val_loader = custom_loader.get_val_loader(shuffle=True)
test_loader = custom_loader.get_test_loader(shuffle=False)

## Define Model Params

In [22]:
# Data-dependent sizes are read from the dataset behind the training loader.
train_dataset = train_loader.dataset
# Use the GPU when torch reports one, otherwise fall back to the CPU.
compute_device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_params = {
    "vocab_size": train_dataset.vocab_size,
    "num_embeddings": 64,
    "block_size": train_dataset.block_size,
    "num_heads": 4,
    "num_layers": 4,
    # One output class per entry in the dataset's label lookup.
    "output_classes": len(train_dataset.labels_lookup_dict),
    "dropout": 0.2,
    "device": compute_device,
}

## Define Training Params

In [23]:
# Hyper-parameters handed to TrainingLoop together with model_params.
# NOTE(review): the losses logged by the training cell stay close to
# ln(4) ~ 1.386 (chance level for four classes); the learning rate may be
# worth revisiting - confirm.
train_params = dict(
    num_epochs=100,
    eval_interval=10,
    eval_iters=10,
    learning_rate=0.01,
)

## Training Loop

In [24]:
# Checkpoints are written to save_models_path, and training resumes from the
# best checkpoint in that same directory. The resume path is derived from
# save_models_path instead of being an absolute, user-specific path, so the
# cell is not tied to one machine's home directory.
# NOTE(review): assumes the notebook's working directory is the one that
# contains models_v1 - confirm.
save_models_path = "./models_v1"
resume_path = f"{save_models_path}/best_model.pth"

TrainingLoop(model_params, train_params).train(
    train_loader,
    val_loader,
    save_models_path,
    resume_path=resume_path,
)

  checkpoint = torch.load(load_path, map_location=self.device)  # Ensure checkpoint is loaded to the correct device
[32m2024-11-23 20:34:58.091[0m | [1mINFO    [0m | [36mtrain[0m:[36mload_checkpoint[0m:[36m91[0m - [1mCheckpoint loaded from: /home/adityadev/GPTDecoder/models_v1/best_model.pth, resuming from epoch 0[0m
[32m2024-11-23 20:34:58.801[0m | [1mINFO    [0m | [36mtrain[0m:[36mtrain[0m:[36m130[0m - [1mFor epoch 0: Train loss-> 1.3554059267044067 | Val loss-> 1.3709877729415894[0m
[32m2024-11-23 20:34:59.095[0m | [1mINFO    [0m | [36mtrain[0m:[36msave_checkpoint[0m:[36m72[0m - [1mCheckpoint saved: ./models_v1/checkpoint_epoch_0.pth[0m
[32m2024-11-23 20:35:00.911[0m | [1mINFO    [0m | [36mtrain[0m:[36mtrain[0m:[36m130[0m - [1mFor epoch 10: Train loss-> 1.420730435848236 | Val loss-> 1.3868310570716857[0m
[32m2024-11-23 20:35:01.212[0m | [1mINFO    [0m | [36mtrain[0m:[36msave_checkpoint[0m:[36m72[0m - [1mCheckpoint saved: ./

## Evaluate Best model- load it from checkpoint

#### Evaluate first on the validation set

In [25]:
# Re-create the validation loader without shuffling for the evaluation run.
val_loader = custom_loader.get_val_loader(shuffle=False)

# Best checkpoint inside the directory the training cell saves to. A relative
# path is used instead of an absolute, user-specific one so the cell is not
# tied to one machine's home directory.
# NOTE(review): assumes the notebook's working directory is the one that
# contains models_v1 - confirm.
best_model_path = "./models_v1/best_model.pth"

# Lookup from predicted class index back to the label name.
label_mapping = train_loader.dataset.reverse_labels_lookup_dict

report = Evaluate(model_params, best_model_path).evaluate(val_loader, label_mapping)

  checkpoint = torch.load(self.best_model_path, map_location=self.device)
[32m2024-11-23 20:35:51.928[0m | [1mINFO    [0m | [36meval[0m:[36mload_best_model[0m:[36m27[0m - [1mBest model loaded from /home/adityadev/GPTDecoder/models_v1/best_model.pth[0m
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
# Show the per-class precision / recall / F1 report for the validation set.
print(report)

              precision    recall  f1-score   support

  Irrelevant     0.0000    0.0000    0.0000       172
    Negative     0.0000    0.0000    0.0000       266
     Neutral     0.3067    0.2561    0.2792       285
    Positive     0.2754    0.7545    0.4035       277

    accuracy                         0.2820      1000
   macro avg     0.1455    0.2527    0.1707      1000
weighted avg     0.1637    0.2820    0.1913      1000



#### Also evaluate on the test set

In [None]:
# Same evaluation as for the validation set, now on the test loader.
# Note: this overwrites the validation `report` computed earlier.
report = Evaluate(model_params, best_model_path).evaluate(test_loader, label_mapping)

## Infer on Raw text

In [34]:
# Keep the model that Evaluate loaded from the best checkpoint so it can be
# called directly on encoded text.
model = Evaluate(model_params, best_model_path).model

  checkpoint = torch.load(self.best_model_path, map_location=self.device)
[32m2024-11-23 20:42:12.094[0m | [1mINFO    [0m | [36meval[0m:[36mload_best_model[0m:[36m27[0m - [1mBest model loaded from /home/adityadev/GPTDecoder/models_v1/best_model.pth[0m


In [None]:
def infer_on_raw_text(raw_text: str) -> str:
    """Predict the sentiment label for a single raw text string.

    Relies on notebook globals: ``train_loader`` (for ``encode_text``),
    ``model``, ``model_params`` (for the device) and ``label_mapping``.

    Args:
        raw_text: The raw, un-encoded input text.

    Returns:
        The entry of ``label_mapping`` for the highest-scoring class.
    """
    # Encode with the training dataset's encoder and add a leading batch
    # dimension of size 1.
    token_ids = train_loader.dataset.encode_text(raw_text)
    encoded_inp = torch.tensor(
        token_ids, dtype=torch.long, device=model_params.get('device')
    ).unsqueeze(0)

    # Inference only: put the model in eval mode (so dropout layers, if any,
    # are inactive) and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(encoded_inp)

    # Softmax does not change which entry is largest, so the arg-max is taken
    # on the raw scores. Reducing over the last axis works whether logits[0]
    # is a flat vector of class scores or still carries a batch axis of size 1.
    # NOTE(review): assumes logits[0] holds the class scores of exactly one
    # sample - confirm against the model's forward() return value.
    max_index = torch.argmax(logits[0], dim=-1)

    return label_mapping[max_index.item()]

In [36]:
# Quick sanity check of the inference helper on one short example sentence.
text = "I'm very happy today"
infer_on_raw_text(text)

'Positive'