In [1]:
%load_ext autoreload
%autoreload 2

## Imports

In [2]:
import torch
import kagglehub
import pandas as pd
from data_loader import CustomDataLoader
from train import TrainingLoop
from eval import Evaluate
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


## Download Dataset

In [3]:
# Download the latest version of the held-out test dataset via kagglehub.
# `test_path` is the local directory returned by the call; a later cell reads
# sentiment_analysis.csv from it.
test_path = kagglehub.dataset_download("mdismielhossenabir/sentiment-analysis")

print("Path to dataset files:", test_path)

Path to dataset files: /Users/jaylodha/.cache/kagglehub/datasets/mdismielhossenabir/sentiment-analysis/versions/1


In [4]:
# Download the Twitter entity-sentiment dataset; it provides both the training
# and the validation CSVs. `path` is the local directory returned by the call.
path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")

print("Path to dataset files:", path)

Path to dataset files: /Users/jaylodha/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2


## Read data into CSV files

#### load the train and val dfs first

In [5]:
# Both files are read with header=None, so pandas labels the columns with
# integers. In train_df.head(), column 2 holds the sentiment label and
# column 3 holds the tweet text.
train_df = pd.read_csv(f"{path}/twitter_training.csv", header = None)
val_df = pd.read_csv(f"{path}/twitter_validation.csv", header = None)

In [6]:
train_df.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


#### Load the test df - also apply some post-processing to ensure consistent column names

In [7]:
# The test CSV is read with its own header row, so its columns are named
# (e.g. 'sentiment', 'text') rather than integer-labelled like train/val.
test_df = pd.read_csv(f"{test_path}/sentiment_analysis.csv")

In [8]:
# Capitalize each label (first character upper-case, the rest lower-case),
# presumably so the values match the 'Positive'/'Negative' style used in the
# training data - confirm against the raw test labels.
test_df['sentiment'] = test_df['sentiment'].str.capitalize()

In [9]:
# Give the test frame the same integer column labels as the header-less
# train/val frames: the label goes to column 2 and the text to column 3.
test_df.rename(columns={'text': 3, 'sentiment': 2}, inplace=True)

In [10]:
test_df.head()

Unnamed: 0,Year,Month,Day,Time of Tweet,3,2,Platform
0,2018,8,18,morning,What a great day!!! Looks like dream.,Positive,Twitter
1,2018,8,18,noon,"I feel sorry, I miss you here in the sea beach",Positive,Facebook
2,2017,8,18,night,Don't angry me,Negative,Facebook
3,2022,6,8,morning,We attend in the class just for listening teac...,Negative,Facebook
4,2022,6,8,noon,"Those who want to go, let them go",Negative,Instagram


## Load data in CustomDataLoader

In [11]:
# Integer column labels shared by all three frames: train/val were read with
# header=None, and the test frame's columns were renamed to 2 and 3 to match.
text_col = 3    # Column index for input text
label_col = 2   # Column index for labels
batch_size = 64 # Batch size for training and validation

# Initialize CustomDataLoader
custom_loader = CustomDataLoader(train_df, val_df, test_df, text_col, label_col, batch_size)

# Get train, validation and test loaders.
# NOTE: val_loader is shuffled here; the evaluation section rebuilds it with
# shuffle=False before computing the report.
train_loader = custom_loader.get_train_loader(shuffle=True)
val_loader = custom_loader.get_val_loader(shuffle=True)
test_loader = custom_loader.get_test_loader(shuffle=False)

## Define Model Params

In [12]:
# Model hyperparameters. Vocabulary size, block size and the number of output
# classes are read from the training dataset rather than hard-coded.
model_params = dict(
    vocab_size=train_loader.dataset.vocab_size,
    num_embeddings=384,
    block_size=train_loader.dataset.block_size,
    num_heads=6,
    num_layers=6,
    output_classes=len(train_loader.dataset.labels_lookup_dict),
    dropout=0.2,
    # Use the GPU when one is visible to torch, otherwise fall back to CPU.
    device=('cuda' if torch.cuda.is_available() else 'cpu'),
)

## Define Training Params

In [13]:
# Training-loop settings passed to TrainingLoop together with model_params.
train_params = dict(
    num_epochs=100,
    eval_interval=10,
    eval_iters=10,
    learning_rate=0.01,
)

## Training Loop

In [None]:
# Checkpoint location handed to TrainingLoop.train().
save_models_path = "./models_v1"
# Resume from the best checkpoint inside that same directory instead of an
# absolute path under one user's home directory, so the cell runs on any
# machine. Assumes the notebook is started from the project root and that a
# previous run left best_model.pth there - confirm before the first run.
resume_path = f"{save_models_path}/best_model.pth"
TrainingLoop(model_params, train_params).train(train_loader, val_loader, save_models_path, resume_path=resume_path)

## Evaluate Best model- load it from checkpoint

#### Evaluate first on the validation set

In [None]:
# Rebuild the validation loader without shuffling for evaluation.
val_loader = custom_loader.get_val_loader(shuffle=False)
# NOTE(review): absolute, machine-specific checkpoint path - point this at
# your own copy of the checkpoint before running.
best_model_path = "/Users/jaylodha/Downloads/best_model_v1.pth"
# Lookup used to turn predicted class indices back into label names
# (presumably index -> label, given how infer_on_raw_text indexes it - confirm).
label_mapping = train_loader.dataset.reverse_labels_lookup_dict

report = Evaluate(model_params, best_model_path).evaluate(val_loader, label_mapping)

[32m2024-12-08 19:03:42.648[0m | [1mINFO    [0m | [36meval[0m:[36mload_best_model[0m:[36m27[0m - [1mBest model loaded from /Users/jaylodha/Downloads/best_model_v1.pth[0m


In [None]:
print(report)

              precision    recall  f1-score   support

  Irrelevant     0.9112    0.8953    0.9032       172
    Negative     0.9490    0.9098    0.9290       266
     Neutral     0.8684    0.9263    0.8964       285
    Positive     0.9081    0.8917    0.8998       277

    accuracy                         0.9070      1000
   macro avg     0.9092    0.9058    0.9071      1000
weighted avg     0.9082    0.9070    0.9072      1000



#### Also evaluate on the test set

In [None]:
# Score the same checkpoint on the separate test dataset. The printed report
# shows zero support for "Irrelevant" on this set, which is presumably what
# triggers the metric warnings emitted by this cell.
report = Evaluate(model_params, best_model_path).evaluate(test_loader, label_mapping)

[32m2024-12-08 18:58:15.569[0m | [1mINFO    [0m | [36meval[0m:[36mload_best_model[0m:[36m27[0m - [1mBest model loaded from /Users/jaylodha/Downloads/best_model_v1.pth[0m
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print(report)

              precision    recall  f1-score   support

  Irrelevant     0.0000    0.0000    0.0000         0
    Negative     0.4390    0.2687    0.3333       134
     Neutral     0.3962    0.3166    0.3520       199
    Positive     0.4734    0.5361    0.5028       166

    accuracy                         0.3768       499
   macro avg     0.3272    0.2803    0.2970       499
weighted avg     0.4334    0.3768    0.3971       499



## Infer on Raw text

In [None]:
# Construct Evaluate only to get hold of the model it loads from the
# checkpoint; the evaluator itself is discarded.
model = Evaluate(model_params, best_model_path).model

[32m2024-12-08 18:58:39.137[0m | [1mINFO    [0m | [36meval[0m:[36mload_best_model[0m:[36m27[0m - [1mBest model loaded from /Users/jaylodha/Downloads/best_model_v1.pth[0m


In [None]:
def infer_on_raw_text(raw_text: str) -> str:
    """Predict the sentiment label for a single raw text string.

    Relies on notebook globals: ``train_loader`` (for the dataset's
    ``encode_text``), ``model``, ``model_params`` and ``label_mapping``.

    Args:
        raw_text: Unprocessed input text.

    Returns:
        The entry of ``label_mapping`` for the highest-scoring class.
    """
    # Inference only: switch off dropout and skip autograd bookkeeping.
    model.eval()

    encoded_inp = torch.tensor(
        train_loader.dataset.encode_text(raw_text),
        dtype=torch.long,
        device=model_params.get('device'),
    ).unsqueeze(0)  # add a batch dimension of size 1

    with torch.no_grad():
        # The model output is indexed with [0] to obtain the class scores;
        # presumably the model returns a (logits, ...) tuple - confirm
        # against the model's forward().
        logits = model(encoded_inp)[0]

    # Softmax is monotonic, so argmax over the raw scores selects the same
    # class as argmax over the probabilities.
    max_index = torch.argmax(logits, dim=-1)

    return label_mapping[max_index.item()]

In [None]:
text = "I'm sad"
infer_on_raw_text(text)

'Negative'

## Visualize embeddings

In [None]:
# Reference vocabulary of everyday English words whose embeddings get plotted.
# The order is kept exactly as before, since it determines the order of the
# embedding dictionary built from this list.
common_words = [
    # emotions
    "happy", "sad", "angry", "excited", "joy", "love", "hate", "calm", "peace",
    # intensifiers
    "extremely", "moderately", "barely", "very", "slightly",
    # speed
    "quick", "slow", "fast",
    # personal qualities
    "intelligent", "bright", "dull", "beautiful", "ugly", "kind", "rude",
    # strength
    "strong", "weak", "powerful", "fragile",
    # temperature
    "warm", "cold", "hot", "cool",
    # weather and moisture
    "rainy", "sunny", "cloudy", "clear", "dry", "wet",
    # wealth
    "rich", "poor", "wealthy", "humble",
    # sound
    "loud", "quiet",
    # texture
    "soft", "hard", "smooth", "rough",
]

### Function to generate embeddings from the loaded model

In [None]:
def get_word_embeddings(model, unique_words, tokenizer, device, aggregation="sum"):
    """
    Generates one fixed-size embedding per word from the model's embedding layers.

    Each word is tokenized, its token and positional embeddings are added, and
    the per-token vectors are pooled into a single vector.

    Args:
        model: Trained model exposing ``token_embeddings`` and
            ``positional_embeddings`` layers. It is switched to eval mode.
        unique_words: Iterable of words to embed.
        tokenizer: Object whose ``encode(word)`` returns the word's token IDs.
        device: The device (CPU/GPU) on which the ID tensors are created.
        aggregation: Pooling across a word's tokens: "mean", "sum" or "first".

    Returns:
        word_embeddings: Dictionary of word -> 1-D NumPy embedding.

    Raises:
        ValueError: If ``aggregation`` is not a supported method, or if the
            tokenizer yields no tokens for a word.
    """
    # Validate up front so a bad method is reported even for an empty word list.
    if aggregation not in ("mean", "sum", "first"):
        raise ValueError("Invalid aggregation method. Choose from 'mean', 'sum', or 'first'.")

    model.eval()  # Set model to evaluation mode
    word_embeddings = {}

    # Pure lookup: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        for word in unique_words:
            # Explicit long dtype: embedding layers require integer indices,
            # and an empty token list would otherwise become a float tensor.
            token_ids = torch.tensor([tokenizer.encode(word)], dtype=torch.long, device=device)
            num_tokens = token_ids.size(1)
            if num_tokens == 0:
                raise ValueError(f"Tokenizer produced no tokens for word {word!r}.")

            # Positions 0..num_tokens-1, with a leading batch dimension.
            position_ids = torch.arange(num_tokens, device=device).unsqueeze(0)

            # Combine token + positional embeddings.
            embedding = model.token_embeddings(token_ids) + model.positional_embeddings(position_ids)

            # Pool across the token dimension to get one vector per word.
            if aggregation == "mean":
                final_embedding = embedding.mean(dim=1)
            elif aggregation == "sum":
                final_embedding = embedding.sum(dim=1)
            else:  # "first"
                final_embedding = embedding[:, 0, :]

            word_embeddings[word] = final_embedding.squeeze(0).cpu().numpy()

    return word_embeddings

In [None]:
# Embed the reference words with the training dataset's tokenizer; the
# aggregation argument is omitted, so the function's default is used.
word_embeddings = get_word_embeddings(model, common_words, custom_loader.train_dataset.tokenizer, model_params['device'])

### Function to perform dimensionality reduction- uses PCA

In [None]:
from sklearn.decomposition import PCA

def reduce_embeddings(word_embeddings, n_components=2):
    """
    Projects word embeddings onto their leading principal components (PCA).

    Args:
        word_embeddings: Dictionary of word -> embedding.
        n_components: Number of output dimensions to keep (2 or 3).

    Returns:
        reduced_embeddings: Dictionary of word -> reduced embedding, with the
        words in the same order as the input dictionary.
    """
    # A dict iterates its keys and values in the same order, so row i of the
    # projection belongs to the i-th word.
    projected = PCA(n_components=n_components).fit_transform(list(word_embeddings.values()))

    return dict(zip(word_embeddings.keys(), projected))


In [None]:
# Reduce with the default n_components=2, i.e. a 2-D projection.
reduced_word_embeddings = reduce_embeddings(word_embeddings)

### Function to visualise embeddings- saving file as html

In [None]:
import plotly.express as px
import pandas as pd
import plotly.io as pio

def visualize_embeddings(reduced_embeddings, output_html_path="embedding_visualization_v3.html"):
    """
    Visualizes word embeddings using Plotly and saves the plot as an HTML file.

    Args:
        reduced_embeddings: Dictionary of word -> reduced (2-D or 3-D) embedding.
        output_html_path: Path to save the interactive HTML plot.

    Raises:
        ValueError: If ``reduced_embeddings`` is empty, or if the embeddings
            are not 2- or 3-dimensional (judged from the first entry).
    """
    # Fail with a clear message instead of an IndexError on embeddings[0].
    if not reduced_embeddings:
        raise ValueError("No embeddings to visualize.")

    words = list(reduced_embeddings.keys())
    embeddings = list(reduced_embeddings.values())

    # Determine the dimensionality once; it selects both the column names
    # and the plot type.
    n_dims = len(embeddings[0])
    if n_dims not in (2, 3):
        raise ValueError("Embeddings must be reduced to 2 or 3 dimensions.")

    # Create a DataFrame for visualization: one axis column per dimension.
    df = pd.DataFrame(embeddings, columns=['x', 'y', 'z'][:n_dims])
    df['word'] = words

    # Generate the plot
    if n_dims == 2:
        fig = px.scatter(df, x='x', y='y', text='word', title='Word Embeddings Visualization (2D)')
    else:
        fig = px.scatter_3d(df, x='x', y='y', z='z', text='word', title='Word Embeddings Visualization (3D)')

    fig.update_traces(textposition='top center')

    # Save the plot as an HTML file
    pio.write_html(fig, file=output_html_path, auto_open=False)
    print(f"Plot saved as {output_html_path}")


ModuleNotFoundError: No module named 'plotly'