<a href="https://colab.research.google.com/github/Mu-niu13/Emotion-Analysis/blob/main/lib/colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set Up Connection to GitHub

In [1]:
## install git and configure(so we know who commit the code)
# !apt-get install git
# !git config --global user.name "USER_NAME"
# !git config --global user.email "EMAIL"

# access google drive
# from google.colab import drive
# drive.mount('/content/drive')

# clone repo
!git clone https://github.com/Mu-niu13/Emotion-Analysis.git
%cd Emotion-Analysis

# set up personal access for push/pull
from getpass import getpass
token = getpass('Enter your GitHub PAT:')
!git remote set-url origin https://{token}@github.com/Mu-niu13/Emotion-Analysis.git

Cloning into 'Emotion-Analysis'...
remote: Enumerating objects: 94, done.[K
remote: Counting objects: 100% (94/94), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 94 (delta 55), reused 50 (delta 23), pack-reused 0 (from 0)[K
Receiving objects: 100% (94/94), 11.92 MiB | 9.36 MiB/s, done.
Resolving deltas: 100% (55/55), done.
/content/Emotion-Analysis
Enter your GitHub PAT:··········


In [None]:
# run the `install` target of the Makefile in the current working directory
# (expects the earlier `%cd` into the cloned repository to have run first)
!make install

Installing dependencies...
# Install python3-venv if not present (specific to Debian/Ubuntu systems like Colab)
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,172 kB]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,616 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,223 kB]
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelea

## Model Training

#### 1. Set Up

In [3]:
# import libraries
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, classification_report
from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split

# import custom modules
from lib.data_preprocessing import load_data, get_label_columns, create_data_loader
from lib.model import EmotionClassifier
from lib.train import train_epoch
from lib.evaluate import eval_model

# choose the compute device: CUDA when torch can see a GPU, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


#### 2. Training Loop

In [4]:
# Fine-tune the DistilBERT-based EmotionClassifier with early stopping on the
# validation loss, then restore and save the best-performing weights.

# reproducibility: one seed shared by the split, numpy, and torch
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# load data
df = load_data('/content/Emotion-Analysis/data/merged_filtered_data.csv')

# get label columns
label_columns = get_label_columns(df)
n_classes = len(label_columns)

# train val split (90% train / 10% validation)
df_train, df_val = train_test_split(df, test_size=0.1, random_state=SEED)

# init tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# create data loaders
MAX_LEN = 150
BATCH_SIZE = 32

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE, label_columns)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE, label_columns)

# init model
model = EmotionClassifier(n_classes=n_classes, model_name='distilbert-base-uncased')
model = model.to(device)

# define optimizer, scheduler, loss function
EPOCHS = 10
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# the scheduler is sized for one step per training batch over all epochs
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

# enable Automatic Mixed Precision for faster computation
scaler = torch.cuda.amp.GradScaler()

# set up early stopping: stop after `n_epochs_stop` consecutive epochs without
# an improvement in validation loss
best_val_loss = float('inf')
epochs_no_improve = 0
n_epochs_stop = 3
best_model_state = None

# training loop
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')

    train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        scaler
    )
    print(f'Train loss {train_loss}')

    # validation
    val_loss, outputs, targets = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device
    )
    print(f'Validation loss {val_loss}')

    # compute metrics
    # presumably `outputs` holds raw logits of shape (n_samples, n_classes) and
    # `targets` holds integer class indices -- confirm against lib.evaluate
    outputs = torch.softmax(outputs, dim=1)
    outputs_np = outputs.cpu().numpy()
    targets_np = targets.cpu().numpy()

    # predicted classes
    max_prob_classes = outputs_np.argmax(axis=1)
    true_classes = targets_np

    # accuracy
    overall_accuracy = (max_prob_classes == true_classes).mean()
    print(f'Overall Accuracy: {overall_accuracy:.4f}')

    # per-class metrics
    # pass `labels` explicitly so the report always has one row per class and
    # `target_names` cannot mismatch when a class is absent from this split
    report = classification_report(
        true_classes,
        max_prob_classes,
        labels=list(range(n_classes)),
        target_names=label_columns,
        zero_division=0
    )
    print(report)

    # early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # state_dict() hands back references to the live parameter tensors,
        # which later epochs keep updating in place; clone them so the
        # snapshot really holds the weights from this (best) epoch
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= n_epochs_stop:
            print('Early stopping!')
            break

# restore the best weights and persist them once, whether the loop ended by
# early stopping or by exhausting all epochs
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    torch.save(model.state_dict(), 'best_model_state.bin')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

  scaler = torch.cuda.amp.GradScaler()


Epoch 1/10


  with torch.cuda.amp.autocast():
100%|██████████| 5845/5845 [11:03<00:00,  8.82it/s]


Train loss 1.5745280619662883


100%|██████████| 650/650 [01:29<00:00,  7.30it/s]


Validation loss 1.4751659134718087
Overall Accuracy: 0.4802
                      precision    recall  f1-score   support

             neutral       0.44      0.67      0.53      5478
positive recognition       0.55      0.46      0.50      3399
               anger       0.45      0.41      0.43      2018
             sadness       0.41      0.36      0.38       690
             anxiety       0.42      0.46      0.44       361
              regret       0.38      0.20      0.26       773
           happiness       0.61      0.59      0.60      3423
          discomfort       0.41      0.22      0.29      1969
           affection       0.49      0.46      0.47       950
           curiosity       0.39      0.35      0.37       890
            surprise       0.41      0.19      0.26       831

            accuracy                           0.48     20782
           macro avg       0.45      0.40      0.41     20782
        weighted avg       0.48      0.48      0.47     20782

Epoch 2

  with torch.cuda.amp.autocast():
100%|██████████| 5845/5845 [11:16<00:00,  8.64it/s]


Train loss 1.449626219955849


100%|██████████| 650/650 [01:29<00:00,  7.28it/s]


Validation loss 1.4606749470417315
Overall Accuracy: 0.4828
                      precision    recall  f1-score   support

             neutral       0.49      0.57      0.53      5478
positive recognition       0.52      0.50      0.51      3399
               anger       0.43      0.47      0.45      2018
             sadness       0.41      0.39      0.40       690
             anxiety       0.42      0.45      0.43       361
              regret       0.34      0.25      0.29       773
           happiness       0.61      0.61      0.61      3423
          discomfort       0.38      0.32      0.35      1969
           affection       0.46      0.49      0.48       950
           curiosity       0.38      0.35      0.37       890
            surprise       0.37      0.22      0.28       831

            accuracy                           0.48     20782
           macro avg       0.44      0.42      0.43     20782
        weighted avg       0.48      0.48      0.48     20782

Epoch 3

  with torch.cuda.amp.autocast():
100%|██████████| 5845/5845 [11:15<00:00,  8.66it/s]


Train loss 1.3742933284328778


100%|██████████| 650/650 [01:30<00:00,  7.22it/s]


Validation loss 1.4675356306479528
Overall Accuracy: 0.4839
                      precision    recall  f1-score   support

             neutral       0.48      0.58      0.53      5478
positive recognition       0.53      0.49      0.51      3399
               anger       0.47      0.42      0.44      2018
             sadness       0.41      0.41      0.41       690
             anxiety       0.42      0.47      0.45       361
              regret       0.34      0.27      0.30       773
           happiness       0.61      0.61      0.61      3423
          discomfort       0.36      0.34      0.35      1969
           affection       0.48      0.49      0.48       950
           curiosity       0.39      0.32      0.35       890
            surprise       0.34      0.23      0.28       831

            accuracy                           0.48     20782
           macro avg       0.44      0.42      0.43     20782
        weighted avg       0.48      0.48      0.48     20782

Epoch 4

  with torch.cuda.amp.autocast():
100%|██████████| 5845/5845 [11:16<00:00,  8.64it/s]


Train loss 1.305825540864437


100%|██████████| 650/650 [01:30<00:00,  7.21it/s]


Validation loss 1.4959494968561027
Overall Accuracy: 0.4746
                      precision    recall  f1-score   support

             neutral       0.50      0.53      0.51      5478
positive recognition       0.53      0.47      0.50      3399
               anger       0.41      0.49      0.45      2018
             sadness       0.40      0.41      0.40       690
             anxiety       0.41      0.48      0.44       361
              regret       0.34      0.24      0.28       773
           happiness       0.57      0.64      0.60      3423
          discomfort       0.37      0.30      0.33      1969
           affection       0.44      0.50      0.47       950
           curiosity       0.35      0.36      0.36       890
            surprise       0.37      0.19      0.26       831

            accuracy                           0.47     20782
           macro avg       0.43      0.42      0.42     20782
        weighted avg       0.47      0.47      0.47     20782

Epoch 5

  with torch.cuda.amp.autocast():
100%|██████████| 5845/5845 [11:15<00:00,  8.65it/s]


Train loss 1.2440586288226172


100%|██████████| 650/650 [01:30<00:00,  7.20it/s]


Validation loss 1.540733828177819
Overall Accuracy: 0.4650
                      precision    recall  f1-score   support

             neutral       0.53      0.46      0.49      5478
positive recognition       0.51      0.50      0.50      3399
               anger       0.40      0.48      0.44      2018
             sadness       0.39      0.41      0.40       690
             anxiety       0.43      0.47      0.45       361
              regret       0.34      0.24      0.28       773
           happiness       0.57      0.62      0.59      3423
          discomfort       0.35      0.35      0.35      1969
           affection       0.43      0.52      0.47       950
           curiosity       0.33      0.40      0.36       890
            surprise       0.31      0.24      0.27       831

            accuracy                           0.46     20782
           macro avg       0.42      0.43      0.42     20782
        weighted avg       0.47      0.46      0.46     20782

Early st