<a href="https://colab.research.google.com/github/Mu-niu13/Emotion-Analysis/blob/a/lib/colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set up Connection to Github

In [2]:
## install git and configure(so we know who commit the code)
# !apt-get install git
# !git config --global user.name "USER_NAME"
# !git config --global user.email "EMAIL"

# access google drive
# from google.colab import drive
# drive.mount('/content/drive')

# clone repo
!git clone https://github.com/Mu-niu13/Emotion-Analysis.git
%cd Emotion-Analysis

# set up personal access for push/pull
from getpass import getpass
token = getpass('Enter your GitHub PAT:')
!git remote set-url origin https://{token}@github.com/Mu-niu13/Emotion-Analysis.git

Cloning into 'Emotion-Analysis'...
remote: Enumerating objects: 82, done.[K
remote: Counting objects: 100% (82/82), done.[K
remote: Compressing objects: 100% (60/60), done.[K
remote: Total 82 (delta 46), reused 47 (delta 19), pack-reused 0 (from 0)[K
Receiving objects: 100% (82/82), 11.92 MiB | 9.12 MiB/s, done.
Resolving deltas: 100% (46/46), done.
/content/Emotion-Analysis
Enter your GitHub PAT:··········


In [3]:
!git config --global user.name "Lulu-1121"
!git config --global user.email "hd162@duke.edu"

In [4]:
!make install

Installing dependencies...
# Install python3-venv if not present (specific to Debian/Ubuntu systems like Colab)
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:9 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,454 kB]
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,506 kB]
Get:12 http://archive.ubuntu.com/ubuntu

## Model Training

In [None]:
# Import libraries
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import DistilBertTokenizer, AdamW, get_linear_schedule_with_warmup
# Split data into train and validation sets
from sklearn.model_selection import train_test_split

# Import custom modules
from lib.data_preprocessing import load_data, get_label_columns, create_data_loader
from lib.model import EmotionClassifier
from lib.train import train_epoch
from lib.evaluate import eval_model

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load data
df = load_data('/content/Emotion-Analysis/data/go_emotions_dataset.csv')

# Prepare label columns
label_columns = get_label_columns(df)
n_classes = len(label_columns)

# Handle 'example_very_unclear' cases (optional)
# For efficiency, you may choose to exclude unclear examples
# df = df[df['example_very_unclear'] == False]

# Limit dataset size for efficiency (optional)
# df = df.sample(n=1000, random_state=42).reset_index(drop=True)


df_train, df_val = train_test_split(df, test_size=0.1, random_state=42)

# Initialize tokenizer (using DistilBERT)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Create data loaders with reduced MAX_LEN and BATCH_SIZE
MAX_LEN = 150
BATCH_SIZE = 32

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE, label_columns)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE, label_columns)

# Initialize model (using DistilBERT)
model = EmotionClassifier(n_classes=n_classes, model_name='distilbert-base-uncased')
model = model.to(device)

# Define optimizer, scheduler, loss function
EPOCHS = 2  # Reduced from 3
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.BCEWithLogitsLoss().to(device)

# Enable Automatic Mixed Precision (AMP) for faster computation
scaler = torch.cuda.amp.GradScaler()



# Training loop with AMP
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')

    # Training step
    train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        scaler  # Pass the scaler to the training function
    )
    print(f'Train loss {train_loss}')

    # Validation step
    val_loss, outputs, targets, uncertainties = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val),
        n_mc_samples=3  # Reduced from 10
    )
    print(f'Validation loss {val_loss}')

    # Compute metrics
    outputs = torch.sigmoid(outputs)
    outputs = outputs.cpu().numpy()
    targets = targets.cpu().numpy()
    uncertainties = uncertainties.cpu().numpy()

    # Find the predicted class with maximum probability for each sample
    max_prob_classes = outputs.argmax(axis=1)  # Index of the class with the highest probability
    true_classes = targets.argmax(axis=1)      # True class index

    # Calculate overall accuracy
    overall_accuracy = (max_prob_classes == true_classes).mean()
    print(f'Overall Accuracy: {overall_accuracy:.4f}')

    # Per-class metrics
    for i, label in enumerate(label_columns):
        y_true = (true_classes == i).astype(int)
        y_pred = (max_prob_classes == i).astype(int)

        auc = roc_auc_score(y_true, outputs[:, i])  # Class-specific AUC

        f1 = f1_score(y_true, y_pred, zero_division=0)
        print(f'{label} - AUC: {auc:.4f}, F1: {f1:.4f}')

    # Analyze uncertainties
    avg_uncertainty = uncertainties.mean()
    print(f'Average uncertainty: {avg_uncertainty:.4f}')


results_df = pd.DataFrame(outputs, columns=label_columns)
results_df['uncertainty'] = uncertainties.mean(axis=1)
results_df['true_labels'] = list(targets)
results_df['text'] = df_val['text'].reset_index(drop=True)

# Find examples with high uncertainty
high_uncertainty = results_df.sort_values(by='uncertainty', ascending=False).head(5)
print("Examples with high uncertainty:")
print(high_uncertainty[['text', 'uncertainty']])

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

TypeError: EmotionClassifier.__init__() got an unexpected keyword argument 'model_name'