<a href="https://colab.research.google.com/github/Mu-niu13/Emotion-Analysis/blob/main/lib/colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set Up Connection to GitHub

In [1]:
## install git and configure(so we know who commit the code)
# !apt-get install git
# !git config --global user.name "USER_NAME"
# !git config --global user.email "EMAIL"

# access google drive
from google.colab import drive
drive.mount('/content/drive')

# clone repo
!git clone https://github.com/Mu-niu13/Emotion-Analysis.git
%cd Emotion-Analysis

# set up personal access for push/pull
from getpass import getpass
token = getpass('Enter your GitHub PAT:')
!git remote set-url origin https://{token}@github.com/Mu-niu13/Emotion-Analysis.git

Mounted at /content/drive
Cloning into 'Emotion-Analysis'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 19 (delta 5), reused 12 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (19/19), 8.43 MiB | 13.26 MiB/s, done.
Resolving deltas: 100% (5/5), done.
/content/Emotion-Analysis
Enter your GitHub PAT:··········


In [15]:
# Run the `install` target of the Makefile in the current directory
# (its output below starts with "Installing dependencies...").
!make install

Installing dependencies...
# Install python3-venv if not present (specific to Debian/Ubuntu systems like Colab)
Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,172 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 https://r2

## Model Training

In [1]:
# Import libraries
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation, so
# the optimizer is taken from torch.optim and bound to the same name.
from torch.optim import AdamW
from transformers import BertTokenizer, get_linear_schedule_with_warmup

# Import custom modules
from lib.data_preprocessing import load_data, get_label_columns, create_data_loader
from lib.model import EmotionClassifier
from lib.train import train_epoch
from lib.evaluate import eval_model

# Configuration: the values a reader is most likely to tune
SEED = 42
# NOTE(review): this path is relative to the current working directory. The setup
# cell cds into the repo root, from where `../data` points outside the checkout;
# confirm where the dataset actually lives.
DATA_PATH = '../data/gi_emotions_dataset.csv'
PRETRAINED_MODEL = 'bert-base-uncased'
VAL_FRACTION = 0.1
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5

# Seed the global RNGs so repeated runs start from the same state
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load data
df = load_data(DATA_PATH)

# Prepare label columns
label_columns = get_label_columns(df)
n_classes = len(label_columns)

# Handle 'example_very_unclear' cases (optional)
# df = df[df['example_very_unclear'] == False]

# Split data into train and validation sets
df_train, df_val = train_test_split(df, test_size=VAL_FRACTION, random_state=SEED)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)

# Create data loaders
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE, label_columns)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE, label_columns)

# Initialize model
model = EmotionClassifier(n_classes=n_classes)
model = model.to(device)

# Define optimizer, scheduler, loss function.
# eps and weight_decay are spelled out to keep the values the deprecated
# transformers.AdamW used by default (1e-6 and 0.0); torch.optim.AdamW defaults
# differ. The old `correct_bias=False` switch has no PyTorch equivalent, so bias
# correction is now always applied.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)

# The scheduler decays the learning rate linearly over every optimizer step of the run
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Multi-label objective computed on raw logits
loss_fn = nn.BCEWithLogitsLoss().to(device)

# Metric functions are imported once here instead of on every epoch
from sklearn.metrics import f1_score, roc_auc_score

# Training loop: one training pass and one validation pass per epoch.
# After the loop, `outputs`, `targets` and `uncertainties` hold the NumPy arrays
# from the final epoch's validation pass.
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    print(f'Train loss {train_loss}')

    val_loss, outputs, targets, uncertainties = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val),
        n_mc_samples=10
    )
    print(f'Validation loss {val_loss}')

    # Compute metrics
    # NOTE(review): assumes eval_model returns raw logits; if it already returns
    # probabilities this sigmoid is applied twice. Confirm against lib/evaluate.
    outputs = torch.sigmoid(outputs)
    outputs = outputs.cpu().numpy()
    targets = targets.cpu().numpy()
    uncertainties = uncertainties.cpu().numpy()

    # Per-label ROC-AUC and F1 (F1 thresholds the scores at 0.5)
    for i, label in enumerate(label_columns):
        y_true = targets[:, i]
        y_pred = outputs[:, i]
        # roc_auc_score raises ValueError when y_true contains a single class;
        # report NaN for that label instead of aborting the whole run.
        if np.unique(y_true).size < 2:
            auc = float('nan')
        else:
            auc = roc_auc_score(y_true, y_pred)
        y_pred_label = (y_pred >= 0.5).astype(int)
        # zero_division=0 yields the same 0.0 score as the default but without
        # emitting an undefined-metric warning for every empty label.
        f1 = f1_score(y_true, y_pred_label, zero_division=0)
        print(f'{label} - AUC: {auc:.4f}, F1: {f1:.4f}')

    # Analyze uncertainties
    avg_uncertainty = uncertainties.mean()
    print(f'Average uncertainty: {avg_uncertainty:.4f}')

# Inspect the validation examples the model is least certain about.
# Uses `outputs`, `targets` and `uncertainties` left over from the last epoch.
import pandas as pd

# One row per validation example: per-label scores, the uncertainty averaged
# over labels, the true label vector and the source text.
# NOTE(review): pairing rows with df_val['text'] by position assumes the
# validation loader yields examples in df_val order; confirm create_data_loader
# does not shuffle.
results_df = pd.DataFrame(outputs, columns=label_columns).assign(
    uncertainty=uncertainties.mean(axis=1),
    true_labels=list(targets),
    text=df_val['text'].reset_index(drop=True),
)

# Keep the five rows with the largest mean uncertainty
ranked_by_uncertainty = results_df.sort_values(by='uncertainty', ascending=False)
high_uncertainty = ranked_by_uncertainty.head(5)
print("Examples with high uncertainty:")
print(high_uncertainty[['text', 'uncertainty']])


KeyboardInterrupt: 