<a href="https://colab.research.google.com/github/JasperAD11/Sentiment-Across-Signals-Neural-Networks-vs.-LLMs/blob/deliverable/notebook_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 1

## Libraries and GloVe

In [1]:
import tensorflow as tf
import os
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from tensorflow import keras
from tensorflow.keras import layers, models, initializers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import TextVectorization, Input, Embedding, LSTM, Dropout, Dense
from tensorflow.keras.initializers import Constant
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

## Binary model

### Dataset

In [2]:
# Download the aclImdb sentiment archive from ai.stanford.edu into the working directory
# and unpack it. The following cell reads the data from the `aclImdb/` folder.
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  5305k      0  0:00:15  0:00:15 --:--:-- 13.3M


In [3]:
# Directory path of the extracted IMDB archive
dataset_dir = "aclImdb"

# Remove the unlabeled reviews so only labeled class folders are picked up under train/.
# Guarded so this cell can be re-run after the folder has already been deleted
# (an unconditional rmtree raises FileNotFoundError on the second run).
unsup_dir = os.path.join(dataset_dir, "train", "unsup")
if os.path.isdir(unsup_dir):
    shutil.rmtree(unsup_dir)

# Loading parameters shared by all splits
batch_size = 32
seed = 42

# 80/20 train/validation split of the train folder. Both calls use the same
# `validation_split` and `seed` so the two subsets are taken from one split.
train_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, "train"),
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=seed
)

val_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, "train"),
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=seed
)

# Held-out test folder
test_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, "test"),
    batch_size=batch_size
)

# To train the Final Model: training and validation batches combined, then shuffled
full_train_ds = train_ds.concatenate(val_ds).shuffle(10000)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [4]:
# Vocabulary size cap and fixed number of tokens produced per text
max_vocab = 20000
sequence_len = 300

# Text -> integer token ids, with `sequence_len` ids per text
vectorizer = TextVectorization(
    output_mode='int',
    max_tokens=max_vocab,
    output_sequence_length=sequence_len,
)

# Learn the vocabulary from the training texts only (labels are dropped first)
text_only_train = train_ds.map(lambda text, label: text)
vectorizer.adapt(text_only_train)

In [5]:
def vectorize_dataset(ds):
    """Return `ds` with each text batch replaced by the vectorizer's token ids.

    Args:
        ds: a dataset yielding `(text, label)` pairs.

    Returns:
        A dataset yielding `(vectorizer(text), label)`, cached and prefetched.
    """
    def _to_token_ids(text, label):
        return vectorizer(text), label

    tokenized = ds.map(_to_token_ids)
    return tokenized.cache().prefetch(buffer_size=tf.data.AUTOTUNE)


# Replace every split with its tokenized counterpart
train_ds = vectorize_dataset(train_ds)
val_ds = vectorize_dataset(val_ds)
test_ds = vectorize_dataset(test_ds)
full_train_ds = vectorize_dataset(full_train_ds)


### Final Binary Model (model 2 in notebook1)

In [6]:
# Binary sentiment classifier: token embeddings averaged over the sequence,
# two small ReLU layers, and a single sigmoid output unit.
model_binary = keras.Sequential()
model_binary.add(layers.Embedding(input_dim=max_vocab, output_dim=128))
model_binary.add(layers.GlobalAveragePooling1D())

# Dense layers 1 and 2
for hidden_units in (8, 8):
    model_binary.add(layers.Dense(hidden_units, activation='relu'))

# Dense layer 3 (output) - binary classification
model_binary.add(layers.Dense(1, activation='sigmoid'))

model_binary.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [7]:
# Train the final binary model on the combined train+validation data.
# NOTE(review): the test set is used as validation data here, so it also drives
# early stopping and checkpoint selection - confirm this is intended for the final model.
history = model_binary.fit(
    full_train_ds,
    validation_data=test_ds,
    epochs=100,
    callbacks=[
        # `model_binary` is compiled with metrics=['accuracy'] only, so `val_AUC` never
        # appears in the logs and early stopping could not trigger. Monitor the same
        # quantity as the checkpoint instead.
        EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True, mode='max'),
        # Keep the best epoch on disk; this file is loaded later to build the joint model
        ModelCheckpoint('best_model_binary.h5', monitor='val_accuracy', save_best_only=True, mode='max'),
    ],
)

model_binary.summary()

Epoch 1/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6301 - loss: 0.6197

  current = self.get_monitor_value(logs)


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 11ms/step - accuracy: 0.6302 - loss: 0.6196 - val_accuracy: 0.8548 - val_loss: 0.3526
Epoch 2/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8688 - loss: 0.3078



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 6ms/step - accuracy: 0.8688 - loss: 0.3078 - val_accuracy: 0.8742 - val_loss: 0.3075
Epoch 3/20
[1m769/782[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.9064 - loss: 0.2318



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9065 - loss: 0.2317 - val_accuracy: 0.8754 - val_loss: 0.3067
Epoch 4/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9279 - loss: 0.1883 - val_accuracy: 0.8733 - val_loss: 0.3203
Epoch 5/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9425 - loss: 0.1571 - val_accuracy: 0.8683 - val_loss: 0.3481
Epoch 6/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9522 - loss: 0.1373 - val_accuracy: 0.8457 - val_loss: 0.4238
Epoch 7/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9435 - loss: 0.1502 - val_accuracy: 0.7756 - val_loss: 0.6509
Epoch 8/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9447 - loss: 0.1472 - val_

## Multi-class model

### Dataset

In [11]:
# Unzip to a folder
!unzip emotions-goemotions.zip -d emotions_data

Archive:  emotions-goemotions.zip
   creating: emotions_data/goemotions/
  inflating: emotions_data/goemotions/.DS_Store  
  inflating: emotions_data/__MACOSX/goemotions/._.DS_Store  
   creating: emotions_data/goemotions/data/
  inflating: emotions_data/__MACOSX/goemotions/._data  
  inflating: emotions_data/goemotions/data/.DS_Store  
  inflating: emotions_data/__MACOSX/goemotions/data/._.DS_Store  
   creating: emotions_data/goemotions/data/full_dataset/
  inflating: emotions_data/__MACOSX/goemotions/data/._full_dataset  
  inflating: emotions_data/goemotions/data/full_dataset/goemotions_1.csv  
  inflating: emotions_data/__MACOSX/goemotions/data/full_dataset/._goemotions_1.csv  
  inflating: emotions_data/goemotions/data/full_dataset/goemotions_3.csv  
  inflating: emotions_data/goemotions/data/full_dataset/goemotions_2.csv  


In [12]:
# Load the first GoEmotions CSV extracted from the archive.
# NOTE(review): goemotions_2.csv and goemotions_3.csv sit in the same folder but are
# not read here - confirm that using only the first file is intended.
dataset = pd.read_csv(
    filepath_or_buffer='emotions_data/goemotions/data/full_dataset/goemotions_1.csv',
)

In [13]:
# Drop the metadata columns that are used neither as model input nor as labels.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this cell
# re-runnable: a second run no longer raises KeyError for already-removed columns.
metadata_columns = [
    "id", "author", "subreddit", "link_id", "parent_id",
    "created_utc", "rater_id", "example_very_unclear",
]
dataset = dataset.drop(columns=metadata_columns, errors="ignore")

In [14]:
# Tokenize the GoEmotions texts with the vectorizer adapted on the IMDB training
# reviews; it is not re-adapted here, so both models use the same vocabulary.
emotion_texts = dataset['text'].values
X = vectorizer(emotion_texts)

# Every remaining column except the raw text is used as a target column
y = dataset.drop(columns=['text'])

In [15]:
# Work with a NumPy array for the scikit-learn split
if isinstance(X, tf.Tensor):
    X_numpy = X.numpy()
else:
    X_numpy = X

# First split: hold out 20% of all rows as the test set
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_numpy, y, test_size=0.2, random_state=42
)

# Second split: take 20% of the training rows for validation (16% of the original)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42
)

### Final Multi-class Model (model 5 in notebook1)

In [26]:
# Emotion classifier: averaged token embeddings, two ReLU blocks with dropout, and
# 28 sigmoid output units trained with binary cross-entropy (one score per label).
model_multi_class = keras.Sequential()
model_multi_class.add(layers.Embedding(input_dim=max_vocab, output_dim=128))
model_multi_class.add(layers.GlobalAveragePooling1D())

# Hidden blocks: Dense(128) -> Dropout, Dense(64) -> Dropout
for hidden_units in (128, 64):
    model_multi_class.add(layers.Dense(hidden_units, activation='relu'))
    model_multi_class.add(layers.Dropout(0.2))

model_multi_class.add(layers.Dense(28, activation='sigmoid'))

model_multi_class.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

In [36]:
# Final fit on the full training split (training + validation rows).
# NOTE(review): the test split is passed as validation data, so it also drives early
# stopping and checkpoint selection - confirm this is intended for the final model.
history = model_multi_class.fit(
    X_train_full,
    y_train_full,
    epochs=100,
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[
        # AUC must be maximized. The direction is stated explicitly (matching the
        # checkpoint below) instead of relying on the callback's automatic inference.
        EarlyStopping(monitor='val_AUC', patience=10, restore_best_weights=True, mode='max'),
        # Keep the best epoch on disk; this file is loaded later to build the joint model
        ModelCheckpoint('best_model_multi.h5', monitor='val_AUC', save_best_only=True, mode='max'),
    ],
)
model_multi_class.summary()

Epoch 1/100
[1m1744/1750[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - AUC: 0.8694 - loss: 0.1255



[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - AUC: 0.8694 - loss: 0.1255 - val_AUC: 0.8491 - val_loss: 0.1324
Epoch 2/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - AUC: 0.8681 - loss: 0.1259 - val_AUC: 0.8275 - val_loss: 0.1388
Epoch 3/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - AUC: 0.8672 - loss: 0.1264 - val_AUC: 0.8370 - val_loss: 0.1366
Epoch 4/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - AUC: 0.8684 - loss: 0.1263 - val_AUC: 0.8468 - val_loss: 0.1326
Epoch 5/100
[1m1748/1750[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - AUC: 0.8687 - loss: 0.1256



[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - AUC: 0.8687 - loss: 0.1256 - val_AUC: 0.8508 - val_loss: 0.1320
Epoch 6/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - AUC: 0.8723 - loss: 0.1249 - val_AUC: 0.8458 - val_loss: 0.1338
Epoch 7/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - AUC: 0.8723 - loss: 0.1247



[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - AUC: 0.8723 - loss: 0.1247 - val_AUC: 0.8548 - val_loss: 0.1306
Epoch 8/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - AUC: 0.8722 - loss: 0.1246 - val_AUC: 0.8455 - val_loss: 0.1336
Epoch 9/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - AUC: 0.8725 - loss: 0.1243 - val_AUC: 0.8498 - val_loss: 0.1325
Epoch 10/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - AUC: 0.8716 - loss: 0.1249 - val_AUC: 0.8411 - val_loss: 0.1386
Epoch 11/100
[1m1735/1750[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - AUC: 0.8743 - loss: 0.1237



[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - AUC: 0.8743 - loss: 0.1237 - val_AUC: 0.8564 - val_loss: 0.1302
Epoch 12/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - AUC: 0.8752 - loss: 0.1236 - val_AUC: 0.8532 - val_loss: 0.1321
Epoch 13/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - AUC: 0.8762 - loss: 0.1232 - val_AUC: 0.8483 - val_loss: 0.1349
Epoch 14/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - AUC: 0.8755 - loss: 0.1234 - val_AUC: 0.8504 - val_loss: 0.1321
Epoch 15/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - AUC: 0.8758 - loss: 0.1235 - val_AUC: 0.8433 - val_loss: 0.1345
Epoch 16/100
[1m1750/1750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - AUC: 0.8758 - loss: 0.1231 - val_AUC: 0.8455 - val_loss: 0.1338
E

## Merging **Binary** and **Multi-class** models

In [28]:
def create_ensemble_model(sentiment_model_path, emotion_model_path):
    """Combine two saved Keras models into one two-input, two-output model.

    Args:
        sentiment_model_path: path of the saved sentiment model.
        emotion_model_path: path of the saved emotion model.

    Returns:
        A `Model` with inputs named "sentiment_input" and "emotion_input" whose
        outputs are `[sentiment_output, emotion_output]`. Both loaded models are
        marked non-trainable.
    """
    model_paths = (sentiment_model_path, emotion_model_path)
    input_names = ("sentiment_input", "emotion_input")

    # Load both models, then freeze them so they are not trained further
    frozen_models = [load_model(path) for path in model_paths]
    for frozen in frozen_models:
        frozen.trainable = False

    # One fresh input per branch, shaped like that model's own input (batch axis dropped)
    branch_inputs = [
        Input(shape=frozen.input_shape[1:], name=input_name)
        for frozen, input_name in zip(frozen_models, input_names)
    ]

    # Route each input through its model
    branch_outputs = [
        frozen(branch_input)
        for frozen, branch_input in zip(frozen_models, branch_inputs)
    ]

    return Model(inputs=branch_inputs, outputs=branch_outputs)

In [29]:
# Build the joint model from the two checkpoints written by the training cells
joint_model = create_ensemble_model(
    sentiment_model_path='best_model_binary.h5',
    emotion_model_path='best_model_multi.h5',
)



In [30]:
# Print the layer and parameter overview of the joint (sentiment + emotion) model
joint_model.summary()

In [31]:
# Emotion label names (adjust to your actual labels).
# Assumed to be in the same order as the emotion model's output units - TODO confirm
# against the column order of the training labels.
emotion_labels = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity',
                  'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
                  'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
                  'remorse', 'sadness', 'surprise', 'neutral']


def predict_ensemble_model(model, texts, vectorizer, emotion_labels=emotion_labels, max_length=300, neutral_threshold=0.3, emotion_threshold=0.15):
    """Predict sentiment and emotions for the first text in `texts`.

    Args:
        model: joint model whose `predict` accepts a dict with the keys
            'sentiment_input' and 'emotion_input' and returns
            `[sentiment_scores, emotion_scores]`.
        texts: non-empty sequence of strings. Only the first one is reported.
        vectorizer: callable turning `texts` into the model's token-id input.
        emotion_labels: label name for each emotion output, in output order.
        max_length: unused; kept so existing calls stay valid.
        neutral_threshold: if the 'neutral' score is >= this value, the result
            contains only 'neutral'.
        emotion_threshold: non-neutral emotions scoring strictly above this value
            are returned.

    Returns:
        dict with 'sentiment' ('positive' when the sentiment score is > 0.5,
        otherwise 'negative') and 'emotion' (list of label names).

    Raises:
        ValueError: if `texts` is empty.
    """
    if len(texts) == 0:
        raise ValueError("texts must contain at least one string")

    # Tokenize the input texts (named `token_ids` to avoid shadowing the builtin `input`)
    token_ids = vectorizer(texts)

    # The same tokens feed both branches of the joint model
    predictions = model.predict({
        'sentiment_input': token_ids,
        'emotion_input': token_ids
    })

    # Sentiment of the first text, thresholded at 0.5
    sentiment_label = "positive" if predictions[0][0] > 0.5 else "negative"

    # Emotion scores of the first text, keyed by label name
    first_emotion_scores = predictions[1][0]
    emotion_results = {label: first_emotion_scores[i] for i, label in enumerate(emotion_labels)}

    # A sufficiently strong 'neutral' score (>= neutral_threshold) overrides everything else
    if emotion_results.get('neutral', 0) >= neutral_threshold:
        return {
            'sentiment': sentiment_label,
            'emotion': ['neutral']
        }

    # All non-neutral emotions strictly above the threshold, in label order
    selected = [
        label for label, score in emotion_results.items()
        if label != 'neutral' and score > emotion_threshold
    ]

    # Nothing above the threshold: fall back to the single best non-neutral emotion
    # (ties are broken by the larger label name, as with (score, label) tuples)
    if not selected:
        candidates = [label for label in emotion_results if label != 'neutral']
        selected = [max(candidates, key=lambda label: (emotion_results[label], label))]

    return {
        'sentiment': sentiment_label,
        'emotion': selected
    }


In [32]:
# Smoke test of the joint model on a single short sentence.
# NOTE(review): the recorded output labels this sentence 'negative' while the emotions
# are ['admiration', 'joy'] - the sentiment branch looks unreliable on very short
# inputs; worth investigating before relying on it.
predict_ensemble_model(joint_model, ["I am so excited!"], vectorizer)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 374ms/step


{'sentiment': 'negative', 'emotion': ['admiration', 'joy']}

In [33]:
!pip install openai-whisper

Collecting openai-whisper
  Using cached openai-whisper-20240930.tar.gz (800 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->openai-whisper)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->openai-whispe

In [34]:
!unzip Audios.zip -d audios_data

Archive:  Audios.zip
replace audios_data/audios/audio1.mp4? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [35]:
# `whisper` is imported here because it is only installed by the pip cell above
import os
import whisper
import pandas as pd

# Load Whisper model ("base" checkpoint)
model = whisper.load_model("base")

# Folder where the .mp4 files were extracted. Relative, so it matches the
# `unzip ... -d audios_data` target instead of assuming a /content working directory.
folder = os.path.join("audios_data", "audios")

# Transcribe each file; sorted so the row order is the same on every run
# (os.listdir returns entries in arbitrary order)
transcripts = []
for filename in sorted(os.listdir(folder)):
    if not filename.endswith(".mp4"):
        continue
    path = os.path.join(folder, filename)
    print(f"Transcribing: {filename}")
    result = model.transcribe(path, fp16=False)
    transcripts.append({
        "filename": filename,
        "whisper_transcription": result["text"]
    })

# Create DataFrame; explicit columns keep the schema stable even when no .mp4 was found
df_transcripts = pd.DataFrame(transcripts, columns=["filename", "whisper_transcription"])

# Show and save
# NOTE(review): the output path is left absolute in case later cells read it from there
print(df_transcripts)
df_transcripts.to_csv("/content/transcriptions.csv", index=False)


Exception ignored in: <function _xla_gc_callback at 0x7ed1672404a0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/jax/_src/lib/__init__.py", line 96, in _xla_gc_callback
    def _xla_gc_callback(*args):
    
KeyboardInterrupt: 


KeyboardInterrupt: 