In [55]:
import kaggle
import pandas as pd

import zipfile
import os

import nltk
import string
from nltk.tokenize import word_tokenize

import librosa
import numpy as np

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

from accelerate import Accelerator


# Contexual Emotion Detection

## 1. Download Dataset

In [34]:
# # Download dataset
# kaggle.api.dataset_download_files('pashupatigupta/emotion-detection-from-text', path='.', unzip=True)


In [35]:
# # Load dataset
# dataset = pd.read_csv('Data/tweet_emotions.csv')
# print(dataset.head())

In [36]:
# # Download dataset
# kaggle.api.dataset_download_files('ritika0111/emotion-detection-dataset', path='.', unzip=True)


In [11]:
# List files in the extracted directory
print(os.listdir('Data/emotion-detection-dataset'))

['angry', 'disgusting', 'fear', 'happy', 'neutral', 'sad']


## 2. Preprocess Data

Text Data: Loaded from Data/tweet_emotions.csv, tokenized, and normalized.

Audio Data: Loaded from Data/emotion-detection-dataset, features extracted (MFCCs), and labeled.

Optional: Saved the preprocessed data for future use.

#### Text Data

In [12]:
# Load the dataset
text_dataset = pd.read_csv('Data/tweet_emotions.csv')
print(text_dataset.head())

     tweet_id   sentiment                                            content
0  1956967341       empty  @tiffanylue i know  i was listenin to bad habi...
1  1956967666     sadness  Layin n bed with a headache  ughhhh...waitin o...
2  1956967696     sadness                Funeral ceremony...gloomy friday...
3  1956967789  enthusiasm               wants to hang out with friends SOON!
4  1956968416     neutral  @dannycastillo We want to trade with someone w...


In [13]:
labels = text_dataset['sentiment']

print(labels.unique())

['empty' 'sadness' 'enthusiasm' 'neutral' 'worry' 'surprise' 'love' 'fun'
 'hate' 'happiness' 'boredom' 'relief' 'anger']


In [14]:
# Label mapping
label_mapping = {
    'anger': 'angry',
    'disgust': 'disgusting',
    'fear': 'fear',
    'happiness': 'happy',
    'neutral': 'neutral',
    'sadness': 'sad'
}

# Map text dataset labels to audio dataset labels
text_dataset['mapped_sentiment'] = text_dataset['sentiment'].map(label_mapping)

# Filter out rows with labels that do not have a corresponding audio label
filtered_text_dataset = text_dataset.dropna(subset=['mapped_sentiment'])


In [15]:
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\navsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Normalization: Convert to lowercase and remove punctuation
    tokens = [token.lower() for token in tokens if token.isalpha()]
    return tokens

# Apply preprocessing
filtered_text_dataset['tokens'] = filtered_text_dataset['content'].apply(preprocess_text)
print(filtered_text_dataset.head())

     tweet_id sentiment                                            content  \
1  1956967666   sadness  Layin n bed with a headache  ughhhh...waitin o...   
2  1956967696   sadness                Funeral ceremony...gloomy friday...   
4  1956968416   neutral  @dannycastillo We want to trade with someone w...   
6  1956968487   sadness  I should be sleep, but im not! thinking about ...   
8  1956969035   sadness            @charviray Charlene my love. I miss you   

  mapped_sentiment                                             tokens  
1              sad  [layin, n, bed, with, a, headache, ughhhh, wai...  
2              sad                [funeral, ceremony, gloomy, friday]  
4          neutral  [dannycastillo, we, want, to, trade, with, som...  
6              sad  [i, should, be, sleep, but, im, not, thinking,...  
8              sad      [charviray, charlene, my, love, i, miss, you]  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_text_dataset['tokens'] = filtered_text_dataset['content'].apply(preprocess_text)


In [17]:
# Save preprocessed text data
filtered_text_dataset.to_csv('Data/preprocessed_text_data.csv', index=False)


#### Audio Data

In [18]:
def extract_features(audio_path):
    y, sr = librosa.load(audio_path)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    return np.mean(mfccs.T, axis=0)

In [19]:
# Directory containing audio files
audio_dir = 'Data/emotion-detection-dataset'
emotions = list(label_mapping.values())

In [20]:
audio_features = []
audio_labels = []

for emotion in emotions:
    emotion_dir = os.path.join(audio_dir, emotion)
    for file in os.listdir(emotion_dir):
        if file.endswith('.wav'):
            file_path = os.path.join(emotion_dir, file)
            features = extract_features(file_path)
            audio_features.append(features)
            audio_labels.append(emotion)

In [21]:
# Convert to numpy arrays
audio_features = np.array(audio_features)
audio_labels = np.array(audio_labels)
print(audio_features.shape, audio_labels.shape)

(6522, 13) (6522,)


In [22]:
# Save preprocessed audio data
np.save('Data/preprocessed_audio_features.npy', audio_features)
np.save('Data/audio_labels.npy', audio_labels)

## Step 3: Train Emotion Detection Models


#### Text Emotion Detection

In [23]:
# Load preprocessed text data
text_dataset = pd.read_csv('Data/preprocessed_text_data.csv')

# Prepare data for BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
texts = list(text_dataset['content'])
labels = list(text_dataset['mapped_sentiment'])

In [40]:
# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [41]:
# Tokenize texts
inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
inputs['labels'] = torch.tensor(encoded_labels)

In [42]:
# Split data into train and test sets
train_size = 0.8
train_dataset = torch.utils.data.TensorDataset(inputs['input_ids'][:int(train_size*len(texts))], inputs['attention_mask'][:int(train_size*len(texts))], inputs['labels'][:int(train_size*len(texts))])
test_dataset = torch.utils.data.TensorDataset(inputs['input_ids'][int(train_size*len(texts)):], inputs['attention_mask'][int(train_size*len(texts)):], inputs['labels'][int(train_size*len(texts)):])


In [43]:
# Define model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
