In [1]:

import os
import pandas as pd
import numpy as np
import cv2
import librosa
import subprocess
from transformers import BertTokenizer, BertModel
import torch
import nltk


In [2]:
print("Loading dataset...")
train_df = pd.read_csv(f'/kaggle/input/ml-hackathon-ec-campus-set-2/set_2_train/train_emotion.csv', encoding='ISO-8859-1')
print("Dataset loaded.")


Loading dataset...
Dataset loaded.


In [3]:

video_dir = f'/kaggle/input/ml-hackathon-ec-campus-set-2/set_2_train/train_data'


In [4]:

def get_video_clip_path(row):
    dialogue_id = row['Dialogue_ID']
    utterance_id = row['Utterance_ID']
    filename = f"dia{dialogue_id}_utt{utterance_id}.mp4"
    return os.path.join(video_dir, filename)


In [5]:

train_df['video_clip_path'] = train_df.apply(get_video_clip_path, axis=1)


In [6]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
Bmodel = BertModel.from_pretrained('bert-base-uncased')


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [7]:

def extract_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = Bmodel(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :].numpy().flatten()
    return cls_embedding


In [8]:

print("Extracting text features using BERT...")
train_df['text_features'] = train_df['Utterance'].apply(extract_bert_embeddings)
print("Text features extracted.")


Extracting text features using BERT...
Text features extracted.


In [9]:

def extract_audio_from_video(video_path, audio_path="/tmp/temp_audio.wav"):
    command = [
        "ffmpeg", 
        "-i", video_path,
        "-vn",
        "-ac", "1",
        "-ar", "16000",
        "-acodec", "pcm_s16le",
        audio_path
    ]
    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)


In [10]:
def extract_audio_features(video_path):
    audio_path = "/tmp/temp_audio.wav"
    extract_audio_from_video(video_path, audio_path)
    y, sr = librosa.load(audio_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    return np.mean(mfccs, axis=1)


In [11]:

print("Extracting audio features...")
train_df['audio_features'] = train_df['video_clip_path'].apply(extract_audio_features)
print("Audio features extracted.")


Extracting audio features...
Audio features extracted.


In [12]:

print("Combining features...")
combined_features = np.concatenate(( 
    np.array(train_df['text_features'].tolist()),
    np.array(train_df['audio_features'].tolist())
), axis=1)

X = combined_features
y = train_df['Emotion']


Combining features...


In [13]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier  # Example model

# Example data
# y = ['anger', 'joy', 'neutral', 'sadness', 'surprise']  # Your target labels

# Step 1: Label encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # y is your target variable with string labels

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Step 2: Train the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 3: Make predictions
y_pred = rf_model.predict(X_test)

# Step 4: Convert the predicted labels back to original class names
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Step 5: Print the results
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# If you want to print confusion matrix or other metrics, you can proceed similarly.


Accuracy: 0.6
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.08      0.14        25
           1       0.35      0.18      0.24        34
           2       0.62      0.96      0.75       108
           3       0.00      0.00      0.00        13
           4       0.67      0.40      0.50        20

    accuracy                           0.60       200
   macro avg       0.46      0.32      0.33       200
weighted avg       0.54      0.60      0.51       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score

xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.625
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.20      0.30        25
           1       0.48      0.32      0.39        34
           2       0.69      0.94      0.79       108
           3       1.00      0.08      0.14        13
           4       0.33      0.35      0.34        20

    accuracy                           0.62       200
   macro avg       0.62      0.38      0.39       200
weighted avg       0.63      0.62      0.57       200



In [15]:

test_df = pd.read_csv('/kaggle/input/ml-hackathon-ec-campus-set-2/set_2_test/test_emotion.csv', encoding='ISO-8859-1')
test_video_dir = '/kaggle/input/ml-hackathon-ec-campus-set-2/set_2_test/test_data'


In [16]:

test_df['video_clip_path'] = test_df.apply(get_video_clip_path, axis=1)


In [17]:

test_df['audio_features'] = test_df['video_clip_path'].apply(extract_audio_features)
test_df['text_features'] = test_df['Utterance'].apply(extract_bert_embeddings)


In [18]:
print("Shape of individual feature arrays:")
print(f"Audio Features Shape (first row): {test_df['audio_features'].iloc[0].shape}")
print(f"Text Features Shape (first row): {test_df['text_features'].iloc[0].shape}")

Shape of individual feature arrays:
Audio Features Shape (first row): (13,)
Text Features Shape (first row): (768,)


In [19]:

combined_features = np.concatenate(( 
    np.array(test_df['text_features'].tolist()),
    np.array(test_df['audio_features'].tolist())
), axis=1)

print(f"Shape of combined features (first row): {combined_features[0].shape}")
print(f"Shape of combined features (test set): {combined_features.shape}")


Shape of combined features (first row): (781,)
Shape of combined features (test set): (100, 781)


In [20]:

predictions = xgb_model.predict(combined_features)


In [21]:
# Create the submission DataFrame
submission_df = pd.DataFrame({
    'Sr No.': test_df['Sr No.'],
    'Emotion':  label_encoder.inverse_transform(predictions)

})


# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Submission file created.")


Submission file created.
