In [17]:
# Import required libraries
import pandas as pd
import numpy as np
import os
import cv2
import librosa
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
from moviepy.editor import VideoFileClip
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Load training and test data
train_df = pd.read_csv(r'Set_3\\train\\text.csv', encoding='ISO-8859-1')
test_df = pd.read_csv(r'Set_3\\test\\text.csv', encoding='ISO-8859-1')

# Define video directories
train_video_dir = r'Set_3\\train\\videos'
test_video_dir = r'Set_3\\test\\videos'

def get_video_clip_path(row, video_dir):
    dialogue_id = row['Dialogue_ID']
    utterance_id = row['Utterance_ID']
    filename = f"dia{dialogue_id}_utt{utterance_id}.mp4"
    return os.path.join(video_dir, filename)

# Add video paths to dataframes
train_df['video_clip_path'] = train_df.apply(lambda x: get_video_clip_path(x, train_video_dir), axis=1)
test_df['video_clip_path'] = test_df.apply(lambda x: get_video_clip_path(x, test_video_dir), axis=1)

# Verify data loading
print("Training data shape:", train_df.shape)
print("Test data shape:", test_df.shape)
print("\nSample training data:")
print(train_df[['Dialogue_ID', 'Utterance_ID', 'Sentiment', 'video_clip_path']].head())
print("\nSentiment distribution in training data:")
print(train_df['Sentiment'].value_counts())

Training data shape: (1000, 11)
Test data shape: (100, 10)

Sample training data:
   Dialogue_ID  Utterance_ID Sentiment                     video_clip_path
0            0             3   neutral  Set_3\\train\\videos\dia0_utt3.mp4
1            1             3  negative  Set_3\\train\\videos\dia1_utt3.mp4
2            3             3   neutral  Set_3\\train\\videos\dia3_utt3.mp4
3            4             6   neutral  Set_3\\train\\videos\dia4_utt6.mp4
4            6             3   neutral  Set_3\\train\\videos\dia6_utt3.mp4

Sentiment distribution in training data:
Sentiment
neutral     473
negative    294
positive    233
Name: count, dtype: int64


In [18]:
import contextlib

class MultimodalFeatureExtractor:
    def __init__(self):
        # Initialize BERT tokenizer and model for text features
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.text_model = AutoModel.from_pretrained('bert-base-uncased')
        self.label_encoder = LabelEncoder()
        
    def extract_video_features(self, video_path, num_frames=20):
        try:
            cap = cv2.VideoCapture(video_path)
            frames = []
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            
            frame_indices = np.linspace(0, frame_count-1, num_frames, dtype=int)
            
            for idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if ret:
                    frame = cv2.resize(frame, (224, 224))
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                    frames.append(frame)
            
            cap.release()
            
            if frames:
                frames = np.stack(frames)
                features = {
                    'video_mean': frames.mean(),
                    'video_std': frames.std(),
                    'video_max': frames.max(),
                    'video_min': frames.min(),
                    'video_median': np.median(frames)
                }
                return features
            
        except Exception as e:
            print(f"Error processing video {video_path}: {str(e)}")
            return {
                'video_mean': 0,
                'video_std': 0,
                'video_max': 0,
                'video_min': 0,
                'video_median': 0
            }

    def extract_audio_features(self, video_path):
        try:
            with contextlib.redirect_stdout(open(os.devnull, 'w')):
                video = VideoFileClip(video_path)
                audio = video.audio
                
                if audio is not None:
                    temp_audio_path = "temp_audio.wav"
                    audio.write_audiofile(temp_audio_path, verbose=False, logger=None)
                    
                    y, sr = librosa.load(temp_audio_path)
                    
                    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
                    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
                    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
                    
                    features = {
                        'mfcc_mean': mfcc.mean(),
                        'mfcc_std': mfcc.std(),
                        'spectral_centroid_mean': spectral_centroid.mean(),
                        'zero_crossing_rate_mean': zero_crossing_rate.mean()
                    }
                    
                    os.remove(temp_audio_path)
                    return features
                
        except Exception as e:
            print(f"Error processing audio {video_path}: {str(e)}")
            return {
                'mfcc_mean': 0,
                'mfcc_std': 0,
                'spectral_centroid_mean': 0,
                'zero_crossing_rate_mean': 0
            }

    def extract_text_features(self, text):
        try:
            inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = self.text_model(**inputs)
            
            sentence_embedding = outputs.last_hidden_state[:, 0, :].numpy()
            
            features = {
                'text_embedding_mean': sentence_embedding.mean(),
                'text_embedding_std': sentence_embedding.std(),
                'text_length': len(str(text).split())
            }
            
            return features
            
        except Exception as e:
            print(f"Error processing text: {str(e)}")
            return {
                'text_embedding_mean': 0,
                'text_embedding_std': 0,
                'text_length': 0
            }

    def extract_features(self, df, is_training=True):
        all_features = []
        
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting features"):
            features = {}
            
            # Extract all modality features
            video_features = self.extract_video_features(row['video_clip_path'])
            audio_features = self.extract_audio_features(row['video_clip_path'])
            text_features = self.extract_text_features(row['Utterance'])
            
            features.update(video_features)
            features.update(audio_features)
            features.update(text_features)
            
            # Add metadata features
            features['speaker'] = hash(row['Speaker']) % 2**32
            features['Sr No.'] = row['Sr No.']
            if is_training:
                features['Sentiment'] = row['Sentiment']
            
            all_features.append(features)
        
        features_df = pd.DataFrame(all_features)
        return features_df

In [19]:
# Extract features for training and test sets
extractor = MultimodalFeatureExtractor()

print("Extracting training features...")
train_features = extractor.extract_features(train_df, is_training=True)
print("\nTraining features shape:", train_features.shape)

print("\nExtracting test features...")
test_features = extractor.extract_features(test_df, is_training=False)
print("\nTest features shape:", test_features.shape)

# Save features to avoid recomputing
train_features.to_csv('train_features.csv', index=False)
test_features.to_csv('test_features.csv', index=False)

Extracting training features...


Extracting features: 100%|██████████| 1000/1000 [21:06<00:00,  1.27s/it]



Training features shape: (1000, 15)

Extracting test features...


Extracting features: 100%|██████████| 100/100 [02:08<00:00,  1.29s/it]


Test features shape: (100, 14)





In [22]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
import xgboost as xgb
import pandas as pd

# Load features if saved previously
train_features = pd.read_csv('train_features.csv')
test_features = pd.read_csv('test_features.csv')

# Prepare the data
X = train_features.drop(['Sentiment'], axis=1)
y = train_features['Sentiment']

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(test_features)

# Define parameter grid for Grid Search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Initialize XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='mlogloss')

# Perform Grid Search
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Train the model with the best parameters
xgb_model = xgb.XGBClassifier(**best_params, random_state=42, eval_metric='mlogloss')
xgb_model.fit(X_train_scaled, y_train)

# Evaluate on validation set
val_predictions = xgb_model.predict(X_val_scaled)
print("\nValidation Set Performance:")
print(classification_report(y_val, val_predictions, target_names=label_encoder.classes_))

# Make predictions on test set
test_predictions = xgb_model.predict(X_test_scaled)
test_predictions_labels = label_encoder.inverse_transform(test_predictions)  # Convert back to original labels

# Evaluate on test set
print("\nTest Set Performance:")
#print(classification_report(test_df['Sentiment'], test_predictions_labels))

# Create submission file
submission_df = pd.DataFrame({
    'Sr No.': test_features["Sr No."],
    'Sentiment': test_predictions_labels
})

# Save the submission
submission_df.to_csv("submission.csv", index=False)
print("\nSubmission file created successfully!")

# Print sample predictions
print("\nSample predictions:")
print(submission_df.head())

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters found:  {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 300, 'subsample': 1.0}

Validation Set Performance:
              precision    recall  f1-score   support

    negative       0.47      0.36      0.41        61
     neutral       0.57      0.86      0.69        97
    positive       0.50      0.10      0.16        42

    accuracy                           0.55       200
   macro avg       0.51      0.44      0.42       200
weighted avg       0.53      0.55      0.49       200


Test Set Performance:

Submission file created successfully!

Sample predictions:
   Sr No. Sentiment
0      62   neutral
1      72   neutral
2     112   neutral
3     120  negative
4     136  negative
