In [None]:
import os
gpu_ids = [4]
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids))
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import cv2
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import VideoMAEFeatureExtractor, VideoMAEModel
from sklearn.metrics import f1_score, recall_score, accuracy_score
from tqdm import tqdm
import random
# ---- SET GLOBAL SEED ----
SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
random.seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


# ---- SETTINGS ----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# clip_dir = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI/Clip/Clips_16frames"
mapping_csv = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI-Seg/Labels/new_sentiment_split_2.csv"
clip_dir = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI-Seg/Clip/Clips_16frames"
# mapping_csv = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI-Seg/Clip/clip_balanced_split.csv"
batch_size = 64
clip_len = 16
num_epochs = 20

# ---- DATASET ----
class VideoClipDataset(Dataset):
    def __init__(self, clip_dir, csv_path, feature_extractor, transform=False, duplicate_flipped=False):
        self.clip_dir = clip_dir
        self.df = pd.read_csv(csv_path)
        self.feature_extractor = feature_extractor
        self.transform = transform
        self.duplicate_flipped = duplicate_flipped

        self.samples = list(self.df.itertuples(index=False))

        if self.duplicate_flipped:
            self.samples = [(s, False) for s in self.samples] + [(s, True) for s in self.samples]
        else:
            self.samples = [(s, False) for s in self.samples]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        row, apply_flip = self.samples[idx]
        clip_path = os.path.join(self.clip_dir, row.clip_filename_y)

        cap = cv2.VideoCapture(clip_path)
        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = frame[:, :, ::-1]  # BGR to RGB
            if self.transform and apply_flip:
                frame = cv2.flip(frame, 1)
            frames.append(frame)
        cap.release()

        if len(frames) < clip_len:
            frames += [frames[-1]] * (clip_len - len(frames))
        frames = frames[:clip_len]

        inputs = self.feature_extractor(images=frames, return_tensors="pt")["pixel_values"].squeeze(0)

        score = row.sentiment_score
        if score < -0.3:
            label = 0
        elif score > 0.3:
            label = 2
        else:
            label = 1

        return inputs, torch.tensor(label, dtype=torch.long)



# ---- MODEL ----
class SentimentClassifier(nn.Module):
    def __init__(self, feature_dim):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(feature_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 3)
        )


    def forward(self, x):
        return self.classifier(x)

# ---- TRAINING UTILS ----
def run_epoch(model, loader, optimizer, is_train=True):
    model.train() if is_train else model.eval()
    total_preds, total_labels = [], []
    total_loss = 0

    for clips, targets in tqdm(loader, leave=False):
        clips, targets = clips.to(device), targets.to(device)

        with torch.set_grad_enabled(is_train):
            features = video_mae(clips).last_hidden_state.mean(dim=1)
            preds = model(features)
            loss = loss_fn(preds, targets)

            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        total_loss += loss.item()
        total_preds.extend(torch.argmax(preds, dim=1).detach().cpu().numpy())
        total_labels.extend(targets.detach().cpu().numpy())

    return total_loss / len(loader), np.array(total_preds), np.array(total_labels)

def evaluate(preds, labels):
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    preds_label = [label_map[p] for p in preds]
    labels_label = [label_map[l] for l in labels]

    macro_f1 = f1_score(labels_label, preds_label, average="macro")
    micro_f1 = f1_score(labels_label, preds_label, average="micro")
    recall = recall_score(labels_label, preds_label, average=None, labels=["Negative", "Neutral", "Positive"])
    acc = accuracy_score(labels_label, preds_label)
    return macro_f1, micro_f1, recall, acc

# ---- FEATURE EXTRACTOR ----
feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")
video_mae = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base").to(device)
video_mae.eval()
video_mae.encoder.layer[-1].requires_grad_(True)
for param in video_mae.parameters():
    param.requires_grad = False

# ---- LOAD DATASET AND SPLIT WITH CUSTOM PROPORTIONS ----
df = pd.read_csv(mapping_csv)

# Define your desired proportions for the training set
# Example: use 50% of each class in the training split
proportion = {
    "Negative": 1,
    "Neutral": 0.8,
    "Positive": 1,
}

# Filter only training data
train_df = df[df["split"] == "train"]

# Apply proportional sampling for each class
train_samples = []
for label, frac in proportion.items():
    label_df = train_df[train_df["sentiment_label"] == label]
    sampled_df = label_df.sample(frac=frac, random_state=42)
    train_samples.append(sampled_df)

train_df_balanced = pd.concat(train_samples)
train_indices = train_df_balanced.index.tolist()
print(f"📦 Original training dataset size (before flipping): {len(train_indices)} samples")

# Validation and test sets remain unchanged
val_indices = df[df["split"] == "val"].index.tolist()
test_indices = df[df["split"] == "test"].index.tolist()

# Reload dataset (to avoid using the whole dataset)
full_dataset = VideoClipDataset(clip_dir, mapping_csv, feature_extractor)
# Reload dataset with optional transform for training
full_dataset_train = VideoClipDataset(
    clip_dir, mapping_csv, feature_extractor,
    transform=True, duplicate_flipped=True
)
full_dataset_val   = VideoClipDataset(clip_dir, mapping_csv, feature_extractor, transform=False)
full_dataset_test  = VideoClipDataset(clip_dir, mapping_csv, feature_extractor, transform=False)

# Extend train_indices to also include the flipped versions (shifted by len of original dataset)
doubled_train_indices = train_indices + [i + len(full_dataset.samples) // 2 for i in train_indices]
train_dataset = Subset(full_dataset_train, doubled_train_indices)


# train_dataset = Subset(full_dataset_train, train_indices)
val_dataset   = Subset(full_dataset_val, val_indices)
test_dataset  = Subset(full_dataset_test, test_indices)


train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
print(f"🔍 Training dataset size (with flipping): {len(train_loader.dataset)} samples")


# ---- TRAINING SETUP ----
model = SentimentClassifier(feature_dim=768).to(device)
class_weights = torch.tensor([1.0, 0.8, 1.3], dtype=torch.float).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
# loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-2)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3, verbose=True)


# ---- TRAIN LOOP ----
best_macro_f1 = -np.inf
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch}")
    train_loss, _, _ = run_epoch(model, train_loader, optimizer, is_train=True)
    val_loss, val_preds, val_labels = run_epoch(model, val_loader, optimizer, is_train=False)

    macro_f1, micro_f1, recall, acc = evaluate(val_preds, val_labels)
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Macro-F1: {macro_f1:.4f} | Micro-F1: {micro_f1:.4f} | Acc: {acc:.4f} | Recall: {recall}")
    # scheduler.step(macro_f1)


    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1
        torch.save(model.state_dict(), "best_classifier_CE_final.pth")
        print(f"✅ Best model saved at epoch {epoch} with Macro-F1={macro_f1:.4f}")

# ---- TEST EVALUATION ----
test_loss, test_preds, test_labels = run_epoch(model, test_loader, optimizer, is_train=False)
macro_f1, micro_f1, recall, acc = evaluate(test_preds, test_labels)
print("\n----- TEST RESULTS -----")
print(f"Macro-F1: {macro_f1:.4f} | Micro-F1: {micro_f1:.4f} | Acc: {acc:.4f} | Recall: {recall}")



Epoch 1
Train Loss: 0.9663 | Val Loss: 0.9873
Macro-F1: 0.2786 | Micro-F1: 0.6168 | Acc: 0.6168 | Recall: [0.         0.04761905 0.91549296]
✅New best model saved.

Epoch 2
Train Loss: 0.9276 | Val Loss: 0.9256
Macro-F1: 0.2659 | Micro-F1: 0.6636 | Acc: 0.6636 | Recall: [0. 0. 1.]

Epoch 3
Train Loss: 0.9283 | Val Loss: 0.9317
Macro-F1: 0.2659 | Micro-F1: 0.6636 | Acc: 0.6636 | Recall: [0. 0. 1.]

Epoch 4
Train Loss: 0.8941 | Val Loss: 0.9305
Macro-F1: 0.2614 | Micro-F1: 0.6449 | Acc: 0.6449 | Recall: [0.         0.         0.97183099]

Epoch 5
Train Loss: 0.8939 | Val Loss: 0.9582
Macro-F1: 0.2971 | Micro-F1: 0.5794 | Acc: 0.5794 | Recall: [0.         0.14285714 0.83098592]
✅New best model saved.

Epoch 6
Train Loss: 0.9051 | Val Loss: 0.9612
Macro-F1: 0.3074 | Micro-F1: 0.5140 | Acc: 0.5140 | Recall: [0.         0.33333333 0.67605634]
✅New best model saved.

Epoch 7
Train Loss: 0.8737 | Val Loss: 0.9493
Macro-F1: 0.3123 | Micro-F1: 0.5607 | Acc: 0.5607 | Recall: [0.         0.238095

In [None]:
# # ---- SAVE TEST RESULTS ----
# label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
# pred_labels = [label_map[p] for p in test_preds]

# # Get filenames from the original DataFrame using test indices
# df_test = df.iloc[test_indices].reset_index(drop=True)
# filenames = df_test["clip_filename"].tolist()

# # Create DataFrame and save to CSV
# results_df = pd.DataFrame({
#     "filename": filenames,
#     "sentiment_class": pred_labels
# })
# results_df.to_csv("test_predictions.csv", index=False)
# print("📁 Test predictions saved to 'test_predictions.csv'")


In [None]:
# import pandas as pd
# import re
# from collections import Counter

# # Load CSV
# df = pd.read_csv("test_predictions.csv")

# # Optional: If sentiment_class is still in string form, convert to numeric
# label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
# if df["sentiment_class"].dtype == object:
#     df["sentiment_class"] = df["sentiment_class"].map(label_map)

# # Strip clip suffix: anything like "_<number>_clip<number>.mp4"
# def extract_base_filename(fname):
#     return re.sub(r"_\d+_clip\d+\.mp4$", "", fname)

# df["base_filename"] = df["filename"].apply(extract_base_filename)

# # Majority vote
# def majority_vote(group):
#     vote = Counter(group["sentiment_class"]).most_common(1)[0][0]
#     return pd.Series({"sentiment_class": vote})

# aggregated_df = df.groupby("base_filename").apply(majority_vote).reset_index()
# aggregated_df.columns = ["filename", "sentiment_class"]

# # Save result
# aggregated_df.to_csv("test_predictions_aggregated.csv", index=False)
# print("✅ Aggregated predictions saved to 'test_predictions_aggregated.csv'")


In [2]:
import os
gpu_ids = [4]
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids))
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import cv2
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import VideoMAEFeatureExtractor, VideoMAEModel
from sklearn.metrics import f1_score, recall_score, accuracy_score
from tqdm import tqdm

# ---- SET GLOBAL SEED ----
SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# ---- SETTINGS ----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mapping_csv = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI-Seg/Labels/new_sentiment_split_2.csv"
clip_dir = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI-Seg/Clip/Clips_16frames"
batch_size = 64
clip_len = 16
checkpoint_path = "/data/home/huixian/Documents/Homeworks/535_project/mosei_code/best_classifier_CE_final_0.5796.pth"

# ---- DATASET ----
class VideoClipDataset(Dataset):
    def __init__(self, clip_dir, csv_path, feature_extractor):
        self.clip_dir = clip_dir
        self.df = pd.read_csv(csv_path)
        self.feature_extractor = feature_extractor
        self.samples = self.df[self.df["split"] == "test"].reset_index(drop=True)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        row = self.samples.iloc[idx]
        clip_path = os.path.join(self.clip_dir, row["clip_filename_y"])

        cap = cv2.VideoCapture(clip_path)
        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = frame[:, :, ::-1]  # BGR to RGB
            frames.append(frame)
        cap.release()

        if len(frames) < clip_len:
            frames += [frames[-1]] * (clip_len - len(frames))
        frames = frames[:clip_len]

        inputs = self.feature_extractor(images=frames, return_tensors="pt")["pixel_values"].squeeze(0)
        return inputs, row["video_id"], row["clip_filename_y"]

# ---- MODEL ----
class SentimentClassifier(nn.Module):
    def __init__(self, feature_dim):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(feature_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 3)
        )

    def forward(self, x):
        return self.classifier(x)

# ---- EVALUATION UTILS ----
def run_test(model, loader):
    model.eval()
    preds, video_ids, filenames = [], [], []

    with torch.no_grad():
        for clips, vids, fnames in tqdm(loader):
            clips = clips.to(device)
            features = video_mae(clips).last_hidden_state.mean(dim=1)
            logits = model(features)
            predictions = torch.argmax(logits, dim=1).cpu().numpy()
            preds.extend(predictions)
            video_ids.extend(vids)
            filenames.extend(fnames)

    return preds, video_ids, filenames

# ---- LOAD FEATURE EXTRACTOR AND MODEL ----
feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")
video_mae = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base").to(device)
video_mae.eval()

model = SentimentClassifier(feature_dim=768).to(device)
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.eval()

# ---- LOAD DATASET ----
test_dataset = VideoClipDataset(clip_dir, mapping_csv, feature_extractor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# ---- RUN INFERENCE ----
preds, video_ids, filenames = run_test(model, test_loader)

from collections import Counter

# ---- CONVERT TO LABELS ----
int_to_label = {0: "Negative", 1: "Neutral", 2: "Positive"}
label_preds = [int_to_label[p] for p in preds]

# ---- SAVE CLIP-LEVEL PREDICTIONS ----
clip_df = pd.DataFrame({
    "clip_filename": filenames,
    "video_id": video_ids,
    "predicted_label": label_preds
})
clip_df.to_csv("test_predictions_with_video_id.csv", index=False)
print("✅ Saved clip-level predictions to: test_predictions_with_video_id.csv")

# ---- MAJORITY VOTING PER VIDEO_ID ----
majority_vote = (
    clip_df.groupby("video_id")["predicted_label"]
    .apply(lambda x: Counter(x).most_common(1)[0][0])
    .reset_index()
    .rename(columns={"predicted_label": "majority_label"})
)

majority_vote.to_csv("video_level_majority_predictions.csv", index=False)
print("✅ Saved video-level majority vote to: video_level_majority_predictions.csv")



  model.load_state_dict(torch.load(checkpoint_path, map_location=device))
100%|██████████| 9/9 [00:29<00:00,  3.31s/it]

✅ Saved clip-level predictions to: test_predictions_with_video_id.csv
✅ Saved video-level majority vote to: video_level_majority_predictions.csv





In [3]:
import pandas as pd

# Path to the CSV file
csv_path = "/data/home/huixian/Documents/Homeworks/535_project/late_fusion/audio_RNN_preds.csv"

# Load the CSV
df = pd.read_csv(csv_path)

# Count unique video_id values
unique_video_ids = df["video_id"].nunique()

print(f"🎥 Number of unique video_id entries: {unique_video_ids}")


🎥 Number of unique video_id entries: 114
