In [None]:
import os
gpu_ids = [4]
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids))
import random
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import cv2
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import VideoMAEFeatureExtractor, VideoMAEModel
from sklearn.metrics import f1_score, recall_score, accuracy_score
from tqdm import tqdm

# ---- SETTINGS ----
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Directory containing the clip files, and the CSV that VideoClipDataset reads
# for each clip's `clip_filename` and `sentiment_score`.
clip_dir = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI/Clip/Clips_16frames"
mapping_csv = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI/Clip/clip_sentiment_mapping_final.csv"

# Per-class sample budgets. In the visible code these are referenced only by
# the commented-out balanced-sampling variant of the dataset.
negative_samples = 1500
neutral_samples = 2000
positive_samples = 1500
# Number of clips per DataLoader batch.
batch_size = 16
# Number of frames fed to the feature extractor per clip; VideoClipDataset
# pads shorter clips and truncates longer ones to this length.
clip_len = 16
# Number of passes over the training split.
num_epochs = 20

# ---- DATASET ----
# class VideoClipDataset(Dataset):
#     def __init__(self, clip_dir, csv_path, feature_extractor):
#         self.clip_dir = clip_dir
#         self.df = pd.read_csv(csv_path)
#         self.feature_extractor = feature_extractor

#         # Group by sentiment label
#         grouped = self.df.groupby("sentiment_label")

#         self.samples = []
#         for label, n_samples in zip(["Negative", "Neutral", "Positive"], [negative_samples, neutral_samples, positive_samples]):
#             group = grouped.get_group(label)
#             if label == "Negative":
#                 sorted_group = group.sort_values("sentiment_score")
#             elif label == "Neutral":
#                 sorted_group = group.reindex((group["sentiment_score"] - 0).abs().sort_values().index)
#             else:  # Positive
#                 sorted_group = group.sort_values("sentiment_score", ascending=False)

#             selected = sorted_group.head(n_samples)
#             self.samples.extend(selected.itertuples(index=False))
class VideoClipDataset(Dataset):
    """Map-style dataset pairing video clips with a scalar sentiment score.

    Every row of the mapping CSV is used as one sample. The CSV must provide a
    ``clip_filename`` column (file name relative to ``clip_dir``) and a
    ``sentiment_score`` column.
    """

    def __init__(self, clip_dir, csv_path, feature_extractor):
        """Load the clip/sentiment mapping.

        Args:
            clip_dir: Directory that contains the clip files.
            csv_path: Path of the CSV mapping clips to sentiment scores.
            feature_extractor: Callable invoked as
                ``feature_extractor(images=frames, return_tensors="pt")`` whose
                result exposes a ``"pixel_values"`` entry.
        """
        self.clip_dir = clip_dir
        self.df = pd.read_csv(csv_path)
        self.feature_extractor = feature_extractor

        # Use all samples directly. ``itertuples`` returns a one-shot iterator
        # that supports neither ``len()`` nor indexing, so materialize it once.
        self.samples = list(self.df.itertuples(index=False))

    def __len__(self):
        """Return the number of samples (rows of the mapping CSV)."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Decode one clip and return ``(pixel_values, sentiment_score)``.

        The clip is truncated to ``clip_len`` frames; shorter clips are padded
        by repeating their last frame.

        Raises:
            RuntimeError: If no frame could be decoded from the clip file.
        """
        row = self.samples[idx]
        clip_path = os.path.join(self.clip_dir, row.clip_filename)

        cap = cv2.VideoCapture(clip_path)
        frames = []
        try:
            # Only the first ``clip_len`` frames are used, so stop decoding there.
            while len(frames) < clip_len:
                ret, frame = cap.read()
                if not ret:
                    break
                frames.append(frame[:, :, ::-1])  # BGR to RGB
        finally:
            # Release the capture handle even if decoding raised.
            cap.release()

        if not frames:
            # Without this guard the padding below fails with an opaque
            # IndexError that does not say which file is broken.
            raise RuntimeError(f"Could not read any frames from clip: {clip_path}")

        if len(frames) < clip_len:
            frames += [frames[-1]] * (clip_len - len(frames))

        # squeeze(0) drops the leading axis the extractor adds for a single
        # clip (presumably the batch axis -- confirm against the extractor).
        inputs = self.feature_extractor(images=frames, return_tensors="pt")["pixel_values"].squeeze(0)
        return inputs, torch.tensor(row.sentiment_score, dtype=torch.float32)

# ---- LOSS ----
class CenteredWeightedMSELoss(nn.Module):
    """Weighted MSE against a per-class anchor value instead of the raw target.

    Targets below -0.3 are mapped to the anchor -3.0, targets above 0.3 to
    +3.0, and everything else to 0.0. Errors on the two outer classes are
    weighted 2.0, errors on the middle class 1.0.
    """

    def __init__(self):
        super().__init__()

    def forward(self, preds, targets):
        """Return the mean weighted squared error between preds and the anchors."""
        is_negative = targets < -0.3
        is_positive = targets > 0.3

        # Anchor each target: -3 / +3 for the outer classes, 0 otherwise.
        ideal = torch.where(
            is_negative,
            torch.full_like(targets, -3.0),
            torch.where(
                is_positive,
                torch.full_like(targets, 3.0),
                torch.zeros_like(targets),
            ),
        )

        # Outer classes count double.
        weights = torch.where(
            is_negative | is_positive,
            torch.full_like(targets, 2.0),
            torch.ones_like(targets),
        )

        squared_error = (preds - ideal).pow(2)
        return (weights * squared_error).mean()

# ---- MODEL ----
class SentimentRegressor(nn.Module):
    """Two-layer MLP head that maps a feature vector to one scalar score."""

    def __init__(self, feature_dim):
        """Build the head: Linear(feature_dim, 256) -> ReLU -> Linear(256, 1)."""
        super().__init__()
        hidden = nn.Linear(feature_dim, 256)
        activation = nn.ReLU()
        head = nn.Linear(256, 1)
        # Attribute name and layer order determine the state_dict keys.
        self.regressor = nn.Sequential(hidden, activation, head)

    def forward(self, x):
        """Return one score per row of x, with the size-1 output axis removed."""
        scores = self.regressor(x)
        return scores.squeeze(dim=1)

# ---- TRAINING LOOP ----
def run_epoch(model, loader, optimizer, is_train=True):
    """Run one pass over loader and return (mean loss, predictions, targets).

    Features come from the module-level ``video_mae`` backbone, averaged over
    dim 1 of its ``last_hidden_state``; the loss is the module-level
    ``loss_fn``. When ``is_train`` is true the model is put in train mode and
    the optimizer is stepped after every batch; otherwise the model is put in
    eval mode, gradients are disabled, and ``optimizer`` is not used.

    Returns:
        Tuple of the per-batch loss averaged over ``len(loader)``, a NumPy
        array of all predictions, and a NumPy array of all targets.
    """
    if is_train:
        model.train()
    else:
        model.eval()

    loss_sum = 0
    pred_batches, label_batches = [], []

    for clips, targets in tqdm(loader, leave=False):
        clips = clips.to(device)
        targets = targets.to(device)

        with torch.set_grad_enabled(is_train):
            pooled = video_mae(clips).last_hidden_state.mean(dim=1)
            preds = model(pooled)
            loss = loss_fn(preds, targets)

            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        loss_sum += loss.item()
        pred_batches.append(preds.detach().cpu().numpy())
        label_batches.append(targets.detach().cpu().numpy())

    # Compute the mean first so an empty loader fails the same way as before.
    mean_loss = loss_sum / len(loader)
    return mean_loss, np.concatenate(pred_batches), np.concatenate(label_batches)

def evaluate(preds, labels):
    """Score regression outputs as a 3-class problem.

    Both predictions and ground-truth scores are bucketed into "Negative"
    (< -0.3), "Positive" (> 0.3) or "Neutral" (everything else) before the
    metrics are computed.

    Returns:
        Tuple ``(macro_f1, micro_f1, recall, acc)`` where ``recall`` holds the
        per-class recall in the order Negative, Neutral, Positive.
    """
    class_order = ["Negative", "Neutral", "Positive"]

    def bucket(score):
        if score < -0.3:
            return "Negative"
        if score > 0.3:
            return "Positive"
        return "Neutral"

    y_pred = list(map(bucket, preds))
    y_true = list(map(bucket, labels))

    macro_f1 = f1_score(y_true, y_pred, average="macro")
    micro_f1 = f1_score(y_true, y_pred, average="micro")
    recall = recall_score(y_true, y_pred, average=None, labels=class_order)
    acc = accuracy_score(y_true, y_pred)
    return macro_f1, micro_f1, recall, acc

# ---- MAIN ----
# Pretrained VideoMAE backbone used purely as a frozen feature extractor:
# eval mode plus requires_grad=False so only the regression head is trained.
feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base")
video_mae = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base").to(device)
video_mae.eval()
for param in video_mae.parameters():
    param.requires_grad = False

# 80% / 10% / remainder split into train / validation / test.
full_dataset = VideoClipDataset(clip_dir, mapping_csv, feature_extractor)
train_size = int(0.8 * len(full_dataset))
val_size = int(0.1 * len(full_dataset))
test_size = len(full_dataset) - train_size - val_size
# Seed the split so the same clips land in the same subset on every run;
# otherwise a checkpoint saved by one run cannot be re-evaluated on a test set
# that is guaranteed to be disjoint from its training data.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    full_dataset, [train_size, val_size, test_size], generator=split_generator
)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# feature_dim=768 must match the width of the backbone's last_hidden_state.
regressor = SentimentRegressor(feature_dim=768).to(device)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(regressor.parameters(), lr=2e-4)

# ---- TRAIN ----
# Model selection is done on validation Macro-F1, not on the loss.
best_ckpt_path = "best_regressor_MSE.pth"
best_macro_f1 = -np.inf
best_epoch = None
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch}")
    train_loss, _, _ = run_epoch(regressor, train_loader, optimizer, is_train=True)
    val_loss, val_preds, val_labels = run_epoch(regressor, val_loader, optimizer, is_train=False)

    macro_f1, micro_f1, recall, acc = evaluate(val_preds, val_labels)
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"Macro-F1: {macro_f1:.4f} | Micro-F1: {micro_f1:.4f} | Acc: {acc:.4f} | Recall: {recall}")

    if macro_f1 > best_macro_f1:
        best_macro_f1 = macro_f1
        best_epoch = epoch
        torch.save(regressor.state_dict(), best_ckpt_path)
        print(f"✅ Best model saved at epoch {epoch} with Macro-F1={macro_f1:.4f}")

# ---- EVALUATE TEST ----
# Report test metrics for the checkpoint chosen on validation, not for whatever
# weights the final epoch happened to leave in memory.
if best_epoch is not None:
    regressor.load_state_dict(torch.load(best_ckpt_path, map_location=device))
test_loss, test_preds, test_labels = run_epoch(regressor, test_loader, optimizer, is_train=False)
macro_f1, micro_f1, recall, acc = evaluate(test_preds, test_labels)
print("\n----- TEST RESULTS -----")
print(f"Macro-F1: {macro_f1:.4f} | Micro-F1: {micro_f1:.4f} | Acc: {acc:.4f} | Recall: {recall}")



TypeError: object of type 'map' has no len()