In [1]:
import pickle

# Path to the pooled features pickle file
pickle_path = 'pooled_video_features_by_video_id.pkl'  # Change path if needed

# Load the mapping from video ID to pooled feature vector
with open(pickle_path, 'rb') as f:
    video_feature_mapping = pickle.load(f)

# Example: Accessing the data
print(f"Loaded {len(video_feature_mapping)} video feature entries.\n")

# Iterate and show the shape of the first few mappings
for i, (video_id, feature_vector) in enumerate(video_feature_mapping.items()):
    print(f"Video ID: {video_id}")
    print(f"Feature vector shape: {feature_vector.shape}")
    print(f"First 10 feature values: {feature_vector[:10]}\n")
    
    if i >= 4:  # Only print first 5 entries
        break


Loaded 534 video feature entries.

Video ID: 1tuMFJnlXJc
Feature vector shape: (1068,)
First 10 feature values: [-0.06040457 -0.21081842 -0.22285981 -0.39252388 -0.18916583 -0.37512335
 -0.5231926   0.21505381  0.64515495  0.27315184]

Video ID: CZigX1ntOsI
Feature vector shape: (1068,)
First 10 feature values: [-0.06622387 -0.30856392 -0.2354024  -0.39381802 -0.14282663 -0.35739407
 -0.5427727   0.16601638  0.5981021   0.33628914]

Video ID: G4HMKRdIva0
Feature vector shape: (1068,)
First 10 feature values: [-0.06494846 -0.323554   -0.19728257 -0.37274173 -0.16677982 -0.2949614
 -0.5334089   0.13208969  0.5861574   0.24253434]

Video ID: e7_2U4lm6TE
Feature vector shape: (1068,)
First 10 feature values: [-0.05840817 -0.2742182  -0.19230546 -0.38841173 -0.13384297 -0.36795658
 -0.5203735   0.16043814  0.5822343   0.27939275]

Video ID: MOfEl_1dh-g
Feature vector shape: (1068,)
First 10 feature values: [-0.07361881 -0.30221224 -0.13909729 -0.33570156 -0.18234344 -0.3431642
 -0.4720386  

In [2]:
import torch

# Load the audio feature mapping from the .pt file
audio_dict = torch.load('audio_features_1068.pt')  # Update with your actual path

print(f"Loaded {len(audio_dict)} audio entries.\n")

for i, (video_key, content) in enumerate(audio_dict.items()):
    embedding = content['embedding']  # shape: e.g., torch.Size([512])
    label = content['label']          # shape: torch.Size([]) or int

    print(f"Audio Video Key: {video_key}")
    print(f"Embedding shape: {embedding.shape}")
    print(f"Label: {label.item() if isinstance(label, torch.Tensor) else label}")
    print(f"First 10 embedding values: {embedding[:10].tolist()}\n")

    if i >= 4:
        break


Loaded 755 audio entries.

Audio Video Key: AmGocfFQfVE
Embedding shape: torch.Size([1068])
Label: Negative
First 10 embedding values: [-0.07152270525693893, -0.04461353272199631, -0.24086812138557434, -0.4560454785823822, 0.35109537839889526, -0.3934575319290161, -0.1327846497297287, 0.3785174489021301, 0.2534216344356537, 0.761166512966156]

Audio Video Key: gR4gM_WXesQ
Embedding shape: torch.Size([1068])
Label: Positive
First 10 embedding values: [-0.004481568932533264, -0.2926996946334839, -0.5332130789756775, -0.16918739676475525, 0.6564330458641052, 0.0020235180854797363, -0.36786654591560364, 0.685794472694397, -0.035538747906684875, 0.8063980340957642]

Audio Video Key: kZfcQ4a0kx4
Embedding shape: torch.Size([1068])
Label: Positive
First 10 embedding values: [0.11859123408794403, -0.2315576821565628, -0.28997802734375, -0.4340461194515228, 0.7970811128616333, -0.38639193773269653, -0.04784702509641647, 0.38446053862571716, 0.22380605340003967, 0.80819171667099]

Audio Video Ke

  audio_dict = torch.load('audio_features_1068.pt')  # Update with your actual path


In [3]:
import pickle
import torch
import numpy as np

# Load video features (np.array)
with open('pooled_video_features_by_video_id.pkl', 'rb') as f:
    video_dict = pickle.load(f)  # dict[video_id] = np.array(1068,)

# Load audio features (torch.Tensor)
audio_dict = torch.load('audio_features_1068.pt')  # dict[video_id]['embedding'] = Tensor

# Load text features (.npy as dict)
text_dict = np.load('video_segment_word_embeddings.npy', allow_pickle=True).item()  # dict[video_id] = np.array(768,)

# Find common video IDs
common_ids = set(video_dict.keys()) & set(audio_dict.keys()) & set(text_dict.keys())
print(f"Found {len(common_ids)} common video IDs across video, audio, and text modalities.\n")

# Example: Check one aligned set
for vid in sorted(common_ids):
    video_feat = video_dict[vid]                                      # shape: (1068,)
    audio_feat = audio_dict[vid]['embedding'].detach().numpy()       # shape: (512,)
    text_feat = np.array(text_dict[vid])                              # convert list to array

    print(f"Video ID: {vid}")
    print(f"  Video feat shape: {video_feat.shape}")
    print(f"  Audio feat shape: {audio_feat.shape}")
    print(f"  Text  feat shape: {text_feat.shape}")
    break


Found 534 common video IDs across video, audio, and text modalities.

Video ID: -3g5yACwYnA
  Video feat shape: (1068,)
  Audio feat shape: (1068,)
  Text  feat shape: (1, 1068)


  audio_dict = torch.load('audio_features_1068.pt')  # dict[video_id]['embedding'] = Tensor


In [5]:
import pandas as pd

# Load the CSV file
csv_path = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI-Seg/Labels/new_sentiment_split_2.csv"  # Replace with your actual CSV file path
df = pd.read_csv(csv_path)

# Count unique video IDs
unique_video_ids = df['video_id'].nunique()
print(f"Number of unique video IDs: {unique_video_ids}")


Number of unique video IDs: 755


In [2]:
import torch
import pickle
import numpy as np
import pandas as pd

# Load input files
with open("pooled_video_features_by_video_id.pkl", "rb") as f:
    video_dict = pickle.load(f)

audio_dict = torch.load("audio_features_1068.pt")
text_dict = np.load("/data/home/huixian/Documents/Homeworks/535_project/mosei_code/video_segment_word_embeddings.npy", allow_pickle=True).item()

# Load split mapping from CSV
csv_path = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI-Seg/Labels/new_sentiment_split_2.csv"
df = pd.read_csv(csv_path)
split_map = df.drop_duplicates(subset='video_id')[['video_id', 'split']].set_index('video_id')['split'].to_dict()

# Build fused features [(video_id, torch.tensor([video, audio, text]), label, split)]
fused_data = []
for vid in set(video_dict) & set(audio_dict) & set(text_dict) & set(split_map):
    try:
        video_feat = np.array(video_dict[vid])
        audio_feat = audio_dict[vid]['embedding'].detach().numpy()
        text_feat = np.array(text_dict[vid]).squeeze()

        fused_tensor = torch.tensor(np.stack([video_feat, audio_feat, text_feat], axis=0), dtype=torch.float32)

        label = audio_dict[vid]['label']
        if isinstance(label, torch.Tensor):
            label = label.item()

        split = split_map[vid]
        fused_data.append((vid, fused_tensor, label, split))
    except Exception as e:
        print(f"⚠️ Skipping {vid} due to error: {e}")

# Summary
print(f"✅ Total samples fused: {len(fused_data)}")
for s in ['train', 'val', 'test']:
    count = sum(1 for item in fused_data if item[3] == s)
    print(f"{s}: {count}")

# Save to pickle for reuse
with open("fused_tensor_data.pkl", "wb") as f:
    pickle.dump(fused_data, f)


✅ Total samples fused: 755
train: 534
val: 107
test: 114


  audio_dict = torch.load("audio_features_1068.pt")


In [13]:
import os
gpu_ids = [4]
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_ids))
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pickle
from sklearn.metrics import f1_score, accuracy_score, recall_score
import numpy as np
import random

# -------------------------------
# Set Seed for Reproducibility
# -------------------------------

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# -------------------------------
# Label mapping
# -------------------------------

label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}

# -------------------------------
# Dataset
# -------------------------------

class FusedDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, idx):
        _, feature, label, _ = self.data[idx]

        if isinstance(label, str):
            label = label_map[label]
        elif isinstance(label, float):
            label = 0 if label < 0 else 2 if label > 0 else 1

        return feature, label

    def __len__(self):
        return len(self.data)

def prepare_dataloaders(pickle_path, batch_size=32):
    with open(pickle_path, 'rb') as f:
        data = pickle.load(f)

    splits = {'train': [], 'val': [], 'test': []}
    for item in data:
        splits[item[3]].append(item)

    train_loader = DataLoader(FusedDataset(splits['train']), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(FusedDataset(splits['val']), batch_size=batch_size)
    test_loader = DataLoader(FusedDataset(splits['test']), batch_size=batch_size)
    return train_loader, val_loader, test_loader

# -------------------------------
# Model
# -------------------------------

class EarlyFusionTransformer(nn.Module):
    def __init__(self, input_dim=1068, num_classes=3, num_heads=4, hidden_dim=512):
        super().__init__()
        self.pos_embed = nn.Parameter(torch.zeros(1, 3, input_dim))
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=0.1, batch_first=True),
            num_layers=2
        )
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        x = x + self.pos_embed
        x = self.encoder(x)
        x = x.mean(dim=1)
        return self.classifier(x)

# -------------------------------
# Evaluation
# -------------------------------

def evaluate(model, loader, device, criterion):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0
    with torch.no_grad():
        for feats, labels in loader:
            feats = feats.to(device).view(feats.size(0), 3, 1068)
            labels = labels.clone().detach().to(device)

            logits = model(feats)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            all_preds.append(preds.cpu())
            all_labels.append(labels.cpu())

    y_true = torch.cat(all_labels).numpy()
    y_pred = torch.cat(all_preds).numpy()

    metrics = {
        'loss': total_loss / len(loader),
        'micro_f1': f1_score(y_true, y_pred, average='micro'),
        'macro_f1': f1_score(y_true, y_pred, average='macro'),
        'accuracy': accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred, average=None)
    }
    return metrics

# -------------------------------
# Training Loop
# -------------------------------

def train_model():
    set_seed(42)
    train_loader, val_loader, test_loader = prepare_dataloaders('fused_tensor_data.pkl')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = EarlyFusionTransformer().to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    best_val_macro_f1 = 0.0
    best_model_state = None

    for epoch in range(1, 101):
        model.train()
        total_loss = 0.0
        all_preds, all_labels = [], []

        for feats, labels in train_loader:
            feats = feats.to(device).view(feats.size(0), 3, 1068)
            labels = labels.clone().detach().to(device)

            optimizer.zero_grad()
            logits = model(feats)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            all_preds.append(preds.cpu())
            all_labels.append(labels.cpu())

        y_train_true = torch.cat(all_labels).numpy()
        y_train_pred = torch.cat(all_preds).numpy()

        train_loss = total_loss / len(train_loader)
        train_metrics = {
            'macro_f1': f1_score(y_train_true, y_train_pred, average='macro'),
            'micro_f1': f1_score(y_train_true, y_train_pred, average='micro'),
            'accuracy': accuracy_score(y_train_true, y_train_pred),
            'recall': recall_score(y_train_true, y_train_pred, average=None)
        }

        val_metrics = evaluate(model, val_loader, device, criterion)

        print(f"\nEpoch {epoch}")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_metrics['loss']:.4f}")
        print(f"Macro-F1: {val_metrics['macro_f1']:.4f} | Micro-F1: {val_metrics['micro_f1']:.4f} | Acc: {val_metrics['accuracy']:.4f} | Recall: {val_metrics['recall']}")

        if val_metrics['macro_f1'] > best_val_macro_f1:
            best_val_macro_f1 = val_metrics['macro_f1']
            best_model_state = model.state_dict()
            print("✅New best model saved.")

    # Final evaluation
    model.load_state_dict(best_model_state)
    test_metrics = evaluate(model, test_loader, device, criterion)

    print("\nFINAL TEST EVALUATION")
    print(f"Test Loss : {test_metrics['loss']:.4f}")
    print(f"Macro-F1  : {test_metrics['macro_f1']:.4f}")
    print(f"Micro-F1  : {test_metrics['micro_f1']:.4f}")
    print(f"Accuracy  : {test_metrics['accuracy']:.4f}")
    print(f"Recall    : {test_metrics['recall']}")

    torch.save(model.state_dict(), "best_early_fusion_model.pth")

if __name__ == "__main__":
    train_model()



Epoch 1
Train Loss: 0.9663 | Val Loss: 0.9873
Macro-F1: 0.2786 | Micro-F1: 0.6168 | Acc: 0.6168 | Recall: [0.         0.04761905 0.91549296]
✅New best model saved.

Epoch 2
Train Loss: 0.9276 | Val Loss: 0.9256
Macro-F1: 0.2659 | Micro-F1: 0.6636 | Acc: 0.6636 | Recall: [0. 0. 1.]

Epoch 3
Train Loss: 0.9283 | Val Loss: 0.9317
Macro-F1: 0.2659 | Micro-F1: 0.6636 | Acc: 0.6636 | Recall: [0. 0. 1.]

Epoch 4
Train Loss: 0.8941 | Val Loss: 0.9305
Macro-F1: 0.2614 | Micro-F1: 0.6449 | Acc: 0.6449 | Recall: [0.         0.         0.97183099]

Epoch 5
Train Loss: 0.8939 | Val Loss: 0.9582
Macro-F1: 0.2971 | Micro-F1: 0.5794 | Acc: 0.5794 | Recall: [0.         0.14285714 0.83098592]
✅New best model saved.

Epoch 6
Train Loss: 0.9051 | Val Loss: 0.9612
Macro-F1: 0.3074 | Micro-F1: 0.5140 | Acc: 0.5140 | Recall: [0.         0.33333333 0.67605634]
✅New best model saved.

Epoch 7
Train Loss: 0.8737 | Val Loss: 0.9493
Macro-F1: 0.3123 | Micro-F1: 0.5607 | Acc: 0.5607 | Recall: [0.         0.238095

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pickle
from sklearn.metrics import f1_score, accuracy_score, recall_score
import numpy as np
import random

# -------------------------------
# Set Seed
# -------------------------------

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# -------------------------------
# Dataset
# -------------------------------

class ConcatFusedDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}

    def __getitem__(self, idx):
        video_id, fused_tensor, label, split = self.data[idx]

        if isinstance(label, str):
            label = self.label_map[label]
        elif isinstance(label, float):
            label = 0 if label < 0 else 2 if label > 0 else 1

        # Flatten tensor: shape [3, D] -> [3*D]
        return fused_tensor.flatten(), torch.tensor(label, dtype=torch.long)

    def __len__(self):
        return len(self.data)



# -------------------------------
# Dataloader Preparation
# -------------------------------

def prepare_dataloaders(pickle_path, batch_size=32):
    # Load fused data
    with open(pickle_path, 'rb') as f:
        data = pickle.load(f)

    # Organize by split
    splits = {'train': [], 'val': [], 'test': []}
    for item in data:
        video_id, fused_tensor, label, split = item
        if split in splits:
            splits[split].append(item)

    print("📊 Split sizes:", {k: len(v) for k, v in splits.items()})

    if len(splits['train']) == 0:
        raise ValueError("🚨 Training split is empty. Verify the pickle includes split info.")

    train_loader = DataLoader(ConcatFusedDataset(splits['train']), batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(ConcatFusedDataset(splits['val']), batch_size=batch_size)
    test_loader  = DataLoader(ConcatFusedDataset(splits['test']), batch_size=batch_size)
    return train_loader, val_loader, test_loader

# -------------------------------
# Simple Concatenation MLP Model
# -------------------------------

class SimpleConcatMLP(nn.Module):
    def __init__(self, input_dim=3204, hidden_dim=512, num_classes=3):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        return self.model(x)



# -------------------------------
# Evaluation Function
# -------------------------------

def evaluate(model, loader, device, criterion):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0

    with torch.no_grad():
        for feats, labels in loader:
            feats = feats.to(device)
            labels = labels.to(device)

            logits = model(feats)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            all_preds.append(preds.cpu())
            all_labels.append(labels.cpu())

    y_true = torch.cat(all_labels).numpy()
    y_pred = torch.cat(all_preds).numpy()

    metrics = {
        'loss': total_loss / len(loader),
        'micro_f1': f1_score(y_true, y_pred, average='micro'),
        'macro_f1': f1_score(y_true, y_pred, average='macro'),
        'accuracy': accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred, average=None)
    }
    return metrics

# -------------------------------
# Training Loop
# -------------------------------

def train_model():
    set_seed(42)
    pickle_path = "fused_tensor_data.pkl"
    train_loader, val_loader, test_loader = prepare_dataloaders(pickle_path, batch_size=32)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SimpleConcatMLP(input_dim=3204).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    best_val_macro_f1 = 0.0
    best_model_state = None

    for epoch in range(1, 301):
        model.train()
        total_loss = 0.0
        all_preds, all_labels = [], []

        for feats, labels in train_loader:
            feats = feats.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(feats)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            all_preds.append(preds.cpu())
            all_labels.append(labels.cpu())

        y_train_true = torch.cat(all_labels).numpy()
        y_train_pred = torch.cat(all_preds).numpy()

        train_loss = total_loss / len(train_loader)
        train_metrics = {
            'macro_f1': f1_score(y_train_true, y_train_pred, average='macro'),
            'micro_f1': f1_score(y_train_true, y_train_pred, average='micro'),
            'accuracy': accuracy_score(y_train_true, y_train_pred),
            'recall': recall_score(y_train_true, y_train_pred, average=None)
        }

        val_metrics = evaluate(model, val_loader, device, criterion)

        print(f"\nEpoch {epoch}")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_metrics['loss']:.4f}")
        print(f"Macro-F1: {val_metrics['macro_f1']:.4f} | Micro-F1: {val_metrics['micro_f1']:.4f} | Acc: {val_metrics['accuracy']:.4f} | Recall: {val_metrics['recall']}")

        if val_metrics['macro_f1'] > best_val_macro_f1:
            best_val_macro_f1 = val_metrics['macro_f1']
            best_model_state = model.state_dict()
            print("✅ New best model saved.")

    # Final test eval
    model.load_state_dict(best_model_state)
    test_metrics = evaluate(model, test_loader, device, criterion)

    print("\n🎯 FINAL TEST EVALUATION")
    print(f"Test Loss : {test_metrics['loss']:.4f}")
    print(f"Macro-F1  : {test_metrics['macro_f1']:.4f}")
    print(f"Micro-F1  : {test_metrics['micro_f1']:.4f}")
    print(f"Accuracy  : {test_metrics['accuracy']:.4f}")
    print(f"Recall    : {test_metrics['recall']}")

    torch.save(model.state_dict(), "best_concat_model.pth")
    print("💾 Model saved to best_concat_model.pth")

if __name__ == "__main__":
    train_model()


📊 Split sizes: {'train': 534, 'val': 107, 'test': 114}

Epoch 1
Train Loss: 0.9748 | Val Loss: 0.9281
Macro-F1: 0.2659 | Micro-F1: 0.6636 | Acc: 0.6636 | Recall: [0. 0. 1.]
✅ New best model saved.

Epoch 2
Train Loss: 0.9228 | Val Loss: 0.9156
Macro-F1: 0.2659 | Micro-F1: 0.6636 | Acc: 0.6636 | Recall: [0. 0. 1.]

Epoch 3
Train Loss: 0.9190 | Val Loss: 0.9383
Macro-F1: 0.2590 | Micro-F1: 0.6355 | Acc: 0.6355 | Recall: [0.         0.         0.95774648]

Epoch 4
Train Loss: 0.8974 | Val Loss: 0.9390
Macro-F1: 0.3129 | Micro-F1: 0.6542 | Acc: 0.6542 | Recall: [0.         0.0952381  0.95774648]
✅ New best model saved.

Epoch 5
Train Loss: 0.8925 | Val Loss: 0.9239
Macro-F1: 0.2637 | Micro-F1: 0.6542 | Acc: 0.6542 | Recall: [0.         0.         0.98591549]

Epoch 6
Train Loss: 0.8892 | Val Loss: 0.9395
Macro-F1: 0.3272 | Micro-F1: 0.6168 | Acc: 0.6168 | Recall: [0.         0.19047619 0.87323944]
✅ New best model saved.

Epoch 7
Train Loss: 0.8928 | Val Loss: 0.9416
Macro-F1: 0.3272 | Mic

In [20]:
import pickle
import pandas as pd

# --- Paths ---
pickle_path = 'fused_tensor_data.pkl'
csv_path = "/data/home/huixian/Documents/Homeworks/535_project/MOSEI-Seg/Labels/new_sentiment_split_2.csv"

# --- Load pickle ---
with open(pickle_path, 'rb') as f:
    data = pickle.load(f)
pickle_base_videos = set(item[-1] for item in data)
print(f"📦 Pickle: {len(pickle_base_videos)} unique base_video values")

# --- Load CSV ---
df = pd.read_csv(csv_path)
csv_base_videos = set(df['base_video'].unique())
print(f"📄 CSV: {len(csv_base_videos)} unique base_video values")

# --- Compare ---
intersection = pickle_base_videos & csv_base_videos
only_in_pickle = pickle_base_videos - csv_base_videos
only_in_csv = csv_base_videos - pickle_base_videos

print(f"✅ Matched base_video entries: {len(intersection)}")
print(f"❌ In pickle but not in CSV: {len(only_in_pickle)}")
print(f"❌ In CSV but not in pickle: {len(only_in_csv)}")

# --- Show sample mismatches ---
print("\n🔍 Sample mismatches (in pickle only):")
for i, item in enumerate(sorted(only_in_pickle)):
    print(f"  [Pickle] {item}")
    if i >= 9: break

print("\n🔍 Sample mismatches (in CSV only):")
for i, item in enumerate(sorted(only_in_csv)):
    print(f"  [CSV] {item}")
    if i >= 9: break


📦 Pickle: 3 unique base_video values
📄 CSV: 755 unique base_video values
✅ Matched base_video entries: 0
❌ In pickle but not in CSV: 3
❌ In CSV but not in pickle: 755

🔍 Sample mismatches (in pickle only):
  [Pickle] test
  [Pickle] train
  [Pickle] val

🔍 Sample mismatches (in CSV only):
  [CSV] -3g5yACwYnA
  [CSV] -3nNcZdcdvU
  [CSV] -HeZS2-Prhc
  [CSV] -I_e4mIh0yE
  [CSV] -MeTTeMJBNc
  [CSV] -THoVjtIkeU
  [CSV] -UUCSKoHeMA
  [CSV] -ZgjBOA1Yhw
  [CSV] -aqamKhZ1Ec
  [CSV] -dxfTGcXJoc
