In [2]:
!pip install -U gdown

import gdown

# 第一个文件
url1 = "https://drive.google.com/uc?id=1PbdlIg0p8Lm8HO2Wv17b5vzelXONpbuf"
output1 = "memes.zip"  # 你可以改成实际文件名或路径
gdown.download(url1, output1, quiet=False)

# 第二个文件
url2 = "https://drive.google.com/uc?id=1WhUQxJ2b1SjY5geBmthr8CvnR2GTt5l4"
output2 = "processed_data_all_labels.csv"
gdown.download(url2, output2, quiet=False)

# Unzip the downloaded file file1.ext
import zipfile
import os

# Specify the zip file path
zip_path = '/content/memes.zip' # Path to file1.ext
extract_dir = '/content/'  # Extract to Colab local storage (assuming this contains the images)

# Create extraction directory
os.makedirs(extract_dir, exist_ok=True)

# Check if the zip file exists
if not os.path.exists(zip_path):
    print(f"❌ Error: Zip file not found at {zip_path}")
else:
    # Extract the zip file
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        print(f"✅ Extraction complete for {os.path.basename(zip_path)}. Files are located at: {extract_dir}")
    except zipfile.BadZipFile:
        print(f"❌ Error: File at {zip_path} is not a valid zip file.")
    except Exception as e:
        print(f"❌ An unexpected error occurred during extraction of {os.path.basename(zip_path)}: {e}")



Downloading...
From (original): https://drive.google.com/uc?id=1PbdlIg0p8Lm8HO2Wv17b5vzelXONpbuf
From (redirected): https://drive.google.com/uc?id=1PbdlIg0p8Lm8HO2Wv17b5vzelXONpbuf&confirm=t&uuid=6027f07b-8709-40e1-ae02-086783c0b469
To: /content/memes.zip
100%|██████████| 557M/557M [00:02<00:00, 267MB/s]
Downloading...
From: https://drive.google.com/uc?id=1WhUQxJ2b1SjY5geBmthr8CvnR2GTt5l4
To: /content/processed_data_all_labels.csv
100%|██████████| 1.04M/1.04M [00:00<00:00, 152MB/s]


✅ Extraction complete for memes.zip. Files are located at: /content/


In [4]:
# --------------------------------------------------------------
# 1. 基础库 & 设备
# --------------------------------------------------------------
import os, ast, torch, numpy as np, pandas as pd
from tqdm import tqdm
from PIL import Image
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset
from sentence_transformers import SentenceTransformer
from transformers import RobertaModel, RobertaTokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import lightgbm as lgb

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --------------------------------------------------------------
# 2. 数据集定义（和你的原始代码完全一致）
# --------------------------------------------------------------
class MemeDataset(Dataset):
    def __init__(self, csv_file, image_dir):
        self.df = pd.read_csv(csv_file)
        self.image_dir = image_dir

    def __len__(self): return len(self.df)

    def __getitem__(self, idx):
        img_id = str(self.df.iloc[idx]['id'])
        # 尝试多种后缀
        for ext in ['.jpeg', '.jpg', '.png', '.JPEG', '.JPG', '.PNG']:
            path = os.path.join(self.image_dir, img_id + ext)
            if os.path.exists(path):
                img = Image.open(path).convert("RGB")
                break
        else:
            raise FileNotFoundError(f"Image not found: {img_id}")

        text = self.df.iloc[idx]['text']

        # ---- task4_hard 是二分类 one-hot ----
        hard = ast.literal_eval(self.df.iloc[idx]['task4_hard'])
        label = torch.tensor(hard.index(1.0), dtype=torch.float)   # 0 or 1

        return {'image': img, 'text': text, 'label': label}

def collate_fn(batch):
    return {
        'image': [b['image'] for b in batch],
        'text' : [b['text']  for b in batch],
        'label': torch.stack([b['label'] for b in batch])
    }

# --------------------------------------------------------------
# 3. 加载数据 & 划分
# --------------------------------------------------------------
CSV_PATH   = 'processed_data_all_labels.csv'
IMAGE_DIR  = '/content/memes/'          # 你的图片所在文件夹
BATCH_SIZE = 16

full_ds = MemeDataset(CSV_PATH, IMAGE_DIR)
train_sz = int(0.8 * len(full_ds))
val_sz   = int(0.1 * len(full_ds))
test_sz  = len(full_ds) - train_sz - val_sz
train_ds, val_ds, test_ds = random_split(full_ds, [train_sz, val_sz, test_sz])

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True , collate_fn=collate_fn)
test_loader  = DataLoader(test_ds , batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print(f"Train/Val/Test: {train_sz}/{val_sz}/{test_sz}")



# --------------------------------------------------------------
# 4. 多模态特征提取（CLIP + RoBERTa）—— 修复设备不一致
# --------------------------------------------------------------
clip_model   = SentenceTransformer('clip-ViT-B-32')
roberta      = RobertaModel.from_pretrained('roberta-base').to(device)   # ← 移动到 GPU
tokenizer    = RobertaTokenizer.from_pretrained('roberta-base')
roberta.eval()
clip_model.eval()

def extract_features(loader):
    img_embs, txt_embs, lbls = [], [], []
    for batch in tqdm(loader, desc="Extracting"):
        imgs = batch['image']
        txts = batch['text']
        lbl  = batch['label'].to(device)

        # ---- CLIP 图像 ----
        img_feat = clip_model.encode(
            imgs,
            convert_to_tensor=True,
            show_progress_bar=False,
            device=device  # ← 关键！指定设备
        )

        # ---- RoBERTa 文本 ----
        enc = tokenizer(
            txts,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )
        # 确保所有 tensor 都在同一个设备
        enc = {k: v.to(device) for k, v in enc.items()}

        with torch.no_grad():
            txt_feat = roberta(**enc).last_hidden_state.mean(dim=1)   # [B, 768]

        img_embs.append(img_feat)
        txt_embs.append(txt_feat)
        lbls.append(lbl)

    img_all = torch.cat(img_embs, dim=0)
    txt_all = torch.cat(txt_embs, dim=0)
    lbl_all = torch.cat(lbls, dim=0)
    return torch.cat([img_all, txt_all], dim=1), lbl_all

print("\n--- Extracting TRAIN features ---")
train_features, train_labels = extract_features(train_loader)

print("\n--- Extracting TEST features ---")
test_features , test_labels  = extract_features(test_loader)

print(f"train_features: {train_features.shape}, train_labels: {train_labels.shape}")
print(f"test_features : {test_features.shape},  test_labels : {test_labels.shape}")

# --------------------------------------------------------------
# 5. 特征标准化（对 FC 和 LightGBM 都至关重要）
# --------------------------------------------------------------
scaler = StandardScaler()
train_feat_np = train_features.cpu().numpy()
test_feat_np  = test_features.cpu().numpy()

train_feat_scaled = scaler.fit_transform(train_feat_np)
test_feat_scaled  = scaler.transform(test_feat_np)

train_features = torch.FloatTensor(train_feat_scaled).to(device)
test_features  = torch.FloatTensor(test_feat_scaled ).to(device)

print("Features standardized")

# --------------------------------------------------------------
# 6. 更强的 Fully-Connected 模型
# --------------------------------------------------------------
class StrongFC(nn.Module):
    def __init__(self, input_dim, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(256, 128),       nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(dropout*0.8),
            nn.Linear(128, 64),        nn.BatchNorm1d(64),  nn.ReLU(), nn.Dropout(dropout*0.6),
            nn.Linear(64, 1)                                      # 二分类 logits
        )
    def forward(self, x): return self.net(x)

fc_model = StrongFC(train_features.shape[1]).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(fc_model.parameters(), lr=1e-3, weight_decay=1e-5)

train_ds_fc = TensorDataset(train_features, train_labels.to(device))
test_ds_fc  = TensorDataset(test_features , test_labels .to(device))

train_loader_fc = DataLoader(train_ds_fc, batch_size=32, shuffle=True)
test_loader_fc  = DataLoader(test_ds_fc , batch_size=32, shuffle=False)

# --------------------------------------------------------------
# 7. 训练 StrongFC（带简单早停）
# --------------------------------------------------------------
print("\n--- Training StrongFC ---")
fc_model.train()
EPOCHS = 50
best_val_acc = 0.0
patience = 7
no_imp = 0

for epoch in range(EPOCHS):
    epoch_loss = 0.0
    for x, y in train_loader_fc:
        optimizer.zero_grad()
        logits = fc_model(x).squeeze(1)
        loss   = criterion(logits, y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # ---- 验证（使用最后 10% 的训练集做快速验证）----
    fc_model.eval()
    val_preds, val_true = [], []
    with torch.no_grad():
        for i, (x, y) in enumerate(train_loader_fc):
            if i > len(train_loader_fc) * 0.9:   # 最后 10%
                prob = torch.sigmoid(fc_model(x)).cpu().numpy()
                val_preds.extend((prob > 0.5).astype(int).flatten())
                val_true.extend(y.cpu().numpy().astype(int))
    val_acc = accuracy_score(val_true, val_preds)
    fc_model.train()

    print(f"Epoch {epoch+1:2d} | Loss {epoch_loss/len(train_loader_fc):.4f} | ValAcc {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(fc_model.state_dict(), 'best_fc.pth')
        no_imp = 0
    else:
        no_imp += 1
        if no_imp >= patience:
            print("Early stopping triggered")
            break

fc_model.load_state_dict(torch.load('best_fc.pth'))
print(f"StrongFC 训练结束，Best Val Acc: {best_val_acc:.4f}")

# --------------------------------------------------------------
# 8. 调优 LightGBM
# --------------------------------------------------------------
print("\n--- Training Tuned LightGBM ---")
lgb_train = lgb.Dataset(train_feat_scaled, label=train_labels.cpu().numpy())

params = {
    'objective'       : 'binary',
    'metric'          : 'binary_logloss',
    'boosting_type'   : 'gbdt',
    'num_leaves'      : 64,
    'max_depth'       : 10,
    'learning_rate'   : 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq'    : 5,
    'verbose'         : -1,
    'seed'            : 42
}

lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

print("LightGBM 训练完成")

# --------------------------------------------------------------
# 9. 集成预测（FC + LightGBM 软投票）
# --------------------------------------------------------------
print("\n--- Ensemble (FC + LightGBM) ---")
fc_model.eval()
fc_probs = []
with torch.no_grad():
    for x, _ in test_loader_fc:
        prob = torch.sigmoid(fc_model(x)).cpu().numpy().flatten()
        fc_probs.extend(prob)

lgb_probs = lgb_model.predict(test_feat_scaled)

ensemble_prob = (np.array(fc_probs) + np.array(lgb_probs)) / 2
ensemble_pred = (ensemble_prob > 0.5).astype(int)

ens_acc = accuracy_score(test_labels.cpu().numpy(), ensemble_pred)
fc_acc  = accuracy_score(test_labels.cpu().numpy(), (np.array(fc_probs) > 0.5).astype(int))
lgb_acc = accuracy_score(test_labels.cpu().numpy(), (np.array(lgb_probs) > 0.5).astype(int))

print("\nFINAL RESULTS")
print(f"   StrongFC alone : {fc_acc:.4f}")
print(f"   LightGBM alone : {lgb_acc:.4f}")
print(f"   ENSEMBLE       : {ens_acc:.4f}  (↑)")

# --------------------------------------------------------------
# 10. （可选）保存模型供后续推理
# --------------------------------------------------------------
torch.save(fc_model.state_dict(), 'fc_final.pth')
lgb_model.save_model('lgb_final.txt')
print("\n模型已保存：fc_final.pth  &  lgb_final.txt")

Using device: cuda
Train/Val/Test: 3235/404/405


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Extracting TRAIN features ---


Extracting: 100%|██████████| 203/203 [01:09<00:00,  2.90it/s]



--- Extracting TEST features ---


Extracting: 100%|██████████| 26/26 [00:08<00:00,  2.96it/s]


train_features: torch.Size([3235, 1280]), train_labels: torch.Size([3235])
test_features : torch.Size([405, 1280]),  test_labels : torch.Size([405])
Features standardized

--- Training StrongFC ---
Epoch  1 | Loss 0.6834 | ValAcc 0.6735
Epoch  2 | Loss 0.6406 | ValAcc 0.7526
Epoch  3 | Loss 0.6089 | ValAcc 0.7354
Epoch  4 | Loss 0.5711 | ValAcc 0.7285
Epoch  5 | Loss 0.5508 | ValAcc 0.7526
Epoch  6 | Loss 0.5328 | ValAcc 0.8041
Epoch  7 | Loss 0.5125 | ValAcc 0.8316
Epoch  8 | Loss 0.4622 | ValAcc 0.8625
Epoch  9 | Loss 0.4614 | ValAcc 0.8419
Epoch 10 | Loss 0.4577 | ValAcc 0.8591
Epoch 11 | Loss 0.4240 | ValAcc 0.8935
Epoch 12 | Loss 0.4082 | ValAcc 0.8797
Epoch 13 | Loss 0.3779 | ValAcc 0.9038
Epoch 14 | Loss 0.3522 | ValAcc 0.9072
Epoch 15 | Loss 0.3532 | ValAcc 0.9381
Epoch 16 | Loss 0.3540 | ValAcc 0.9416
Epoch 17 | Loss 0.3436 | ValAcc 0.9210
Epoch 18 | Loss 0.3144 | ValAcc 0.9313
Epoch 19 | Loss 0.2881 | ValAcc 0.9519
Epoch 20 | Loss 0.3168 | ValAcc 0.9725
Epoch 21 | Loss 0.2971



[100]	training's binary_logloss: 0.125742
[200]	training's binary_logloss: 0.0325101
[300]	training's binary_logloss: 0.00978492
[400]	training's binary_logloss: 0.00363531
[500]	training's binary_logloss: 0.00163878
[600]	training's binary_logloss: 0.00109747
[700]	training's binary_logloss: 0.000939614
[800]	training's binary_logloss: 0.000913539
[900]	training's binary_logloss: 0.000887917
[1000]	training's binary_logloss: 0.000877412
LightGBM 训练完成

--- Ensemble (FC + LightGBM) ---

FINAL RESULTS
   StrongFC alone : 0.6148
   LightGBM alone : 0.6840
   ENSEMBLE       : 0.6741  (↑)

模型已保存：fc_final.pth  &  lgb_final.txt


In [5]:
from sklearn.metrics import f1_score
import numpy as np

# Assuming test_labels, fc_probs, lgb_probs, and ensemble_pred are available from the previous cell

# Calculate F1 scores
test_labels_np = test_labels.cpu().numpy()
fc_preds = (np.array(fc_probs) > 0.5).astype(int)
lgb_preds = (np.array(lgb_probs) > 0.5).astype(int)


ens_f1 = f1_score(test_labels_np, ensemble_pred)
fc_f1  = f1_score(test_labels_np, fc_preds)
lgb_f1 = f1_score(test_labels_np, lgb_preds)

print("\nF1 SCORES")
print(f"   StrongFC alone : {fc_f1:.4f}")
print(f"   LightGBM alone : {lgb_f1:.4f}")
print(f"   ENSEMBLE       : {ens_f1:.4f}")


F1 SCORES
   StrongFC alone : 0.5618
   LightGBM alone : 0.5789
   ENSEMBLE       : 0.5901


Below is the experiment on soft label

In [7]:
# --------------------------------------------------------------
# 1. 基础库 & 设备
# --------------------------------------------------------------
import os, ast, torch, numpy as np, pandas as pd
from tqdm import tqdm
from PIL import Image
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset
from sentence_transformers import SentenceTransformer
from transformers import RobertaModel, RobertaTokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
import lightgbm as lgb

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --------------------------------------------------------------
# 2. 数据集定义（使用 soft label）
# --------------------------------------------------------------
class MemeDataset(Dataset):
    def __init__(self, csv_file, image_dir):
        self.df = pd.read_csv(csv_file)
        self.image_dir = image_dir

    def __len__(self): return len(self.df)

    def __getitem__(self, idx):
        img_id = str(self.df.iloc[idx]['id'])
        for ext in ['.jpeg', '.jpg', '.png', '.JPEG', '.JPG', '.PNG']:
            path = os.path.join(self.image_dir, img_id + ext)
            if os.path.exists(path):
                img = Image.open(path).convert("RGB")
                break
        else:
            raise FileNotFoundError(f"Image not found: {img_id}")

        text = self.df.iloc[idx]['text']

        # ---- soft label ----
        soft = ast.literal_eval(self.df.iloc[idx]['task4_soft'])
        label = torch.tensor(soft[1], dtype=torch.float)  # [0,1] 之间的软标签

        return {'image': img, 'text': text, 'label': label}

def collate_fn(batch):
    return {
        'image': [b['image'] for b in batch],
        'text' : [b['text']  for b in batch],
        'label': torch.stack([b['label'] for b in batch])
    }

# --------------------------------------------------------------
# 3. 加载数据 & 划分
# --------------------------------------------------------------
CSV_PATH   = 'processed_data_all_labels.csv'
IMAGE_DIR  = '/content/memes/'
BATCH_SIZE = 16

full_ds = MemeDataset(CSV_PATH, IMAGE_DIR)
train_sz = int(0.8 * len(full_ds))
val_sz   = int(0.1 * len(full_ds))
test_sz  = len(full_ds) - train_sz - val_sz
train_ds, val_ds, test_ds = random_split(full_ds, [train_sz, val_sz, test_sz])

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True , collate_fn=collate_fn)
test_loader  = DataLoader(test_ds , batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print(f"Train/Val/Test: {train_sz}/{val_sz}/{test_sz}")

# --------------------------------------------------------------
# 4. 多模态特征提取（CLIP + RoBERTa）
# --------------------------------------------------------------
clip_model   = SentenceTransformer('clip-ViT-B-32').to(device) # Move CLIP model to device
roberta      = RobertaModel.from_pretrained('roberta-base').to(device) # Ensure Roberta is on the correct device
tokenizer    = RobertaTokenizer.from_pretrained('roberta-base')
roberta.eval()
clip_model.eval()

def extract_features(loader):
    img_embs, txt_embs, lbls = [], [], []
    for batch in tqdm(loader, desc="Extracting"):
        imgs = batch['image']
        txts = batch['text']
        lbl  = batch['label'].to(device)

        img_feat = clip_model.encode(imgs, convert_to_tensor=True, show_progress_bar=False).to(device) # Move image features to device after encoding

        enc = tokenizer(txts, padding=True, truncation=True, max_length=128, return_tensors='pt')
        enc = {k: v.to(device) for k, v in enc.items()} # Ensure input tensors are on the correct device
        with torch.no_grad():
            txt_feat = roberta(**enc).last_hidden_state.mean(dim=1)

        img_embs.append(img_feat)
        txt_embs.append(txt_feat)
        lbls.append(lbl)

    img_all = torch.cat(img_embs, dim=0)
    txt_all = torch.cat(txt_embs, dim=0)
    lbl_all = torch.cat(lbls, dim=0)
    return torch.cat([img_all, txt_all], dim=1), lbl_all

print("\n--- Extracting TRAIN features ---")
train_features, train_labels = extract_features(train_loader)
print("\n--- Extracting TEST features ---")
test_features , test_labels  = extract_features(test_loader)

print(f"train_features: {train_features.shape}, train_labels: {train_labels.shape}")
print(f"test_features : {test_features.shape},  test_labels : {test_labels.shape}")

# --------------------------------------------------------------
# 5. 特征标准化
# --------------------------------------------------------------
scaler = StandardScaler()
train_feat_np = train_features.cpu().numpy()
test_feat_np  = test_features.cpu().numpy()
train_feat_scaled = scaler.fit_transform(train_feat_np)
test_feat_scaled  = scaler.transform(test_feat_np)
train_features = torch.FloatTensor(train_feat_scaled).to(device)
test_features  = torch.FloatTensor(test_feat_scaled ).to(device)
print("Features standardized")

# --------------------------------------------------------------
# 6. Fully Connected 模型
# --------------------------------------------------------------
class StrongFC(nn.Module):
    def __init__(self, input_dim, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(256, 128),       nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(dropout*0.8),
            nn.Linear(128, 64),        nn.BatchNorm1d(64),  nn.ReLU(), nn.Dropout(dropout*0.6),
            nn.Linear(64, 1)
        )
    def forward(self, x): return self.net(x)

fc_model = StrongFC(train_features.shape[1]).to(device)

bce = nn.BCEWithLogitsLoss()
kl  = nn.KLDivLoss(reduction='batchmean')

def mixed_loss(logits, targets):
    prob = torch.sigmoid(logits)
    loss_bce = bce(logits, targets)
    loss_kl  = kl(prob.log(), targets)
    return 0.7 * loss_bce + 0.3 * loss_kl

optimizer = torch.optim.AdamW(fc_model.parameters(), lr=1e-3, weight_decay=1e-5)

train_ds_fc = TensorDataset(train_features, train_labels.to(device))
test_ds_fc  = TensorDataset(test_features , test_labels .to(device))
train_loader_fc = DataLoader(train_ds_fc, batch_size=32, shuffle=True)
test_loader_fc  = DataLoader(test_ds_fc , batch_size=32, shuffle=False)

# --------------------------------------------------------------
# 7. 训练 StrongFC
# --------------------------------------------------------------
print("\n--- Training StrongFC (Soft Label) ---")
fc_model.train()
EPOCHS = 50
best_val_acc = 0.0
patience = 7
no_imp = 0

for epoch in range(EPOCHS):
    epoch_loss = 0.0
    for x, y in train_loader_fc:
        optimizer.zero_grad()
        logits = fc_model(x).squeeze(1)
        loss   = mixed_loss(logits, y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    fc_model.eval()
    preds, true = [], []
    with torch.no_grad():
        for x, y in test_loader_fc:
            prob = torch.sigmoid(fc_model(x)).cpu().numpy()
            preds.extend(prob.flatten())
            true.extend(y.cpu().numpy())
    fc_model.train()

    preds_bin = (np.array(preds) > 0.5).astype(int)
    true_bin  = (np.array(true) > 0.5).astype(int)
    val_acc = accuracy_score(true_bin, preds_bin)
    val_f1  = f1_score(true_bin, preds_bin, average='macro')

    print(f"Epoch {epoch+1:2d} | Loss {epoch_loss/len(train_loader_fc):.4f} | ValAcc {val_acc:.4f} | ValF1 {val_f1:.4f}")

    if val_f1 > best_val_acc:
        best_val_acc = val_f1
        torch.save(fc_model.state_dict(), 'best_fc_soft.pth')
        no_imp = 0
    else:
        no_imp += 1
        if no_imp >= patience:
            print("Early stopping triggered")
            break

fc_model.load_state_dict(torch.load('best_fc_soft.pth'))
print(f"StrongFC 训练结束，Best Val F1: {best_val_acc:.4f}")

# --------------------------------------------------------------
# 8. LightGBM (Soft label)
# --------------------------------------------------------------
print("\n--- Training LightGBM (Soft Label) ---")
lgb_train = lgb.Dataset(train_feat_scaled, label=train_labels.cpu().numpy())

params = {
    'objective'       : 'binary',
    'metric'          : 'binary_logloss',
    'boosting_type'   : 'gbdt',
    'num_leaves'      : 64,
    'max_depth'       : 10,
    'learning_rate'   : 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq'    : 5,
    'verbose'         : -1,
    'seed'            : 42
}

lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

# --------------------------------------------------------------
# 9. 集成评估
# --------------------------------------------------------------
print("\n--- Ensemble Evaluation (Soft Label) ---")
fc_model.eval()
fc_probs = []
with torch.no_grad():
    for x, _ in test_loader_fc:
        prob = torch.sigmoid(fc_model(x)).cpu().numpy().flatten()
        fc_probs.extend(prob)

lgb_probs = lgb_model.predict(test_feat_scaled)

ensemble_prob = (np.array(fc_probs) + np.array(lgb_probs)) / 2
ensemble_pred = (ensemble_prob > 0.5).astype(int)

# True labels (二值化)
y_true = (test_labels.cpu().numpy() > 0.5).astype(int)

# Metrics
def report(name, probs, preds):
    acc = accuracy_score(y_true, preds)
    f1  = f1_score(y_true, preds, average='macro')
    print(f"{name:<15} Acc: {acc:.4f} | F1: {f1:.4f}")

print("\nFINAL RESULTS (Soft Label Training)")
report("StrongFC", fc_probs, (np.array(fc_probs) > 0.5).astype(int))
report("LightGBM", lgb_probs, (np.array(lgb_probs) > 0.5).astype(int))
report("Ensemble", ensemble_prob, ensemble_pred)

torch.save(fc_model.state_dict(), 'fc_final_soft.pth')
lgb_model.save_model('lgb_final_soft.txt')
print("\n模型已保存：fc_final_soft.pth  &  lgb_final_soft.txt")

Using device: cuda
Train/Val/Test: 3235/404/405


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Extracting TRAIN features ---


Extracting: 100%|██████████| 203/203 [01:14<00:00,  2.71it/s]



--- Extracting TEST features ---


Extracting: 100%|██████████| 26/26 [00:09<00:00,  2.80it/s]


train_features: torch.Size([3235, 1280]), train_labels: torch.Size([3235])
test_features : torch.Size([405, 1280]),  test_labels : torch.Size([405])
Features standardized

--- Training StrongFC (Soft Label) ---
Epoch  1 | Loss 0.5157 | ValAcc 0.6000 | ValF1 0.5966
Epoch  2 | Loss 0.4902 | ValAcc 0.6049 | ValF1 0.5991
Epoch  3 | Loss 0.4827 | ValAcc 0.5975 | ValF1 0.5902
Epoch  4 | Loss 0.4732 | ValAcc 0.6198 | ValF1 0.6162
Epoch  5 | Loss 0.4630 | ValAcc 0.6247 | ValF1 0.6166
Epoch  6 | Loss 0.4556 | ValAcc 0.6395 | ValF1 0.6300
Epoch  7 | Loss 0.4508 | ValAcc 0.6420 | ValF1 0.6296
Epoch  8 | Loss 0.4443 | ValAcc 0.6370 | ValF1 0.6231
Epoch  9 | Loss 0.4415 | ValAcc 0.6420 | ValF1 0.6282
Epoch 10 | Loss 0.4379 | ValAcc 0.6198 | ValF1 0.6141
Epoch 11 | Loss 0.4315 | ValAcc 0.6469 | ValF1 0.6367
Epoch 12 | Loss 0.4219 | ValAcc 0.6346 | ValF1 0.6243
Epoch 13 | Loss 0.4239 | ValAcc 0.6444 | ValF1 0.6356
Epoch 14 | Loss 0.4154 | ValAcc 0.6395 | ValF1 0.6294
Epoch 15 | Loss 0.4157 | ValAcc 0



[100]	training's binary_logloss: 0.0439688
[200]	training's binary_logloss: 0.00767471
[300]	training's binary_logloss: 0.00208756
[400]	training's binary_logloss: 0.00108716
[500]	training's binary_logloss: 0.000924993
[600]	training's binary_logloss: 0.000965119
[700]	training's binary_logloss: 0.000868085
[800]	training's binary_logloss: 0.000893689
[900]	training's binary_logloss: 0.000915728
[1000]	training's binary_logloss: 0.000886683

--- Ensemble Evaluation (Soft Label) ---

FINAL RESULTS (Soft Label Training)
StrongFC        Acc: 0.6741 | F1: 0.6551
LightGBM        Acc: 0.3605 | F1: 0.2737
Ensemble        Acc: 0.3630 | F1: 0.2779

模型已保存：fc_final_soft.pth  &  lgb_final_soft.txt


In [8]:
# ==============================================================
# 混合策略集成训练：FC用软标签 + LightGBM用硬标签
# ==============================================================

import os, ast, torch, numpy as np, pandas as pd
from tqdm import tqdm
from PIL import Image
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset
from sentence_transformers import SentenceTransformer
from transformers import RobertaModel, RobertaTokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
import lightgbm as lgb

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ==============================================================
# 1. 数据集定义（同时支持软硬标签）
# ==============================================================
class MemeDataset(Dataset):
    def __init__(self, csv_file, image_dir, use_soft=True):
        self.df = pd.read_csv(csv_file)
        self.image_dir = image_dir
        self.use_soft = use_soft

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_id = str(self.df.iloc[idx]['id'])

        # 加载图像
        for ext in ['.jpeg', '.jpg', '.png', '.JPEG', '.JPG', '.PNG']:
            path = os.path.join(self.image_dir, img_id + ext)
            if os.path.exists(path):
                img = Image.open(path).convert("RGB")
                break
        else:
            raise FileNotFoundError(f"Image not found: {img_id}")

        text = self.df.iloc[idx]['text']

        # 软标签 or 硬标签
        if self.use_soft:
            soft = ast.literal_eval(self.df.iloc[idx]['task4_soft'])
            label = torch.tensor(soft[1], dtype=torch.float)  # [0,1] 软标签
        else:
            hard = ast.literal_eval(self.df.iloc[idx]['task4_hard'])
            label = torch.tensor(hard.index(1.0), dtype=torch.float)  # 0 or 1

        return {'image': img, 'text': text, 'label': label}

def collate_fn(batch):
    return {
        'image': [b['image'] for b in batch],
        'text' : [b['text']  for b in batch],
        'label': torch.stack([b['label'] for b in batch])
    }

# ==============================================================
# 2. 加载数据
# ==============================================================
CSV_PATH   = 'processed_data_all_labels.csv'
IMAGE_DIR  = '/content/memes/'
BATCH_SIZE = 16

# 软标签数据集（用于FC）
full_ds_soft = MemeDataset(CSV_PATH, IMAGE_DIR, use_soft=True)
train_sz = int(0.8 * len(full_ds_soft))
val_sz   = int(0.1 * len(full_ds_soft))
test_sz  = len(full_ds_soft) - train_sz - val_sz

train_ds_soft, val_ds_soft, test_ds_soft = random_split(
    full_ds_soft, [train_sz, val_sz, test_sz]
)

# 硬标签数据集（用于LightGBM）
full_ds_hard = MemeDataset(CSV_PATH, IMAGE_DIR, use_soft=False)
train_ds_hard, val_ds_hard, test_ds_hard = random_split(
    full_ds_hard, [train_sz, val_sz, test_sz]
)

print(f"Train/Val/Test: {train_sz}/{val_sz}/{test_sz}")

# ==============================================================
# 3. 多模态特征提取（共享特征）
# ==============================================================
clip_model = SentenceTransformer('clip-ViT-B-32').to(device)
roberta    = RobertaModel.from_pretrained('roberta-base').to(device)
tokenizer  = RobertaTokenizer.from_pretrained('roberta-base')
roberta.eval()
clip_model.eval()

def extract_features(loader):
    img_embs, txt_embs, lbls = [], [], []
    for batch in tqdm(loader, desc="Extracting"):
        imgs = batch['image']
        txts = batch['text']
        lbl  = batch['label'].to(device)

        # CLIP 图像特征
        img_feat = clip_model.encode(
            imgs,
            convert_to_tensor=True,
            show_progress_bar=False
        ).to(device)

        # RoBERTa 文本特征
        enc = tokenizer(
            txts,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )
        enc = {k: v.to(device) for k, v in enc.items()}

        with torch.no_grad():
            txt_feat = roberta(**enc).last_hidden_state.mean(dim=1)

        img_embs.append(img_feat)
        txt_embs.append(txt_feat)
        lbls.append(lbl)

    img_all = torch.cat(img_embs, dim=0)
    txt_all = torch.cat(txt_embs, dim=0)
    lbl_all = torch.cat(lbls, dim=0)
    return torch.cat([img_all, txt_all], dim=1), lbl_all

# 提取软标签特征
train_loader_soft = DataLoader(train_ds_soft, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_loader_soft  = DataLoader(test_ds_soft, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print("\n=== 提取软标签特征（用于FC）===")
train_features_soft, train_labels_soft = extract_features(train_loader_soft)
test_features_soft, test_labels_soft = extract_features(test_loader_soft)

# 提取硬标签特征
train_loader_hard = DataLoader(train_ds_hard, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_loader_hard  = DataLoader(test_ds_hard, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print("\n=== 提取硬标签特征（用于LightGBM）===")
train_features_hard, train_labels_hard = extract_features(train_loader_hard)
test_features_hard, test_labels_hard = extract_features(test_loader_hard)

# ==============================================================
# 4. 特征标准化
# ==============================================================
scaler = StandardScaler()

# 软标签特征
train_feat_soft_np = train_features_soft.cpu().numpy()
test_feat_soft_np = test_features_soft.cpu().numpy()
train_feat_soft_scaled = scaler.fit_transform(train_feat_soft_np)
test_feat_soft_scaled = scaler.transform(test_feat_soft_np)

# 硬标签特征（使用相同的scaler）
train_feat_hard_np = train_features_hard.cpu().numpy()
test_feat_hard_np = test_features_hard.cpu().numpy()
train_feat_hard_scaled = scaler.transform(train_feat_hard_np)
test_feat_hard_scaled = scaler.transform(test_feat_hard_np)

print("✅ 特征标准化完成")

# ==============================================================
# 5. 定义 FC 模型（用软标签训练）
# ==============================================================
class StrongFC(nn.Module):
    def __init__(self, input_dim, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(256, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(dropout*0.8),
            nn.Linear(128, 64), nn.BatchNorm1d(64), nn.ReLU(), nn.Dropout(dropout*0.6),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.net(x)

# 混合损失函数
bce = nn.BCEWithLogitsLoss()
kl = nn.KLDivLoss(reduction='batchmean')

def mixed_loss(logits, targets):
    prob = torch.sigmoid(logits)
    loss_bce = bce(logits, targets)
    loss_kl = kl(prob.log(), targets)
    return 0.7 * loss_bce + 0.3 * loss_kl

# 初始化模型
fc_model = StrongFC(train_feat_soft_scaled.shape[1]).to(device)
optimizer = torch.optim.AdamW(fc_model.parameters(), lr=1e-3, weight_decay=1e-5)

train_features_soft_tensor = torch.FloatTensor(train_feat_soft_scaled).to(device)
test_features_soft_tensor = torch.FloatTensor(test_feat_soft_scaled).to(device)

train_ds_fc = TensorDataset(train_features_soft_tensor, train_labels_soft.to(device))
test_ds_fc = TensorDataset(test_features_soft_tensor, test_labels_soft.to(device))

train_loader_fc = DataLoader(train_ds_fc, batch_size=32, shuffle=True)
test_loader_fc = DataLoader(test_ds_fc, batch_size=32, shuffle=False)

# ==============================================================
# 6. 训练 FC 模型（软标签）
# ==============================================================
print("\n=== 训练 FC 模型（软标签）===")
fc_model.train()
EPOCHS = 50
best_val_f1 = 0.0
patience = 7
no_imp = 0

for epoch in range(EPOCHS):
    epoch_loss = 0.0
    for x, y in train_loader_fc:
        optimizer.zero_grad()
        logits = fc_model(x).squeeze(1)
        loss = mixed_loss(logits, y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # 验证
    fc_model.eval()
    preds, true = [], []
    with torch.no_grad():
        for x, y in test_loader_fc:
            prob = torch.sigmoid(fc_model(x)).cpu().numpy()
            preds.extend(prob.flatten())
            true.extend(y.cpu().numpy())
    fc_model.train()

    preds_bin = (np.array(preds) > 0.5).astype(int)
    true_bin = (np.array(true) > 0.5).astype(int)
    val_acc = accuracy_score(true_bin, preds_bin)
    val_f1 = f1_score(true_bin, preds_bin, average='macro')

    print(f"Epoch {epoch+1:2d} | Loss {epoch_loss/len(train_loader_fc):.4f} | ValAcc {val_acc:.4f} | ValF1 {val_f1:.4f}")

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(fc_model.state_dict(), 'best_fc_soft.pth')
        no_imp = 0
    else:
        no_imp += 1
        if no_imp >= patience:
            print("⏸ Early stopping triggered")
            break

fc_model.load_state_dict(torch.load('best_fc_soft.pth'))
print(f"✅ FC 训练完成，Best Val F1: {best_val_f1:.4f}")

# ==============================================================
# 7. 训练 LightGBM（硬标签）
# ==============================================================
print("\n=== 训练 LightGBM（硬标签）===")

# 转换为硬标签
train_labels_hard_binary = (train_labels_hard.cpu().numpy() > 0.5).astype(int)

lgb_train = lgb.Dataset(train_feat_hard_scaled, label=train_labels_hard_binary)

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 64,
    'max_depth': 10,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': 42
}

lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

print("✅ LightGBM 训练完成")

# ==============================================================
# 8. 混合集成预测
# ==============================================================
print("\n=== 混合集成预测 ===")

# FC 预测（软标签训练）
fc_model.eval()
fc_probs = []
with torch.no_grad():
    for x, _ in test_loader_fc:
        prob = torch.sigmoid(fc_model(x)).cpu().numpy().flatten()
        fc_probs.extend(prob)
fc_probs = np.array(fc_probs)

# LightGBM 预测（硬标签训练）
lgb_probs = lgb_model.predict(test_feat_hard_scaled)

# 集成策略：FC权重更高（因为软标签效果更好）
ensemble_prob = 0.6 * fc_probs + 0.4 * lgb_probs
ensemble_pred = (ensemble_prob > 0.5).astype(int)

# 真实标签（二值化）
y_true = (test_labels_soft.cpu().numpy() > 0.5).astype(int)

# ==============================================================
# 9. 评估结果
# ==============================================================
def evaluate_model(name, probs):
    preds = (probs > 0.5).astype(int)
    acc = accuracy_score(y_true, preds)
    f1 = f1_score(y_true, preds, average='macro')
    return acc, f1

print("\n" + "="*60)
print("📊 最终结果对比")
print("="*60)

fc_acc, fc_f1 = evaluate_model("FC (软标签)", fc_probs)
lgb_acc, lgb_f1 = evaluate_model("LightGBM (硬标签)", lgb_probs)
ens_acc, ens_f1 = evaluate_model("集成 (0.6*FC + 0.4*LGB)", ensemble_prob)

print(f"{'模型':<25} {'Accuracy':<12} {'F1-Score'}")
print("-" * 60)
print(f"{'FC (软标签)':<25} {fc_acc:<12.4f} {fc_f1:.4f}")
print(f"{'LightGBM (硬标签)':<25} {lgb_acc:<12.4f} {lgb_f1:.4f}")
print(f"{'集成 (0.6*FC + 0.4*LGB)':<25} {ens_acc:<12.4f} {ens_f1:.4f} ⭐")
print("=" * 60)

# 详细分类报告
print("\n📋 集成模型详细报告：")
print(classification_report(y_true, ensemble_pred, target_names=['Class 0', 'Class 1']))

# ==============================================================
# 10. 保存模型
# ==============================================================
torch.save(fc_model.state_dict(), 'fc_soft_final.pth')
lgb_model.save_model('lgb_hard_final.txt')
print("\n💾 模型已保存：fc_soft_final.pth & lgb_hard_final.txt")

# ==============================================================
# 11. 权重调优（可选）
# ==============================================================
print("\n🔧 尝试不同集成权重：")
print("-" * 60)
for w in [0.5, 0.55, 0.6, 0.65, 0.7]:
    ens = w * fc_probs + (1-w) * lgb_probs
    ens_pred = (ens > 0.5).astype(int)
    acc = accuracy_score(y_true, ens_pred)
    f1 = f1_score(y_true, ens_pred, average='macro')
    print(f"权重 {w:.2f}*FC + {1-w:.2f}*LGB  →  Acc: {acc:.4f}, F1: {f1:.4f}")
print("=" * 60)

Using device: cuda
Train/Val/Test: 3235/404/405


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== 提取软标签特征（用于FC）===


Extracting: 100%|██████████| 203/203 [01:11<00:00,  2.83it/s]
Extracting: 100%|██████████| 26/26 [00:08<00:00,  2.91it/s]



=== 提取硬标签特征（用于LightGBM）===


Extracting: 100%|██████████| 203/203 [01:11<00:00,  2.85it/s]
Extracting: 100%|██████████| 26/26 [00:08<00:00,  3.12it/s]


✅ 特征标准化完成

=== 训练 FC 模型（软标签）===
Epoch  1 | Loss 0.5108 | ValAcc 0.5728 | ValF1 0.5682
Epoch  2 | Loss 0.4933 | ValAcc 0.6370 | ValF1 0.6245
Epoch  3 | Loss 0.4809 | ValAcc 0.6395 | ValF1 0.6176
Epoch  4 | Loss 0.4723 | ValAcc 0.6519 | ValF1 0.6338
Epoch  5 | Loss 0.4633 | ValAcc 0.6321 | ValF1 0.6112
Epoch  6 | Loss 0.4565 | ValAcc 0.6272 | ValF1 0.6120
Epoch  7 | Loss 0.4497 | ValAcc 0.6296 | ValF1 0.6081
Epoch  8 | Loss 0.4463 | ValAcc 0.6667 | ValF1 0.6468
Epoch  9 | Loss 0.4400 | ValAcc 0.6420 | ValF1 0.6207
Epoch 10 | Loss 0.4351 | ValAcc 0.6272 | ValF1 0.6157
Epoch 11 | Loss 0.4303 | ValAcc 0.6395 | ValF1 0.6176
Epoch 12 | Loss 0.4259 | ValAcc 0.6395 | ValF1 0.6252
Epoch 13 | Loss 0.4193 | ValAcc 0.6173 | ValF1 0.6081
Epoch 14 | Loss 0.4150 | ValAcc 0.6395 | ValF1 0.6166
Epoch 15 | Loss 0.4147 | ValAcc 0.6494 | ValF1 0.6186
⏸ Early stopping triggered
✅ FC 训练完成，Best Val F1: 0.6468

=== 训练 LightGBM（硬标签）===




[100]	training's binary_logloss: 0.131511
[200]	training's binary_logloss: 0.0333385
[300]	training's binary_logloss: 0.010415
[400]	training's binary_logloss: 0.00441888
[500]	training's binary_logloss: 0.00236646
[600]	training's binary_logloss: 0.00181958
[700]	training's binary_logloss: 0.00158907
[800]	training's binary_logloss: 0.00150369
[900]	training's binary_logloss: 0.00146902
[1000]	training's binary_logloss: 0.00147069
✅ LightGBM 训练完成

=== 混合集成预测 ===

📊 最终结果对比
模型                        Accuracy     F1-Score
------------------------------------------------------------
FC (软标签)                  0.6667       0.6468
LightGBM (硬标签)            0.5432       0.4706
集成 (0.6*FC + 0.4*LGB)     0.6247       0.5778 ⭐

📋 集成模型详细报告：
              precision    recall  f1-score   support

     Class 0       0.73      0.71      0.72       275
     Class 1       0.42      0.45      0.44       130

    accuracy                           0.62       405
   macro avg       0.58      0.58      0.5