In [1]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from collections import defaultdict
from sklearn.model_selection import train_test_split



In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
df = pd.read_csv("/content/drive/MyDrive/dev_phase/subtask1/train/zho.csv")
X_text = df['text'].tolist()
y_labels = df['polarization'].tolist()

In [10]:
model_name = "Qwen/Qwen3-Embedding-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name).to(device)
embedding_model.eval()

for p in embedding_model.parameters():
    p.requires_grad = False

def mean_pooling(model_output, attention_mask):
    token_embeds = model_output.last_hidden_state
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    sum_embeddings = torch.sum(token_embeds * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


In [11]:
def get_all_embeddings(texts, model, tokenizer, device, batch_size=32):
    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        batch_texts = texts[i:i+batch_size]
        enc = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            outputs = model(**enc)
            embeddings = mean_pooling(outputs, enc['attention_mask'])
        all_embs.append(embeddings.cpu())
    return torch.cat(all_embs, dim=0)


X_embeddings = get_all_embeddings(X_text, embedding_model, tokenizer, device)
y_tensor = torch.tensor(y_labels, dtype=torch.long)

dataset = TensorDataset(X_embeddings, y_tensor)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)


Embedding: 100%|██████████| 134/134 [00:47<00:00,  2.80it/s]


In [25]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X_embeddings, y_tensor, test_size=0.3, random_state=42, stratify=y_tensor
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [26]:
class Classifier(nn.Module):
    def __init__(self, embed_dim, num_classes):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(embed_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes)
        )
    def forward(self, x):
        return self.model(x)


In [27]:
input_dim = X_embeddings.shape[1]   # Must match embedding size
num_classes = len(np.unique(y_tensor.numpy()))

classifier = Classifier(input_dim, num_classes).to(device)
classifier.load_state_dict(torch.load("/content/classifier_split.pth"))
classifier.eval()

print("Loaded classifier!")


Loaded classifier!


In [31]:
classifier.train()
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
best_val_f1 = 0.0
best_state = None
epochs = 20
for epoch in range(epochs):
    classifier.train()
    total_loss = 0
    for bx, by in train_loader:
        bx, by = bx.to(device), by.to(device)
        optimizer.zero_grad()
        logits = classifier(bx)
        loss = criterion(logits, by)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # -------------------------------
    # Validation evaluation
    # -------------------------------
    classifier.eval()
    val_logits = classifier(X_val.to(device))
    val_preds = torch.argmax(val_logits, dim=1).cpu().numpy()
    val_labels = y_val.numpy()
    from sklearn.metrics import f1_score
    val_f1 = f1_score(val_labels, val_preds, average='macro')

    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f} - Val Macro F1: {val_f1:.4f}")

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_state = classifier.state_dict().copy()



Epoch 1/20 - Loss: 24.7918 - Val Macro F1: 0.8315
Epoch 2/20 - Loss: 24.0150 - Val Macro F1: 0.8332
Epoch 3/20 - Loss: 22.6385 - Val Macro F1: 0.8362
Epoch 4/20 - Loss: 21.4534 - Val Macro F1: 0.8333
Epoch 5/20 - Loss: 20.2485 - Val Macro F1: 0.8314
Epoch 6/20 - Loss: 19.5076 - Val Macro F1: 0.8318
Epoch 7/20 - Loss: 18.7005 - Val Macro F1: 0.8331
Epoch 8/20 - Loss: 18.0102 - Val Macro F1: 0.8318
Epoch 9/20 - Loss: 16.2541 - Val Macro F1: 0.8348
Epoch 10/20 - Loss: 16.1654 - Val Macro F1: 0.8317
Epoch 11/20 - Loss: 15.1743 - Val Macro F1: 0.8316
Epoch 12/20 - Loss: 14.3647 - Val Macro F1: 0.8339
Epoch 13/20 - Loss: 13.7377 - Val Macro F1: 0.8346
Epoch 14/20 - Loss: 13.2516 - Val Macro F1: 0.8301
Epoch 15/20 - Loss: 12.1909 - Val Macro F1: 0.8285
Epoch 16/20 - Loss: 11.9270 - Val Macro F1: 0.8332
Epoch 17/20 - Loss: 11.2876 - Val Macro F1: 0.8328
Epoch 18/20 - Loss: 10.5049 - Val Macro F1: 0.8377
Epoch 19/20 - Loss: 9.7235 - Val Macro F1: 0.8407
Epoch 20/20 - Loss: 9.2865 - Val Macro F1

In [32]:
classifier.load_state_dict(best_state)
classifier.eval()

test_logits = classifier(X_test.to(device))
test_preds = torch.argmax(test_logits, dim=1).cpu().numpy()
test_labels = y_test.numpy()

print("\n=== Test Classification Report ===")
from sklearn.metrics import classification_report
print(classification_report(test_labels, test_preds, digits=4))


=== Test Classification Report ===
              precision    recall  f1-score   support

           0     0.8409    0.7994    0.8196       324
           1     0.8054    0.8459    0.8252       318

    accuracy                         0.8224       642
   macro avg     0.8231    0.8226    0.8224       642
weighted avg     0.8233    0.8224    0.8224       642



In [33]:
torch.save(classifier.state_dict(), "classifier_zho_finetuned.pth")
print("Saved fine-tuned classifier to classifier_zho_finetuned.pth")

Saved fine-tuned classifier to classifier_zho_finetuned.pth
