In [1]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from collections import defaultdict
from sklearn.model_selection import train_test_split



In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = pd.read_csv("/content/drive/MyDrive/dev_phase/subtask1/train/zho.csv")
X_text = df['text'].tolist()
y_labels = df['polarization'].tolist()

In [5]:
model_name = "BAAI/bge-m3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name).to(device)
embedding_model.eval()

for p in embedding_model.parameters():
    p.requires_grad = False

def mean_pooling(model_output, attention_mask):
    token_embeds = model_output.last_hidden_state
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    sum_embeddings = torch.sum(token_embeds * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

In [6]:
def get_all_embeddings(texts, model, tokenizer, device, batch_size=32):
    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        batch_texts = texts[i:i+batch_size]
        enc = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            outputs = model(**enc)
            embeddings = mean_pooling(outputs, enc['attention_mask'])
        all_embs.append(embeddings.cpu())
    return torch.cat(all_embs, dim=0)


X_embeddings = get_all_embeddings(X_text, embedding_model, tokenizer, device)
y_tensor = torch.tensor(y_labels, dtype=torch.long)

dataset = TensorDataset(X_embeddings, y_tensor)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)



Embedding:   0%|          | 0/134 [00:00<?, ?it/s][A
Embedding:   1%|          | 1/134 [00:01<03:57,  1.79s/it][A
Embedding:   1%|▏         | 2/134 [00:01<01:50,  1.19it/s][A
Embedding:   2%|▏         | 3/134 [00:02<01:15,  1.73it/s][A
Embedding:   3%|▎         | 4/134 [00:02<00:56,  2.29it/s][A
Embedding:   4%|▎         | 5/134 [00:02<00:44,  2.89it/s][A
Embedding:   4%|▍         | 6/134 [00:02<00:37,  3.41it/s][A
Embedding:   5%|▌         | 7/134 [00:03<00:37,  3.36it/s][A
Embedding:   6%|▌         | 8/134 [00:03<00:36,  3.43it/s][A
Embedding:   7%|▋         | 9/134 [00:03<00:33,  3.77it/s][A
Embedding:   7%|▋         | 10/134 [00:03<00:32,  3.76it/s][A
Embedding:   8%|▊         | 11/134 [00:04<00:32,  3.74it/s][A
Embedding:   9%|▉         | 12/134 [00:04<00:32,  3.73it/s][A
Embedding:  10%|▉         | 13/134 [00:04<00:30,  3.99it/s][A
Embedding:  10%|█         | 14/134 [00:04<00:28,  4.21it/s][A
Embedding:  11%|█         | 15/134 [00:05<00:33,  3.55it/s][A
Embedding

In [7]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X_embeddings, y_tensor, test_size=0.3, random_state=42, stratify=y_tensor
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [8]:
class Classifier(nn.Module):
    def __init__(self, embed_dim, num_classes):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(embed_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes)
        )
    def forward(self, x):
        return self.model(x)


In [9]:
input_dim = X_embeddings.shape[1]   # Must match embedding size
num_classes = len(np.unique(y_tensor.numpy()))

classifier = Classifier(input_dim, num_classes).to(device)
classifier.load_state_dict(torch.load("/content/classifier_split_bge.pth"))
classifier.eval()

print("Loaded classifier!")


Loaded classifier!


In [14]:
classifier.train()
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
best_val_f1 = 0.0
best_state = None
epochs = 20
for epoch in range(epochs):
    classifier.train()
    total_loss = 0
    for bx, by in train_loader:
        bx, by = bx.to(device), by.to(device)
        optimizer.zero_grad()
        logits = classifier(bx)
        loss = criterion(logits, by)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # -------------------------------
    # Validation evaluation
    # -------------------------------
    classifier.eval()
    val_logits = classifier(X_val.to(device))
    val_preds = torch.argmax(val_logits, dim=1).cpu().numpy()
    val_labels = y_val.numpy()
    from sklearn.metrics import f1_score
    val_f1 = f1_score(val_labels, val_preds, average='macro')

    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f} - Val Macro F1: {val_f1:.4f}")

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_state = classifier.state_dict().copy()



Epoch 1/20 - Loss: 3.1165 - Val Macro F1: 0.8519
Epoch 2/20 - Loss: 2.6700 - Val Macro F1: 0.8520
Epoch 3/20 - Loss: 2.5103 - Val Macro F1: 0.8551
Epoch 4/20 - Loss: 2.2634 - Val Macro F1: 0.8551
Epoch 5/20 - Loss: 2.0580 - Val Macro F1: 0.8504
Epoch 6/20 - Loss: 1.9206 - Val Macro F1: 0.8566
Epoch 7/20 - Loss: 1.8139 - Val Macro F1: 0.8504
Epoch 8/20 - Loss: 1.5714 - Val Macro F1: 0.8535
Epoch 9/20 - Loss: 1.5428 - Val Macro F1: 0.8503
Epoch 10/20 - Loss: 1.3815 - Val Macro F1: 0.8535
Epoch 11/20 - Loss: 1.4080 - Val Macro F1: 0.8520
Epoch 12/20 - Loss: 1.3315 - Val Macro F1: 0.8504
Epoch 13/20 - Loss: 1.0633 - Val Macro F1: 0.8566
Epoch 14/20 - Loss: 1.0705 - Val Macro F1: 0.8534
Epoch 15/20 - Loss: 1.0559 - Val Macro F1: 0.8519
Epoch 16/20 - Loss: 0.8974 - Val Macro F1: 0.8472
Epoch 17/20 - Loss: 0.8795 - Val Macro F1: 0.8519
Epoch 18/20 - Loss: 0.8859 - Val Macro F1: 0.8535
Epoch 19/20 - Loss: 0.7348 - Val Macro F1: 0.8535
Epoch 20/20 - Loss: 0.6682 - Val Macro F1: 0.8566


In [15]:
classifier.load_state_dict(best_state)
classifier.eval()

test_logits = classifier(X_test.to(device))
test_preds = torch.argmax(test_logits, dim=1).cpu().numpy()
test_labels = y_test.numpy()

print("\n=== Test Classification Report ===")
from sklearn.metrics import classification_report
print(classification_report(test_labels, test_preds, digits=4))


=== Test Classification Report ===
              precision    recall  f1-score   support

           0     0.8530    0.8241    0.8383       324
           1     0.8267    0.8553    0.8408       318

    accuracy                         0.8396       642
   macro avg     0.8399    0.8397    0.8396       642
weighted avg     0.8400    0.8396    0.8395       642



In [17]:
torch.save(classifier.state_dict(), "classifier_zho_bge_finetuned.pth")
print("Saved fine-tuned classifier to classifier_zho_finetuned.pth")

Saved fine-tuned classifier to classifier_zho_finetuned.pth
