In [None]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from collections import defaultdict



: 

In [None]:
# Pick the compute device once: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Mount Google Drive only when the notebook actually runs inside Colab.
# `google.colab` does not exist in other environments, so an unconditional
# import would abort a fresh "Run All" on a local kernel.
# NOTE(review): the dataset path used by the loading cell is relative and not
# under /content/drive — confirm whether the mount is still needed.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running in Colab

if drive is not None:
    drive.mount('/content/drive')
else:
    print("google.colab not available; skipping Google Drive mount.")

In [None]:
# Read the training CSV and pull out the two columns used downstream as plain
# Python lists: the raw texts and their `polarization` labels.
df = pd.read_csv("../dev_phase/subtask1/train/zho.csv")
X_text, y_labels = df["text"].to_list(), df["polarization"].to_list()

In [None]:
# Preview the first rows. Leaving the frame as the cell's last expression lets
# the notebook render it as a rich table instead of plain printed text.
df.head()

In [None]:
# Checkpoint used as the text encoder; tokenizer and weights come from the same
# model id.
model_name = "Qwen/Qwen3-Embedding-0.6B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)
embedding_model = embedding_model.to(device)
embedding_model.eval()  # evaluation mode: the encoder is only used for inference here

# Freeze every encoder weight so no gradients are tracked for it.
for weight in embedding_model.parameters():
    weight.requires_grad = False

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the mask.

    Args:
        model_output: Object exposing a ``last_hidden_state`` tensor
            (assumed to be laid out as batch x tokens x hidden — the
            reduction runs over dimension 1).
        attention_mask: Tensor with one entry per token (batch x tokens);
            positions with value 0 contribute nothing to the average.

    Returns:
        Tensor with dimension 1 reduced away: the mask-weighted sum of the
        token embeddings divided by the mask total. The divisor is clamped
        to at least 1e-9, so a fully masked row yields zeros instead of
        a division by zero.
    """
    hidden = model_output.last_hidden_state
    # Broadcast the mask across the hidden dimension as floats.
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    masked_total = (hidden * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts


In [None]:
# Encode every text in fixed-size batches and collect the pooled vectors on the
# CPU, then stack them into one matrix (one row per text, in input order).
batch_size = 32
emb_list = []

# Gradients are never needed for the encoder pass, so disable tracking for the
# whole loop.
with torch.no_grad():
    for start in tqdm(range(0, len(X_text), batch_size), desc="Embedding"):
        chunk = X_text[start:start + batch_size]
        enc = tokenizer(chunk, padding=True, truncation=True, return_tensors='pt').to(device)
        pooled = mean_pooling(embedding_model(**enc), enc['attention_mask'])
        emb_list.append(pooled.cpu())

X_embeddings = torch.cat(emb_list, dim=0)
# Labels as 64-bit integer tensor, aligned row-for-row with X_embeddings.
y_tensor = torch.tensor(y_labels, dtype=torch.long)



In [None]:
class PolarizationClassifier(nn.Module):
    """Small MLP head that maps a fixed-size embedding to class logits.

    Layers: Linear(embed_dim, hidden_dim) -> ReLU -> Dropout(dropout)
    -> Linear(hidden_dim, num_classes).

    Args:
        embed_dim: Size of each input embedding vector.
        num_classes: Number of logits produced per example (default 5).
        hidden_dim: Width of the hidden layer (default 256).
        dropout: Dropout probability applied after the ReLU (default 0.2).

    The network returns raw logits; no sigmoid or softmax is applied, so it
    must be paired with a loss that works on logits.
    NOTE(review): the original inline note suggests BCEWithLogitsLoss, which
    expects float targets shaped like the logits, while the labels in this
    notebook are built as a 1-D long tensor — confirm the intended loss.
    """

    def __init__(self, embed_dim, num_classes=5, hidden_dim=256, dropout=0.2):
        super().__init__()
        # Attribute name and layer order are unchanged, so state_dict keys
        # ("model.0.*", "model.3.*") stay compatible with saved checkpoints.
        self.model = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes)
            # No sigmoid here: the loss is expected to consume raw logits
        )

    def forward(self, x):
        """Return the logits for a batch of embeddings ``x``."""
        return self.model(x)