In [1]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from collections import defaultdict



In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [109]:
df = pd.read_csv("/content/drive/MyDrive/dev_phase/subtask2/train/zho.csv")
X_text = df['text'].tolist()
label_cols = ['political', 'racial/ethnic', 'religious', 'gender/sexual', 'other']
y_labels = df[label_cols].values

In [110]:
print(df.head())

                                     id                                 text  \
0  zho_e59ce789a1d83d91cb7d362efaa3bf23           好久都没有见过那么干净的白人班级了，一个黑乐色都没有   
1  zho_ea3ef5567698fc40fa631baf7f697a16   楼主这讽刺太有力度把跪虫族和国际鬼子虫类及其它们后代虫混描写淋漓尽致   
2  zho_b73779673957f6dab5688c27a6747458           你这样说让那些标榜找个外国对象倍有面的人还怎么装b�   
3  zho_f7164a14baafbcb6ec387b0217a1c766  说明一下，我只关爱自己身边的女性，比如老婆女儿等，别的女人关我鸟事。v   
4  zho_1be506c572b805494787f548671a5bb7          基佬还说之前视频里的不是它，它都说照片是它爆出来的了。   

   political  racial/ethnic  religious  gender/sexual  other  
0          0              1          0              0      0  
1          0              1          0              0      0  
2          0              1          0              0      0  
3          0              0          0              1      0  
4          0              0          0              1      0  


In [89]:
model_name = "Qwen/Qwen3-Embedding-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name).to(device)
embedding_model.eval()

for p in embedding_model.parameters():
    p.requires_grad = False

def mean_pooling(model_output, attention_mask):
    token_embeds = model_output.last_hidden_state
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    sum_embeddings = torch.sum(token_embeds * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


In [111]:
batch_size = 32
emb_list = []

for i in tqdm(range(0, len(X_text), batch_size), desc="Embedding"):
    batch_texts = X_text[i:i+batch_size]
    enc = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = embedding_model(**enc)
        embeddings = mean_pooling(outputs, enc['attention_mask'])
    emb_list.append(embeddings.cpu())

X_embeddings = torch.cat(emb_list, dim=0)
y_tensor = torch.tensor(y_labels, dtype=torch.long)



Embedding: 100%|██████████| 134/134 [00:49<00:00,  2.72it/s]


In [112]:
class PolarizationClassifier(nn.Module):
    def __init__(self, embed_dim, num_classes=5):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, 384)
        self.bn1 = nn.BatchNorm1d(384)
        self.drop = nn.Dropout(0.3)
        self.fc2 = nn.Linear(384, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.drop(x)
        return self.fc2(x)


In [17]:
!pip install iterative-stratification


Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9


In [114]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

k = 5
kf = MultilabelStratifiedKFold(n_splits=k, shuffle=True, random_state=42)
num_classes = y_tensor.shape[1]
print(num_classes)
X = X_embeddings.numpy()
y = y_tensor.numpy()

all_acc = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    print(f"\n=== Fold {fold+1} ===")

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device)
    X_test_t = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_train_t = torch.tensor(y_train, dtype=torch.float32).to(device)
    y_test_t = torch.tensor(y_test, dtype=torch.float32).to(device)

    classifier = PolarizationClassifier(X_train_t.shape[1], num_classes).to(device)
    optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([3,3,3,3,3]).to(device))

    classifier.train()
    train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=32, shuffle=True)
    for epoch in range(5):
        for bx, by in train_loader:
            optimizer.zero_grad()
            logits = classifier(bx)
            loss = criterion(logits, by)
            loss.backward()
            optimizer.step()

    classifier.eval()
    with torch.no_grad():
        logits = classifier(X_test_t)
        probs = torch.sigmoid(logits)
        y_pred = (probs > 0.5).cpu().numpy()

    sample_acc = np.mean(np.all(y_pred == y_test, axis=1))
    all_acc.append(sample_acc)
    print(f"Fold {fold+1} Accuracy: {sample_acc:.4f}")

print("\n=== Cross-Validation Summary ===")
print(f"Average Accuracy: {np.mean(all_acc):.4f} ± {np.std(all_acc):.4f}")

5

=== Fold 1 ===
Fold 1 Accuracy: 0.7021

=== Fold 2 ===
Fold 2 Accuracy: 0.7044

=== Fold 3 ===
Fold 3 Accuracy: 0.7278

=== Fold 4 ===
Fold 4 Accuracy: 0.7079

=== Fold 5 ===
Fold 5 Accuracy: 0.6928

=== Cross-Validation Summary ===
Average Accuracy: 0.7070 ± 0.0116


In [115]:
# Compute overall mean ± std accuracy
mean_acc = np.mean(all_acc)
std_acc = np.std(all_acc)

# Create a simple DataFrame
summary = pd.DataFrame({
    'Accuracy Mean': [mean_acc],
    'Standard Deviation': [std_acc]
})

print("\n=== Cross-Validation Summary ===")
print(summary)




=== Cross-Validation Summary ===
   Accuracy Mean  Standard Deviation
0       0.707009            0.011551


In [101]:
from sklearn.metrics import f1_score

macro_f1 = f1_score(y_test, y_pred, average='macro')
micro_f1 = f1_score(y_test, y_pred, average='micro')

print(f"Fold {fold+1} Macro-F1: {macro_f1:.4f}, Micro-F1: {micro_f1:.4f}")


Fold 5 Macro-F1: 0.3859, Micro-F1: 0.5780


In [None]:
#save