In [2]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from collections import defaultdict



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Pick the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [13]:
# Read the CSV under the train/ directory and extract the `text` and
# `polarization` columns as plain Python lists (inputs and labels).
df = pd.read_csv("../dev_phase/subtask1/train/arb.csv")
X_text, y_labels = (df[column].tolist() for column in ("text", "polarization"))

In [14]:
# Quick sanity check: print the first five rows of the loaded frame.
print(df.head(n=5))

                                     id  \
0  arb_a2a60c8b4af3389e842d8ec31afb0eea   
1  arb_6723e56a672674a6c1d9b28b213c4a05   
2  arb_b0365d606edeee38ae6c025b1ca33e96   
3  arb_858c0ee684049ba6f416a6cecb0b0761   
4  arb_bdafc73afd0bc2cd2badae2a089446b9   

                                                text  polarization  
0  احلام انتي ونعالي ومنو انتي حتى تقيمين الفناني...             1  
1  وره الكواليس تنيجج من وره بعير صطناعي على فكرة...             1  
2  .خخخخ الملكه احلام فيها شذوذ شنو هل بوس والدلع...             1  
3  الله يخزي احلام هي والبرنامج الخايس الي كله مصخره             1  
4  كس ام احلام الي ماربتها وش ملكه هههه متستاهل م...             1  


In [15]:
# Checkpoint used for both the tokenizer and the encoder.
model_name = "Qwen/Qwen3-Embedding-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the encoder, move it to the selected device and switch to eval mode.
embedding_model = AutoModel.from_pretrained(model_name)
embedding_model = embedding_model.to(device)
embedding_model.eval()

# Freeze every encoder parameter: it is only used as a fixed feature extractor.
for weight in embedding_model.parameters():
    weight.requires_grad_(False)

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: object exposing a ``last_hidden_state`` tensor
            (assumed to be laid out as (batch, seq_len, hidden); the
            reduction runs over dim 1).
        attention_mask: tensor with one fewer trailing dimension than
            ``last_hidden_state``; non-zero entries mark tokens to keep.

    Returns:
        Tensor with dim 1 reduced away: the mask-weighted sum of the token
        embeddings divided by the mask total for each sequence.
    """
    token_states = model_output.last_hidden_state
    # Broadcast the mask over the hidden dimension so it can weight each token.
    weights = attention_mask.unsqueeze(-1).expand_as(token_states).float()
    weighted_total = (token_states * weights).sum(dim=1)
    # Clamp the denominator so a fully masked row does not divide by zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / token_counts


In [16]:
batch_size = 32
emb_list = []

# Walk the corpus in consecutive slices of `batch_size` texts, embed each
# slice with the frozen encoder and keep the pooled vectors on the CPU.
for start in tqdm(range(0, len(X_text), batch_size), desc="Embedding"):
    chunk = X_text[start:start + batch_size]
    encoded = tokenizer(chunk, padding=True, truncation=True, return_tensors='pt')
    encoded = encoded.to(device)
    with torch.no_grad():
        pooled = mean_pooling(embedding_model(**encoded), encoded['attention_mask'])
    emb_list.append(pooled.cpu())

# Stack the per-batch results into one matrix and tensorise the labels.
X_embeddings = torch.cat(emb_list, dim=0)
y_tensor = torch.tensor(y_labels, dtype=torch.long)



Embedding:  22%|██▏       | 23/106 [05:57<21:29, 15.53s/it]


KeyboardInterrupt: 

In [10]:
class Classifier(nn.Module):
    """Small feed-forward head that maps an embedding vector to class logits.

    Layers, in order: Linear(embed_dim, hidden_dim) -> ReLU -> Dropout ->
    Linear(hidden_dim, num_classes). The forward pass returns the raw
    output of the last linear layer (no softmax is applied here).

    Args:
        embed_dim: size of the last dimension of the input tensor.
        num_classes: number of logits produced per input row.
        hidden_dim: width of the hidden layer. Defaults to 256, the value
            that was previously hard-coded.
        dropout: dropout probability applied after the ReLU. Defaults to
            0.2, the value that was previously hard-coded.
    """
    def __init__(self, embed_dim, num_classes, hidden_dim=256, dropout=0.2):
        super().__init__()
        # Kept as a single nn.Sequential named `model` so the parameter
        # names (model.0.*, model.3.*) are the same as before.
        self.model = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        """Return the logits for `x` (last dimension must equal embed_dim)."""
        return self.model(x)


In [11]:
# Stratified k-fold evaluation of the Classifier head on the precomputed
# embeddings. Requires `X_embeddings`, `y_tensor` and `y_labels` from the
# embedding/data cells, plus `Classifier` and `device`.

# Settings for this experiment (previously inline magic numbers).
SEED = 42            # seeds both the fold split and torch's RNG
N_EPOCHS = 5         # passes over the training fold
LEARNING_RATE = 1e-3
TRAIN_BATCH_SIZE = 32

k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=SEED)
num_classes = len(np.unique(y_labels))
X = X_embeddings.numpy()
y = y_tensor.numpy()

all_acc = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    print(f"\n=== Fold {fold+1} ===")

    # Re-seed torch at the start of every fold so that weight initialisation,
    # DataLoader shuffling and dropout masks are repeatable from run to run.
    # Without this the fold accuracies change on every execution.
    # NOTE(review): some GPU kernels may still be non-deterministic; see the
    # PyTorch reproducibility notes if bitwise-identical results are needed.
    torch.manual_seed(SEED + fold)

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Torch tensors on the target device
    X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device)
    X_test_t = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_train_t = torch.tensor(y_train, dtype=torch.long).to(device)
    y_test_t = torch.tensor(y_test, dtype=torch.long).to(device)

    # A fresh classifier and optimizer per fold, so no weights leak between folds
    classifier = Classifier(X_train_t.shape[1], num_classes).to(device)
    optimizer = torch.optim.Adam(classifier.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    # Train for a few epochs on the training fold
    classifier.train()
    train_loader = DataLoader(
        TensorDataset(X_train_t, y_train_t),
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True,
    )
    for epoch in range(N_EPOCHS):
        for bx, by in train_loader:
            optimizer.zero_grad()
            logits = classifier(bx)
            loss = criterion(logits, by)
            loss.backward()
            optimizer.step()

    # Evaluate on the held-out fold (eval mode disables dropout)
    classifier.eval()
    with torch.no_grad():
        logits = classifier(X_test_t)
        y_pred = torch.argmax(logits, dim=1).cpu().numpy()

    acc = np.mean(y_pred == y_test)
    all_acc.append(acc)
    print(f"Fold {fold+1} Accuracy: {acc:.4f}")

# -------------------------------
# Cross-validation summary
# -------------------------------
# np.std uses its default here (population standard deviation, ddof=0).
mean_acc = np.mean(all_acc)
std_acc = np.std(all_acc)
print("\n=== Cross-Validation Summary ===")
print(f"Average Accuracy: {mean_acc:.4f} ± {std_acc:.4f}")


=== Fold 1 ===
Fold 1 Accuracy: 0.7556

=== Fold 2 ===
Fold 2 Accuracy: 0.7526

=== Fold 3 ===
Fold 3 Accuracy: 0.7673

=== Fold 4 ===
Fold 4 Accuracy: 0.7733

=== Fold 5 ===
Fold 5 Accuracy: 0.7568

=== Cross-Validation Summary ===
Average Accuracy: 0.7611 ± 0.0078


In [None]:
# Aggregate the per-fold accuracies: mean and (population) standard deviation.
fold_scores = np.asarray(all_acc)
mean_acc = fold_scores.mean()
std_acc = fold_scores.std()

# One-row table holding the two summary statistics.
summary = pd.DataFrame([
    {'Accuracy Mean': mean_acc, 'Standard Deviation': std_acc}
])

print("\n=== Cross-Validation Summary ===")
print(summary)




=== Cross-Validation Summary ===
   Accuracy Mean  Standard Deviation
0       0.761109            0.007835
