In [4]:
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [5]:
# 2. Read the input tables from the working directory.
#    Feature files are expected to carry an `id` column, one column per
#    TF-IDF feature, and (training file only) a `label` column.
train_feats = pd.read_csv('train_tfidf_features.csv')
test_feats = pd.read_csv('test_tfidf_features.csv')
train_raw = pd.read_csv('train.csv')
test_raw = pd.read_csv('test.csv')

# 3. Pull targets / identifiers out first, then strip every non-feature
#    column so that only the numeric feature matrix remains.
y_full = train_feats['label'].values
test_ids = test_feats['id'].values
X_full = train_feats.drop(columns=['id', 'label'], errors='ignore').values
X_test = test_feats.drop(columns=['id'], errors='ignore').values

# 4. Hold out 20% of the labelled rows for validation. The split is
#    stratified on the label and seeded so that it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    stratify=y_full,
    random_state=42,
)
print(f"Shapes → Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}\n")

# 5. PCA + KNN for each component count
#
# For every requested dimensionality:
#   * fit PCA on the training split only and reuse that fitted projection
#     for the validation and test matrices,
#   * train a 2-nearest-neighbour classifier on the projected training data,
#   * print validation accuracy, macro F1 and the per-class report,
#   * write the test-set predictions to `PCA_<n_comp>_KNN.csv`.
_done_n_comp = set()  # component counts already processed (after clipping)
for comps in [2000, 1000, 500, 100]:
    # PCA cannot extract more components than min(n_samples, n_features),
    # so clip against BOTH dimensions of the training matrix, not only the
    # feature count.
    n_comp = min(comps, *X_train.shape)
    if n_comp in _done_n_comp:
        # Clipping collapsed this request onto a size that was already run;
        # repeating it would only refit and overwrite the same CSV.
        continue
    _done_n_comp.add(n_comp)

    pca = PCA(n_components=n_comp, random_state=42)
    X_tr_pca  = pca.fit_transform(X_train)
    X_val_pca = pca.transform(X_val)
    X_te_pca  = pca.transform(X_test)

    # Sanity check: every projected matrix has exactly n_comp columns.
    assert X_tr_pca.shape[1] == n_comp
    assert X_val_pca.shape[1] == n_comp
    assert X_te_pca.shape[1] == n_comp

    # NOTE(review): with an even k the two neighbours can disagree and the
    # vote ties; confirm that k=2 is required by the task and how
    # scikit-learn resolves such ties before relying on these scores.
    knn = KNeighborsClassifier(n_neighbors=2)
    knn.fit(X_tr_pca, y_train)

    # Validation
    yv = knn.predict(X_val_pca)
    acc = accuracy_score(y_val, yv)
    m_f1 = f1_score(y_val, yv, average='macro')
    print(f"[Task 2] PCA {n_comp} → Val Accuracy: {acc:.4f}, Macro F1: {m_f1:.4f}")
    print(classification_report(y_val, yv))

    # Test predictions & save (the model is the one fitted on the training
    # split above; it is not refitted on train + validation).
    y_test_pred = knn.predict(X_te_pca)
    submission_df = pd.DataFrame({'id': test_ids, 'label': y_test_pred})
    filename = f'PCA_{n_comp}_KNN.csv'
    submission_df.to_csv(filename, index=False)
    print(f"Saved {filename}")
    print("Preview:")
    print(submission_df.head(10))
    print("-"*60 + "\n")

Shapes → Train: (13747, 5000), Val: (3437, 5000), Test: (4296, 5000)

[Task 2] PCA 2000 → Val Accuracy: 0.4987, Macro F1: 0.4979
              precision    recall  f1-score   support

           0       0.67      0.37      0.48      2127
           1       0.41      0.71      0.52      1310

    accuracy                           0.50      3437
   macro avg       0.54      0.54      0.50      3437
weighted avg       0.57      0.50      0.49      3437

Saved PCA_2000_KNN.csv
Preview:
      id  label
0  17185      1
1  17186      1
2  17187      1
3  17188      1
4  17189      0
5  17190      1
6  17191      1
7  17192      0
8  17193      0
9  17194      1
------------------------------------------------------------

[Task 2] PCA 1000 → Val Accuracy: 0.5982, Macro F1: 0.5579
              precision    recall  f1-score   support

           0       0.66      0.73      0.69      2127
           1       0.47      0.39      0.42      1310

    accuracy                           0.60      34