# Task 2: Apply Dimension Reduction Techniques (10 marks)
# 
#### **Objective:** Apply Principal Component Analysis (PCA) to reduce the TF-IDF features from 5000 dimensions to several lower dimensionalities, then evaluate each reduction with a K-Nearest Neighbors (KNN) classifier.
# 

#### **Key Task Deliverables:**
#### - **2a.** Code implementation of PCA on train and test sets (sklearn package allowed)
#### - **2b.** Report Macro F1 scores for 2000, 1000, 500, and 100 components using KNN (n_neighbors=2)


## Import Required Libraries


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
import seaborn as sns


### Data Loading and Preparation

In [2]:
def load_and_prepare_data(train_path='data/train_tfidf_features.csv',
                          test_path='data/test_tfidf_features.csv'):
    """
    Load the training and test CSV files and split them into arrays.

    The training file must contain an 'id' column and a 'label' column; the
    test file must contain an 'id' column. Every remaining column is treated
    as a feature.

    Args:
        train_path: Location of the training CSV. Defaults to the path the
            notebook has always used.
        test_path: Location of the test CSV. Defaults to the path the
            notebook has always used.

    Returns:
        A tuple ``(X_train, y_train, X_test, train_ids, test_ids)`` of NumPy
        arrays. NaN entries in the two feature matrices are replaced by 0.0.
    """
    print("Loading training data...")
    train_data = pd.read_csv(train_path)

    print("Loading test data...")
    test_data = pd.read_csv(test_path)

    print(f"Training data shape: {train_data.shape}")
    print(f"Test data shape: {test_data.shape}")

    # Separate features and labels
    X_train = train_data.drop(['id', 'label'], axis=1).values
    y_train = train_data['label'].values

    X_test = test_data.drop(['id'], axis=1).values

    print(f"X_train shape: {X_train.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}")

    # Check for any missing values. pd.isna is used instead of np.isnan
    # because np.isnan raises TypeError on object arrays (e.g. string labels).
    print(f"Missing values in X_train: {pd.isna(X_train).sum()}")
    print(f"Missing values in y_train: {pd.isna(y_train).sum()}")
    print(f"Missing values in X_test: {pd.isna(X_test).sum()}")

    # Handle any missing values: NaN becomes 0.0 (nan_to_num also maps
    # +/-inf to the largest finite floats).
    X_train = np.nan_to_num(X_train, nan=0.0)
    X_test = np.nan_to_num(X_test, nan=0.0)

    return X_train, y_train, X_test, train_data['id'].values, test_data['id'].values


### Load and prepare data


In [3]:
# Announce the task, then pull the feature matrices, labels and row ids
# into the notebook namespace for the cells that follow.
print(
    "=== Task 2: Dimension Reduction using PCA ===",
    "Loading and preparing data...",
    sep="\n",
)
(X_train, y_train, X_test, train_ids, test_ids) = load_and_prepare_data()

=== Task 2: Dimension Reduction using PCA ===
Loading and preparing data...
Loading training data...
Loading test data...
Training data shape: (17184, 5002)
Test data shape: (4296, 5001)
X_train shape: (17184, 5000)
y_train shape: (17184,)
X_test shape: (4296, 5000)
Missing values in X_train: 0
Missing values in y_train: 0
Missing values in X_test: 0


### Display class distribution

In [4]:
# Report how many training rows carry each label and what share of the
# training set that is.
print("\nClass distribution in training data:")
unique, counts = np.unique(y_train, return_counts=True)
n_samples = len(y_train)
for label, count in zip(unique, counts):
    share = count / n_samples * 100
    print(f"Class {label}: {count} samples ({share:.2f}%)")



Class distribution in training data:
Class 0: 10633 samples (61.88%)
Class 1: 6551 samples (38.12%)


### Task 2a: Code Implementation of PCA on Train and Test Sets


In [5]:
def apply_pca_and_evaluate(X_train, y_train, X_test, n_components_list):
    """
    Apply PCA with different numbers of components and evaluate using KNN

    Task 2a: PCA implementation on train and test sets using sklearn
    Task 2b: KNN training with n_neighbors=2 for Macro F1 evaluation

    A single stratified 80/20 split of the training data is made up front and
    reused for every component count. For each count, PCA is fitted on the
    80% split only and then applied to the validation split and the test set;
    the KNN model is likewise trained on the 80% split only.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels, also used to stratify the split.
        X_test: Test feature matrix to transform and predict on.
        n_components_list: Iterable of PCA component counts to evaluate.

    Returns:
        dict keyed by component count. Each value is a dict with:
            'pca': the fitted PCA object
            'knn': the fitted KNeighborsClassifier
            'X_train_pca': PCA-transformed 80% training split
            'X_test_pca': PCA-transformed test set
            'predictions': KNN predictions for the test set
            'explained_variance_ratio': summed explained variance ratio
            'f1_macro': macro F1 on the validation split
            'accuracy': accuracy on the validation split
    """
    # Create validation split for F1 score evaluation
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    results = {}

    for n_components in n_components_list:
        print(f"\n=== PCA with {n_components} components ===")

        # Task 2a: Apply PCA using sklearn. The projection is learned from
        # the training split only, then reused for validation and test data.
        print("Applying PCA transformation...")
        pca = PCA(n_components=n_components, random_state=42)
        X_train_pca = pca.fit_transform(X_train_split)
        X_val_pca = pca.transform(X_val_split)
        X_test_pca = pca.transform(X_test)

        explained_variance = pca.explained_variance_ratio_.sum()

        print(f"Original feature space: {X_train.shape[1]} dimensions")
        print(f"Reduced feature space: {X_train_pca.shape[1]} dimensions")
        print(f"Explained variance ratio: {explained_variance:.4f}")

        # Task 2b: Train KNN model with n_neighbors=2
        print("Training KNN classifier (n_neighbors=2)...")
        knn = KNeighborsClassifier(n_neighbors=2)
        knn.fit(X_train_pca, y_train_split)

        # Make predictions on validation set for F1 score evaluation
        y_pred_val = knn.predict(X_val_pca)

        # Calculate validation metrics
        f1_macro = f1_score(y_val_split, y_pred_val, average='macro')
        accuracy = accuracy_score(y_val_split, y_pred_val)

        print(f"PCA with {n_components} components: Accuracy = {accuracy}, F1 Score = {f1_macro}")

        # Make predictions on test set for Kaggle submission
        y_pred = knn.predict(X_test_pca)

        # Store results. The validation metrics are kept alongside the models
        # so callers can report Macro F1 without re-parsing printed output.
        results[n_components] = {
            'pca': pca,
            'knn': knn,
            'X_train_pca': X_train_pca,
            'X_test_pca': X_test_pca,
            'predictions': y_pred,
            'explained_variance_ratio': explained_variance,
            'f1_macro': f1_macro,
            'accuracy': accuracy,
        }

        print(f"PCA with {n_components} components completed")

    return results


### Task 2b: Define components to test as per requirements

In [6]:
# Component counts requested by Task 2b.
n_components_list = [2000, 1000, 500, 100]

print(
    "\n=== Applying PCA with different numbers of components ===",
    f"Testing with components: {n_components_list}",
    sep="\n",
)

# Run the PCA + KNN pipeline once per requested component count.
results = apply_pca_and_evaluate(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    n_components_list=n_components_list,
)



=== Applying PCA with different numbers of components ===
Testing with components: [2000, 1000, 500, 100]

=== PCA with 2000 components ===
Applying PCA transformation...
Original feature space: 5000 dimensions
Reduced feature space: 2000 dimensions
Explained variance ratio: 0.8345
Training KNN classifier (n_neighbors=2)...
PCA with 2000 components: Accuracy = 0.49869071864998543, F1 Score = 0.49788123970603515
PCA with 2000 components completed

=== PCA with 1000 components ===
Applying PCA transformation...
Original feature space: 5000 dimensions
Reduced feature space: 1000 dimensions
Explained variance ratio: 0.6544
Training KNN classifier (n_neighbors=2)...
PCA with 1000 components: Accuracy = 0.5987780040733197, F1 Score = 0.5583393880128434
PCA with 1000 components completed

=== PCA with 500 components ===
Applying PCA transformation...
Original feature space: 5000 dimensions
Reduced feature space: 500 dimensions
Explained variance ratio: 0.4866
Training KNN classifier (n_neighbors=2)...
[remaining cell output truncated]

## Results Summary

In [7]:
# Print a fixed-width table of the summed explained variance ratio for each
# component count, in the order the counts were requested.
summary_lines = [
    "\n=== Results Summary ===",
    "Number of Components | Explained Variance Ratio",
    "-" * 45,
]
print(*summary_lines, sep="\n")
for n_components in n_components_list:
    var_ratio = results[n_components]['explained_variance_ratio']
    print(f"{n_components:^19} | {var_ratio:^23.4f}")



=== Results Summary ===
Number of Components | Explained Variance Ratio
---------------------------------------------
       2000         |         0.8345         
       1000         |         0.6544         
        500         |         0.4866         
        100         |         0.2092         


#### First 10 test-set predictions from each .csv file (one per PCA setting)

| ID | PCA 2000 Components | PCA 1000 Components | PCA 500 Components | PCA 100 Components |
|----|--------------------|--------------------|--------------------|-------------------|
| 17185 | 1 | 1 | 0 | 1 |
| 17186 | 1 | 0 | 0 | 0 |
| 17187 | 1 | 1 | 0 | 0 |
| 17188 | 1 | 0 | 0 | 0 |
| 17189 | 0 | 0 | 0 | 0 |
| 17190 | 1 | 1 | 1 | 1 |
| 17191 | 1 | 0 | 0 | 0 |
| 17192 | 0 | 0 | 0 | 0 |
| 17193 | 0 | 0 | 0 | 0 |
| 17194 | 1 | 1 | 0 | 1 |

### Complete Results Table:

| Number of Components | Explained Variance Ratio | Local Validation Macro F1 Score | **Macro F1 Score (Kaggle)** |
|---------------------|-------------------------|---------------------|-------------------------------------|
| 2000                | 0.8345                  | 0.49788            | **0.38337**                         |
| 1000                | 0.6544                  | 0.55834            | **0.72043**                         |
| 500                 | 0.4866                  | 0.55040            | **0.84799**                         |
| 100                 | 0.2092                  | 0.55751            | **0.81797**                         |