# IEE 520 ML Project - Binary Classification

**Name:** Harsha Koushik Teja Aila  
**Date:** January 2025

## Goal
Build a classifier that predicts binary labels on imbalanced data. The objective is to minimize the Balanced Error Rate (BER), i.e. 1 − balanced accuracy.

## Dataset
- 10,000 labeled samples for training
- 10,000 unlabeled samples to generate predictions for
- 21 features (mix of ordinal, numerical, binary)

In [None]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

## Load and explore data

In [None]:
# Read both Excel workbooks: the labeled training set first, then the unlabeled set to score
labeled, unlabeled = map(
    pd.read_excel,
    ("ProjectLABELED2025.xlsx", "ProjectNOTLABELED2025.xlsx"),
)

# Report (rows, columns) for each frame as a quick sanity check
print(
    f"Labeled shape: {labeled.shape}",
    f"Unlabeled shape: {unlabeled.shape}",
    sep="\n",
)

In [None]:
# preview the first rows to sanity-check column names and value formats
labeled.head()

In [None]:
# column dtypes and non-null counts; a first look at where values are missing
labeled.info()

In [None]:
# Class balance of the target: absolute counts first, then relative frequencies
label_counts = labeled['label'].value_counts()
label_shares = labeled['label'].value_counts(normalize=True)

print(label_counts)
print("\nClass distribution:")
print(label_shares)

In [None]:
# Bar chart of label counts to make the class imbalance visible
class_fig, class_ax = plt.subplots(figsize=(8, 5))
labeled['label'].value_counts().plot.bar(color=['steelblue', 'coral'], ax=class_ax)
class_ax.set(title='Class Distribution', xlabel='Label', ylabel='Count')
# keep the class labels horizontal (pandas rotates bar tick labels by default)
plt.xticks(rotation=0)
plt.show()

In [None]:
# Count missing entries in every column of the labeled data
missing_per_column = labeled.isna().sum()

print("Missing values per column:")
print(missing_per_column)

In [None]:
# The spreadsheets carry their row identifier in an unnamed first column.
# Keep it aside (the unlabeled one is needed for the submission file) and
# remove it from the frames used for modelling.
INDEX_COL = "Unnamed: 0"

labeled_index, unlabeled_index = labeled[INDEX_COL], unlabeled[INDEX_COL]

labeled = labeled.drop(columns=INDEX_COL)
unlabeled_clean = unlabeled.drop(columns=INDEX_COL)

## Feature Engineering

Based on the project description:
- x2, x3, x4 are ordinal
- x15-x21 are numerical
- Rest are binary

In [None]:
# Feature groups by type: x2-x4 ordinal, x15-x21 numeric, everything else
# (apart from the target) is treated as binary.
ordinal_cols = [f"x{i}" for i in (2, 3, 4)]
numeric_cols = [f"x{i}" for i in range(15, 22)]

non_binary = {*ordinal_cols, *numeric_cols, "label"}
binary_cols = [col for col in labeled.columns if col not in non_binary]

print(f"Ordinal: {ordinal_cols}")
print(f"Numeric: {numeric_cols}")
print(f"Binary: {binary_cols}")

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the numeric features
labeled.loc[:, numeric_cols].describe()

In [None]:
# Pairwise correlations between the numeric features, drawn as an annotated heatmap
numeric_corr = labeled[numeric_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    numeric_corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,  # centre the diverging colormap on zero correlation
)
plt.title('Correlation Matrix - Numerical Features')
plt.tight_layout()
plt.show()

## Preprocessing Pipeline

Using ColumnTransformer to handle different feature types:
- Ordinal: impute with mode, then ordinal encode
- Numerical: impute with mean, then standardize
- Binary: just impute with mode

In [None]:
# Column-wise preprocessing shared by every model pipeline.
# Each feature group gets its own imputation and encoding/scaling steps.
preprocessor = ColumnTransformer([
    # Ordinal features: fill gaps with the most frequent level, then map levels to integers.
    # A level that was not present when the encoder was fitted (possible in a CV fold,
    # the validation split, or the unlabeled data) is encoded as -1 instead of raising
    # at transform time.
    # NOTE(review): OrdinalEncoder orders levels by sorted value; confirm that this
    # matches the intended ordering of x2-x4.
    ("ord", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
    ]), ordinal_cols),

    # Numeric features: mean imputation followed by standardisation
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler())
    ]), numeric_cols),

    # Binary features: only fill gaps with the most frequent value
    ("bin", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent"))
    ]), binary_cols),
])

## Train/Val Split

In [None]:
# Separate the target from the features
y = labeled["label"]
X = labeled.drop(columns="label")

# Hold out 20% for validation; stratifying on y keeps the class ratio in both parts
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print(f"Train size: {X_train.shape[0]}")
print(f"Val size: {X_val.shape[0]}")
print(f"\nTrain class dist: {y_train.value_counts().to_dict()}")
print(f"Val class dist: {y_val.value_counts().to_dict()}")

In [None]:
# Cross-validation scheme used by the grid searches:
# 5 stratified folds, shuffled with a fixed seed so the folds are reproducible
cv = StratifiedKFold(
    random_state=42,
    shuffle=True,
    n_splits=5,
)

## Model Training

Trying 3 models:
1. Random Forest
2. SVM with RBF kernel
3. Logistic Regression (baseline)

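GridSearchCV is used with 5-fold stratified cross-validation on the training split, scored with balanced_accuracy because the classes are imbalanced. Only the Random Forest grid contains multiple candidates; the SVM and Logistic Regression grids each hold a single fixed configuration.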

In [None]:
# Candidate models. Each entry maps a display name to a
# (pipeline, hyperparameter grid) pair consumed by the grid search.
def _with_preprocessing(classifier):
    """Return a two-step pipeline: the shared preprocessor, then the classifier as "clf"."""
    return Pipeline([("prep", preprocessor), ("clf", classifier)])


# Random Forest is the only candidate with a real search space (2 x 2 x 2 settings)
rf_grid = {
    "clf__n_estimators": [200, 400],
    "clf__max_depth": [None, 20],
    "clf__class_weight": [None, "balanced"],
}

# SVM and Logistic Regression are each run with one fixed, class-weighted configuration
svm_grid = {
    "clf__kernel": ["rbf"],
    "clf__C": [1.0],
    "clf__gamma": ["scale"],
    "clf__class_weight": ["balanced"],
}
logreg_grid = {
    "clf__C": [1.0],
    "clf__class_weight": ["balanced"],
}

models = {
    "RandomForest": (_with_preprocessing(RandomForestClassifier(random_state=42)), rf_grid),
    "SVM_RBF": (_with_preprocessing(SVC()), svm_grid),
    "LogisticRegression": (_with_preprocessing(LogisticRegression(max_iter=500)), logreg_grid),
}

In [None]:
# train and evaluate
# For every candidate: tune on the training split with cross-validation, then score
# the best configuration once on the held-out validation split.
# results[name] keeps the CV score, validation metrics, confusion matrix,
# winning hyperparameters and the fitted pipeline.
results = {}

for name, (pipe, grid) in models.items():
    print(f"\n{'='*50}")
    print(f"Training {name}...")
    print(f"{'='*50}")

    grid_search = GridSearchCV(
        pipe, grid, scoring="balanced_accuracy",
        cv=cv, n_jobs=-1, verbose=2
    )
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the winning configuration refit on the whole training split
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_val)

    ba = balanced_accuracy_score(y_val, y_pred)
    ber = 1 - ba  # Balanced Error Rate is the complement of balanced accuracy
    val_cm = confusion_matrix(y_val, y_pred)  # computed once, reused for storage and printing

    results[name] = {
        "best_cv": grid_search.best_score_,
        "best_params": grid_search.best_params_,
        "val_bal_acc": ba,
        "val_BER": ber,
        "confusion_matrix": val_cm,
        "model": best_model
    }

    print(f"\nBest CV Score: {grid_search.best_score_:.4f}")
    print(f"Best Params: {grid_search.best_params_}")
    print(f"Val Balanced Accuracy: {ba:.4f}")
    print(f"Val BER: {ber:.4f}")
    print(f"Confusion Matrix:\n{val_cm}")

## Model Comparison

In [None]:
# One row per model with its CV and validation scores, best (lowest BER) first
comparison_columns = ['Model', 'CV Bal Acc', 'Val Bal Acc', 'Val BER']
comparison_rows = [
    {
        'Model': model_name,
        'CV Bal Acc': entry['best_cv'],
        'Val Bal Acc': entry['val_bal_acc'],
        'Val BER': entry['val_BER'],
    }
    for model_name, entry in results.items()
]
comparison = pd.DataFrame(comparison_rows, columns=comparison_columns).sort_values('Val BER')

print(comparison)

In [None]:
# Grouped bar chart: CV vs. validation balanced accuracy for each model
fig, ax = plt.subplots(figsize=(10, 6))

x = np.arange(len(comparison))
width = 0.35

# one bar series per metric, shifted half a bar width left/right of the tick
for shift, metric in ((-width / 2, 'CV Bal Acc'), (width / 2, 'Val Bal Acc')):
    ax.bar(x + shift, comparison[metric], width, label=metric, alpha=0.8)

ax.set(
    xlabel='Model',
    ylabel='Balanced Accuracy',
    title='Model Performance Comparison',
)
ax.set_xticks(x)
ax.set_xticklabels(comparison['Model'])
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Pick the model with the lowest validation BER (first one wins on ties)
best_name, best_entry = min(results.items(), key=lambda item: item[1]["val_BER"])
final_model = best_entry["model"]

print(f"\nBest model: {best_name}")
print(f"Val BER: {results[best_name]['val_BER']:.4f}")

In [None]:
# confusion matrices for all models
# One heatmap panel per evaluated model. The grid is sized from `results`, so the
# cell keeps working when models are added or removed.
n_models = len(results)
# squeeze=False always returns a 2-D axes array, even for a single panel
fig, axes = plt.subplots(1, n_models, figsize=(5 * n_models, 4), squeeze=False)

for panel, (name, result) in zip(axes.ravel(), results.items()):
    sns.heatmap(result['confusion_matrix'], annot=True, fmt='d', cmap='Blues', ax=panel, cbar=False)
    panel.set_title(f"{name}\nBER: {result['val_BER']:.4f}")
    panel.set_xlabel('Predicted')
    panel.set_ylabel('Actual')

plt.tight_layout()
plt.show()

## Final Predictions

Retrain best model on full dataset and generate predictions for unlabeled data

In [None]:
# retrain on full labeled data
# X / y are all labeled rows (training + validation split), so the selected
# pipeline, preprocessing included, is refit on everything available.
# NOTE: final_model is the same object stored in results[best_name]["model"];
# after this fit the validation metrics recorded earlier no longer describe it.
print(f"Retraining {best_name} on full dataset...")
final_model.fit(X, y)
print("Done!")

In [None]:
# Score the unlabeled rows with the retrained pipeline
final_predictions = final_model.predict(unlabeled_clean)

# How many labels of each class were predicted
prediction_counts = pd.Series(final_predictions).value_counts()

print(f"Generated {len(final_predictions)} predictions")
print(f"\nPrediction distribution:")
print(prediction_counts)

In [None]:
# Submission table: original row identifier next to its predicted label
submission = unlabeled_index.to_frame(name="index")
submission["label"] = final_predictions

submission.head()

In [None]:
# Write the submission to disk, creating the target folder on first run
output_filename = "output/ProjectPredictions2025HarshaKoushikTejaAila.csv"

os.makedirs(os.path.dirname(output_filename), exist_ok=True)
submission.to_csv(output_filename, index=False)

print(f"Saved predictions to {output_filename}")


## Summary

- Tested 3 models with grid search
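- SVM with RBF kernel performed best in the recorded run (the notebook selects whichever model has the lowest validation BER)
- Used balanced accuracy as the scoring metric, together with class weighting, to account for class imbalance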
- Final predictions saved for submission

Possible improvements:
- Try more hyperparameter values
- Feature selection
- Try ensemble methods (XGBoost, LightGBM)
- SMOTE for handling imbalance