# Machine Learning Model Comparison & Hyperparameter Tuning

This Colab notebook trains multiple classifiers on the UCI Wine dataset, evaluates them with accuracy, precision, recall, and F1-score, and applies GridSearchCV & RandomizedSearchCV to find the best model.

## 1. Setup & Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from scipy.stats import randint, uniform
import warnings, json
warnings.filterwarnings('ignore')

## 2. Load Dataset

In [2]:
# Fetch the Wine data as pandas objects; `.frame` holds the features plus the
# 'target' column in a single DataFrame.
wine = load_wine(as_frame=True)
df = wine.frame

# Keep a CSV copy next to the notebook so the data is available as a plain file.
df.to_csv('wine.csv', index=False)

# Separate the label column from the predictors.
y = df['target']
X = df.drop(columns='target')

# 80/20 hold-out split: stratified on the label, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f'Dataset shape: {df.shape}, saved as wine.csv')

Dataset shape: (178, 14), saved as wine.csv


## 3. Baseline Models

In [3]:
# Baseline estimators with default hyperparameters. The linear model and the
# SVM get a StandardScaler in front of them; the random forest is used as-is.
models = {
    'LogisticRegression': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(max_iter=1000)),
    ]),
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVC': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', SVC()),
    ]),
}

# Scorers that are reported as class-support-weighted averages.
weighted_scorers = {
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

# Fit each baseline on the training split and score it on the held-out split.
metrics = {}
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    scores = {'accuracy': accuracy_score(y_test, y_pred)}
    for metric_name, scorer in weighted_scorers.items():
        scores[metric_name] = scorer(y_test, y_pred, average='weighted')
    metrics[name] = scores

print('Baseline performance:')
# One row per model, one column per metric.
print(pd.DataFrame(metrics).T)

Baseline performance:
                    accuracy  precision    recall       f1
LogisticRegression  0.972222   0.974074  0.972222  0.97197
RandomForest        1.000000   1.000000  1.000000  1.00000
SVC                 0.972222   0.974074  0.972222  0.97197


## 4. Hyperparameter Tuning

In [4]:
# 4.1 GridSearchCV for RandomForest
# Exhaustive search: 3 x 3 x 3 = 27 candidates, each scored by 5-fold CV on the
# training split using weighted F1.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
)

grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,  # use every available core
)
grid_rf.fit(X_train, y_train)

# Keep the winning estimator for the comparison step further down.
best_rf = grid_rf.best_estimator_
print('Best RF params:', grid_rf.best_params_)

Best RF params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}


In [5]:
# 4.2 RandomizedSearchCV for SVC
# The SVM sits behind a scaler, so its hyperparameters are addressed through
# the 'clf__' step prefix.
svc_pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('clf', SVC())])

# scipy's uniform(loc, scale) samples from [loc, loc + scale]: C is drawn from
# [0.1, 10.1] and gamma from [0.001, 0.101].
param_dist_svc = {
    'clf__C': uniform(loc=0.1, scale=10),
    'clf__gamma': uniform(loc=0.001, scale=0.1),
    'clf__kernel': ['rbf'],
}

# 20 sampled candidates, each scored by 5-fold CV with weighted F1; the seed
# makes the sampled candidates repeatable.
rand_svc = RandomizedSearchCV(
    estimator=svc_pipeline,
    param_distributions=param_dist_svc,
    n_iter=20,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
rand_svc.fit(X_train, y_train)

# Keep the winning pipeline for the comparison step further down.
best_svc = rand_svc.best_estimator_
print('Best SVC params:', rand_svc.best_params_)

Best SVC params: {'clf__C': np.float64(4.419450186421157), 'clf__gamma': np.float64(0.030122914019804194), 'clf__kernel': 'rbf'}


## 5. Compare Tuned Models

In [7]:
# Score the tuned estimators on the held-out split and add them to the same
# `metrics` table that already holds the baseline rows.
tuned_models = {'BestRandomForest': best_rf, 'BestSVC': best_svc}
for name, model in tuned_models.items():
    y_pred = model.predict(X_test)
    metrics[name] = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1': f1_score(y_test, y_pred, average='weighted')
    }

# Rank by weighted F1, best first. A stable sort is requested explicitly so
# that models tied on F1 keep their insertion order (baselines before tuned
# models); the default sort kind gives no such guarantee, which would make the
# "best" model arbitrary whenever the top scores are equal.
results_df = pd.DataFrame(metrics).T.sort_values('f1', ascending=False, kind='mergesort')
print('All model performances:')
print(results_df)

# NOTE(review): the winner is picked using test-split scores, so the score
# reported for the selected model is optimistic. Consider ranking on
# cross-validation scores and keeping the test split for a final check only.
best_model_name = results_df.index[0]
print(f'Selected best model: {best_model_name}')

All model performances:
                    accuracy  precision    recall       f1
RandomForest        1.000000   1.000000  1.000000  1.00000
BestRandomForest    1.000000   1.000000  1.000000  1.00000
LogisticRegression  0.972222   0.974074  0.972222  0.97197
SVC                 0.972222   0.974074  0.972222  0.97197
BestSVC             0.944444   0.951389  0.944444  0.94321
Selected best model: RandomForest


## 6. Save Best Model (Optional)

In [9]:
import joblib

# Look the winner up by name. Tuned estimators are merged in last so they take
# precedence over a baseline if the two dictionaries ever share a key.
candidates = {**models, **tuned_models}
best_model = candidates.get(best_model_name)

if best_model is None:
    # Nothing was written, so do not report success.
    print(f"Model '{best_model_name}' not found in tuned_models or models dictionaries.")
else:
    # Persist the fitted estimator next to the notebook as '<name>.joblib'.
    joblib.dump(best_model, f'{best_model_name}.joblib')
    print('Model saved!')

Model saved!
