# Classifiers Applied to Automobile MPG Dataset

Auto MPG Dataset: https://archive.ics.uci.edu/dataset/9/auto+mpg

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load and preprocess the data
file_path = 'auto-mpg.data'
columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name']
data = pd.read_csv(file_path, delim_whitespace=True, names=columns)

data['horsepower'].replace('?', np.nan, inplace=True)
data.dropna(subset=['horsepower'], inplace=True)
data['horsepower'] = data['horsepower'].astype(float)

median_mpg = data['mpg'].median()
data['mpg_high'] = (data['mpg'] > median_mpg).astype(int)
data.drop(['mpg', 'car name'], axis=1, inplace=True)

X = data.drop('mpg_high', axis=1)
Y = data['mpg_high']

In [3]:
# Function for running trials
def run_trials(X, Y, classifier, param_grid, test_sizes):
    results = []
    for test_size in test_sizes:
        for trial in range(3):  # 3 trials
            # Data splitting and scaling
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Hyperparameter tuning
            grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')
            grid_search.fit(X_train_scaled, Y_train)

            best_model = grid_search.best_estimator_
            train_accuracy = accuracy_score(Y_train, best_model.predict(X_train_scaled))
            test_accuracy = accuracy_score(Y_test, best_model.predict(X_test_scaled))
            results.append((train_accuracy, test_accuracy, grid_search.best_params_))
            # Averaging results
    avg_train_accuracy = np.mean([result[0] for result in results])
    avg_test_accuracy = np.mean([result[1] for result in results])
    best_params = results[np.argmax([result[1] for result in results])][2]

    return avg_train_accuracy, avg_test_accuracy, best_params

In [4]:
# Parameter grids for classifiers
param_grid_rf = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]}
param_grid_svm = {'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01]}
param_grid_knn = {'n_neighbors': [3, 5, 7]}
param_grid_dt = {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}

In [5]:
# Running trials for each classifier
test_sizes = [0.2, 0.5, 0.8]
classifiers = {
    "Random Forest": (RandomForestClassifier(random_state=42), param_grid_rf),
    "SVM": (SVC(random_state=42), param_grid_svm),
    "KNN": (KNeighborsClassifier(), param_grid_knn),
    "Decision Tree": (DecisionTreeClassifier(random_state=42), param_grid_dt)
}

In [6]:
final_results = {}
for name, (classifier, param_grid) in classifiers.items():
    train_acc, test_acc, best_params = run_trials(X, Y, classifier, param_grid, test_sizes)
    final_results[name] = {
        "Average Train Accuracy": train_acc, 
        "Average Test Accuracy": test_acc, 
        "Best Params": best_params
    }

In [7]:
# Display final results
for classifier, results in final_results.items():
    print(f"{classifier} Classifier Results:")
    for key, value in results.items():
        print(f"{key}: {value}")
    print()

Random Forest Classifier Results:
Average Train Accuracy: 1.0
Average Test Accuracy: 0.9004714681163927
Best Params: {'max_depth': None, 'n_estimators': 100}

SVM Classifier Results:
Average Train Accuracy: 0.9701673902951858
Average Test Accuracy: 0.8520062624743521
Best Params: {'C': 0.1, 'gamma': 0.1}

KNN Classifier Results:
Average Train Accuracy: 0.9475720410225202
Average Test Accuracy: 0.89907889302621
Best Params: {'n_neighbors': 7}

Decision Tree Classifier Results:
Average Train Accuracy: 0.9850931689133423
Average Test Accuracy: 0.8698848643706456
Best Params: {'max_depth': None, 'min_samples_split': 10}

