<a href="https://colab.research.google.com/github/Jnanasagara/machine-learning-lab/blob/main/lab7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Optional CatBoost import
try:
    from catboost import CatBoostClassifier
    catboost_available = True
except ImportError:
    catboost_available = False


# --- Data loading ------------------------------------------------------------
# Read the cleaned-sep28k CSV from the mounted Drive folder.
csv_path = '/content/drive/MyDrive/ml-stuttering-events-dataset/cleaned-sep28k.csv'
df = pd.read_csv(csv_path)

# Discard the 'Unnamed: 0' column when the CSV carries one
# (presumably an index written out by an earlier to_csv call).
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# 'Stuttering' is the label; every other column is treated as a feature.
# NOTE(review): the saved output of this notebook reports a CV score of 1.0 and
# test accuracy of ~1.0 for nearly every model. That pattern often means a
# feature column encodes the label (target leakage) -- TODO confirm by
# inspecting the columns of X.
y = df['Stuttering']
X = df.drop(columns=['Stuttering'])

# Non-numeric labels are mapped to integer codes; numeric labels are left as-is.
target_is_numeric = np.issubdtype(y.dtype, np.number)
if not target_is_numeric:
    le = LabelEncoder()
    y = le.fit_transform(y)

# ===== Multiple Classifier Comparison =====

print("\n=== Multiple Classifier Comparison ===")

# Hold out 30% of the rows as a test set, stratified on the label so both
# splits keep the same class proportions. The '0_binned' / '1_binned' columns
# are removed from the features when they are present.
binned_feature_columns = ['0_binned', '1_binned']
full_feature_frame = X.drop(columns=binned_feature_columns, errors='ignore')
split_result = train_test_split(
    full_feature_frame,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_result

# Randomized hyper-parameter search for the Random Forest:
# 10 combinations are sampled from the grid below, each scored with 5-fold CV.
param_dist = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)
rf = RandomForestClassifier(random_state=42)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    n_jobs=-1,  # run the CV fits on all available cores
    verbose=1,
    random_state=42,
)
rf_random.fit(X_train, y_train)

# Report the winning parameter set and its mean cross-validated score.
print("\nBest Random Forest Parameters:", rf_random.best_params_)
print("Best Cross-Validation Score:", rf_random.best_score_)

# Candidate models keyed by display name. Insertion order is the order in which
# they are later trained and listed in the results table.
classifiers = {}

# SVM and MLP are wrapped in pipelines that standardise the features first.
classifiers["SVM"] = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', probability=True, random_state=42),
)
classifiers["Decision Tree"] = DecisionTreeClassifier(random_state=42)
# The Random Forest entry is the best estimator found by the randomized search.
classifiers["Random Forest"] = rf_random.best_estimator_
classifiers["AdaBoost"] = AdaBoostClassifier(random_state=42)
classifiers["XGBoost"] = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
classifiers["Naive Bayes"] = GaussianNB()
classifiers["MLP"] = make_pipeline(
    StandardScaler(),
    MLPClassifier(max_iter=200, random_state=42),
)

# CatBoost is added only when its optional import succeeded.
if catboost_available:
    classifiers.update({"CatBoost": CatBoostClassifier(verbose=0, random_state=42)})

# Fit every candidate on the training split and collect one summary row per model:
# [name, train accuracy, test accuracy, precision, recall, F1, CV score].
results = []
weighted_metric_options = dict(average='weighted', zero_division=0)
for name in classifiers:
    model = classifiers[name]
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)

    # Predict on both splits so train and test accuracy can be compared.
    y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)

    # Support-weighted precision / recall / F1 on the test split.
    precision, recall, f1 = (
        metric(y_test, y_pred_test, **weighted_metric_options)
        for metric in (precision_score, recall_score, f1_score)
    )

    # Only the Random Forest has a cross-validation score (from the randomized
    # search); every other model gets NaN in that column.
    if name == "Random Forest":
        cv_score = rf_random.best_score_
    else:
        cv_score = np.nan

    results.append([name, train_acc, test_acc, precision, recall, f1, cv_score])

# Collect the per-model metric rows into a single comparison table.
results_df = pd.DataFrame(results, columns=[
    "Classifier","Train Accuracy","Test Accuracy","Precision","Recall","F1-Score","CV Mean Accuracy"
])
print("\nClassification Results:")
print(results_df.to_string(index=False))

# Pick the model with the highest test accuracy. idxmax() returns the first row
# holding the maximum, so when several models tie the winner is always the one
# trained first. The previous sort_values(...).iloc[0] used the default
# non-stable sort, which gives no guarantee about the order of tied rows.
# NOTE(review): choosing the winner by test accuracy and then reporting on the
# same test split makes the reported metrics optimistic; consider selecting on
# a validation split or on cross-validation scores instead.
best_row = results_df.loc[results_df["Test Accuracy"].idxmax()]
best_model_name = best_row["Classifier"]
best_model = classifiers[best_model_name]
print(f"\nBest Model: {best_model_name}")
# Concatenate rather than pass the report as a second print() argument: the
# separator space print() inserts would shift the report's header row by one
# column relative to the rows below it.
print("Classification Report:\n" + classification_report(y_test, best_model.predict(X_test)))


=== Multiple Classifier Comparison ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Best Random Forest Parameters: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': False}
Best Cross-Validation Score: 1.0

Training SVM...

Training Decision Tree...

Training Random Forest...

Training AdaBoost...

Training XGBoost...

Training Naive Bayes...

Training MLP...

Classification Results:
   Classifier  Train Accuracy  Test Accuracy  Precision   Recall  F1-Score  CV Mean Accuracy
          SVM         1.00000       0.999678   0.999678 0.999678  0.999678               NaN
Decision Tree         1.00000       1.000000   1.000000 1.000000  1.000000               NaN
Random Forest         1.00000       1.000000   1.000000 1.000000  1.000000               1.0
     AdaBoost         1.00000       1.000000   1.000000 1.000000  1.000000               NaN
      XGBoost         1.00000       1.000000   1.000000 1.000000  1.000000       