In [9]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [3]:
digits = load_digits()
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [5]:
digits_df = pd.DataFrame(digits.data)
digits_df['target'] = digits.target
digits_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


In [68]:
model_params = {
    "MultinomialNB": {
        "model": MultinomialNB(),
        "parameters": {
            "alpha": [0.1, 1.0, 10.0],
            "fit_prior": [True, False]
        }
    },
    "RandomForestClassifier": {
        "model": RandomForestClassifier(),
        "parameters": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20],
            "random_state": [42]
        }
    },
    "DecisionTreeClassifier": {
        "model": DecisionTreeClassifier(),
        "parameters": {
            "criterion": ["gini", "entropy"],
            "max_depth": [None, 5, 10],
            "min_samples_split": [2, 5, 10]
        }
    },
    "SVC": {
        "model": SVC(),
        "parameters": {
            "C": [0.1, 1.0, 10.0],
            "kernel": ["rbf", "linear","sigmoid"],
            "gamma": ["scale", "auto"]
        }
    }
}


In [42]:
def get_model(model_params):
    for model_name, params in model_params.items():
        print(model_name, params)
        
get_model(model_params)

MultinomialNB {'parameters': {'alpha': 1.0, 'fit_prior': True}}
RandomForestClassifier {'parameters': {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}}
DecisionTreeClassifier {'parameters': {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2}}
SVC {'parameters': {'C': 1.0, 'kernel': 'rbf', 'gamma': 'auto'}}


In [43]:
X = digits_df.drop(columns='target')
y = digits_df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_s ize=0.25)
len(X_train)

1347

In [58]:
#Use xgboost model first
from xgboost import XGBRegressor
xgb_model = XGBRegressor(random_state=2024)
svm_model = SVC()

In [59]:
search_space = {
    "n_estimators": [100,200,500],
    "max_depth": [3,6,9],
    "gamma": [0.01,0.1],
    "learning_rate": [0.001,0.01,0.1,1]
}
 
search_sp2 = {
    "C":[0.01,0.1,1],
    "gamma": ["auto", "scale"],
    "kernel": ["linear","rbf","sigmoid"]
}

In [60]:
GS = GridSearchCV(estimator = svm_model, 
                  param_grid = search_sp2,
                  scoring = ["r2","neg_root_mean_squared_error"],
                  refit = "r2",
                  cv = 5,
                  verbose = 4
                 )

In [61]:
GS.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END C=0.01, gamma=auto, kernel=linear; neg_root_mean_squared_error: (test=-0.720) r2: (test=0.937) total time=   0.0s
[CV 2/5] END C=0.01, gamma=auto, kernel=linear; neg_root_mean_squared_error: (test=-0.630) r2: (test=0.952) total time=   0.0s
[CV 3/5] END C=0.01, gamma=auto, kernel=linear; neg_root_mean_squared_error: (test=-0.572) r2: (test=0.960) total time=   0.0s
[CV 4/5] END C=0.01, gamma=auto, kernel=linear; neg_root_mean_squared_error: (test=-0.622) r2: (test=0.953) total time=   0.0s
[CV 5/5] END C=0.01, gamma=auto, kernel=linear; neg_root_mean_squared_error: (test=-0.896) r2: (test=0.903) total time=   0.0s
[CV 1/5] END C=0.01, gamma=auto, kernel=rbf; neg_root_mean_squared_error: (test=-3.246) r2: (test=-0.277) total time=   0.1s
[CV 2/5] END C=0.01, gamma=auto, kernel=rbf; neg_root_mean_squared_error: (test=-3.239) r2: (test=-0.275) total time=   0.1s
[CV 3/5] END C=0.01, gamma=auto, kernel=rbf; neg_root_

[CV 1/5] END C=1, gamma=auto, kernel=rbf; neg_root_mean_squared_error: (test=-2.751) r2: (test=0.082) total time=   0.1s
[CV 2/5] END C=1, gamma=auto, kernel=rbf; neg_root_mean_squared_error: (test=-2.778) r2: (test=0.062) total time=   0.1s
[CV 3/5] END C=1, gamma=auto, kernel=rbf; neg_root_mean_squared_error: (test=-2.645) r2: (test=0.138) total time=   0.1s
[CV 4/5] END C=1, gamma=auto, kernel=rbf; neg_root_mean_squared_error: (test=-2.749) r2: (test=0.072) total time=   0.1s
[CV 5/5] END C=1, gamma=auto, kernel=rbf; neg_root_mean_squared_error: (test=-2.681) r2: (test=0.128) total time=   0.1s
[CV 1/5] END C=1, gamma=auto, kernel=sigmoid; neg_root_mean_squared_error: (test=-3.246) r2: (test=-0.277) total time=   0.0s
[CV 2/5] END C=1, gamma=auto, kernel=sigmoid; neg_root_mean_squared_error: (test=-3.239) r2: (test=-0.275) total time=   0.0s
[CV 3/5] END C=1, gamma=auto, kernel=sigmoid; neg_root_mean_squared_error: (test=-3.224) r2: (test=-0.281) total time=   0.0s
[CV 4/5] END C=1,

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.01, 0.1, 1], 'gamma': ['auto', 'scale'],
                         'kernel': ['linear', 'rbf', 'sigmoid']},
             refit='r2', scoring=['r2', 'neg_root_mean_squared_error'],
             verbose=4)

In [62]:
GS.best_estimator_

SVC(C=1)

In [63]:
print(GS.best_params_)
print(GS.best_score_)

{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
0.9697884303913076


In [55]:
df_results = pd.DataFrame(GS.cv_results_)
df_results = df_results.sort_values("rank_test_r2")
df_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_r2,...,std_test_r2,rank_test_r2,split0_test_neg_root_mean_squared_error,split1_test_neg_root_mean_squared_error,split2_test_neg_root_mean_squared_error,split3_test_neg_root_mean_squared_error,split4_test_neg_root_mean_squared_error,mean_test_neg_root_mean_squared_error,std_test_neg_root_mean_squared_error,rank_test_neg_root_mean_squared_error
22,0.330316,0.015909,0.039634,0.006097,0.01,0.1,6,200,"{'gamma': 0.01, 'learning_rate': 0.1, 'max_dep...",0.820895,...,0.030228,1,-1.164815,-0.865082,-1.135729,-1.12305,-1.179525,-1.09364,0.116032,1
23,0.495737,0.025656,0.036076,0.00199,0.01,0.1,6,500,"{'gamma': 0.01, 'learning_rate': 0.1, 'max_dep...",0.820895,...,0.030228,2,-1.164815,-0.865082,-1.135729,-1.12305,-1.179525,-1.09364,0.116032,2
21,0.237014,0.020526,0.03529,0.004167,0.01,0.1,6,100,"{'gamma': 0.01, 'learning_rate': 0.1, 'max_dep...",0.820138,...,0.03046,3,-1.167275,-0.869288,-1.139976,-1.131796,-1.18696,-1.099059,0.116551,3
20,0.536857,0.035989,0.038163,0.003178,0.01,0.1,3,500,"{'gamma': 0.01, 'learning_rate': 0.1, 'max_dep...",0.827492,...,0.02988,4,-1.143162,-0.985581,-1.023269,-1.227563,-1.120199,-1.099955,0.086628,4
57,0.185315,0.014963,0.036413,0.003877,0.1,0.1,6,100,"{'gamma': 0.1, 'learning_rate': 0.1, 'max_dept...",0.81249,...,0.031965,5,-1.191834,-0.876829,-1.117104,-1.133143,-1.199705,-1.103723,0.117899,5


# Do it for various models and selection the best model

In [67]:

# Perform grid search for each model
best_models = {}
for model_name, mp in model_params.items():
    print(f"Running GridSearchCV for {model_name}...")
    grid_search = GridSearchCV(mp["model"], mp["parameters"], cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy for {model_name}: {grid_search.best_score_:.4f}\n")

Running GridSearchCV for MultinomialNB...
Best parameters for MultinomialNB: {'alpha': 0.1, 'fit_prior': False}
Best cross-validation accuracy for MultinomialNB: 0.9027

Running GridSearchCV for RandomForestClassifier...
Best parameters for RandomForestClassifier: {'max_depth': None, 'n_estimators': 200, 'random_state': 42}
Best cross-validation accuracy for RandomForestClassifier: 0.9762

Running GridSearchCV for DecisionTreeClassifier...
Best parameters for DecisionTreeClassifier: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 5}
Best cross-validation accuracy for DecisionTreeClassifier: 0.8515

Running GridSearchCV for SVC...
Best parameters for SVC: {'C': 10.0, 'gamma': 'scale', 'kernel': 'rbf'}
Best cross-validation accuracy for SVC: 0.9933



In [69]:
from sklearn.metrics import accuracy_score
# Test the best models on the test set
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test set accuracy for {model_name}: {accuracy:.4f}")

Test set accuracy for MultinomialNB: 0.8711
Test set accuracy for RandomForestClassifier: 0.9733
Test set accuracy for DecisionTreeClassifier: 0.8356
Test set accuracy for SVC: 0.9822


The best model for digits classification is SVM