# Selecting Best Model

In [84]:
import pandas as pd

In [85]:
import warnings
warnings.filterwarnings("ignore")

In [86]:
from sklearn.datasets import load_digits

In [87]:
dataset = load_digits()
dir(dataset)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [88]:
df = pd.DataFrame(data = dataset.data, columns = dataset.feature_names)
df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [89]:
X = df
X.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [90]:
y = dataset.target
y[:15]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4])

In [91]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [92]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [93]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [94]:
model_params = {
    "logistic_regression" : {
        "model" : LogisticRegression(),
        "params" : {
            "C" : [0.1, 1, 10]
        }
    },
    "svc" : {
        "model" : SVC(gamma = "auto"),
        "params" : {
            "C" : [0.1, 1, 10],
            "kernel" : ["linear", "rbf"]
        }
    },
    "random_forest" : {
        "model" : RandomForestClassifier(),
        "params" : {
            "n_estimators" : [10,50,100]
        }
    },
    "decision_tree" : {
        "model" : DecisionTreeClassifier(),
        "params" : {
            "max_depth" : [None, 5, 10]
        }
    },
    "multinomialNB" : {
        "model" : MultinomialNB(),
        "params" : {
            "alpha" : [0.01, 0.1, 1, 10]
        }
    },
    "gaussianNB" : {
        "model" : GaussianNB(),
        "params" : {
            "var_smoothing" : [1e-8, 1e-9, 1e-10]
        }
    }
}

In [95]:
scores = []
for model, model_param in model_params.items():
    clf = GridSearchCV(model_param["model"], model_param["params"], cv = 5)
    clf.fit(X_train, y_train)
    scores.append({
        "model" : model,
        "best_score" : clf.best_score_,
        "best_parameter" : clf.best_params_
    })
scores

[{'model': 'logistic_regression',
  'best_score': 0.9578258394991463,
  'best_parameter': {'C': 10}},
 {'model': 'svc',
  'best_score': 0.9681749193701386,
  'best_parameter': {'C': 0.1, 'kernel': 'linear'}},
 {'model': 'random_forest',
  'best_score': 0.9713811420982736,
  'best_parameter': {'n_estimators': 100}},
 {'model': 'decision_tree',
  'best_score': 0.8241984443179662,
  'best_parameter': {'max_depth': 10}},
 {'model': 'multinomialNB',
  'best_score': 0.9037374312274711,
  'best_parameter': {'alpha': 10}},
 {'model': 'gaussianNB',
  'best_score': 0.8449155757920698,
  'best_parameter': {'var_smoothing': 1e-08}}]

In [109]:
score_df = pd.DataFrame(scores)
score_df

Unnamed: 0,model,best_score,best_parameter
0,logistic_regression,0.957826,{'C': 10}
1,svc,0.968175,"{'C': 0.1, 'kernel': 'linear'}"
2,random_forest,0.971381,{'n_estimators': 100}
3,decision_tree,0.824198,{'max_depth': 10}
4,multinomialNB,0.903737,{'alpha': 10}
5,gaussianNB,0.844916,{'var_smoothing': 1e-08}
