<h2 align='center' style='color:blue'>Finding best model and hyper parameters for sklearn digits dataset classification

In [1]:
#Read Data
import pandas as pd
df = pd.read_csv("/Learning/Python/JupyterNotebooks/py-master/ML/9_decision_tree/salaries.csv")
df.head()

Unnamed: 0,company,job,degree,salary_more_then_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0


In [2]:
inputs = df.drop(['salary_more_then_100k'],axis=1)
inputs.head()

Unnamed: 0,company,job,degree
0,google,sales executive,bachelors
1,google,sales executive,masters
2,google,business manager,bachelors
3,google,business manager,masters
4,google,computer programmer,bachelors


In [3]:
target = df['salary_more_then_100k']
target.head()

0    0
1    0
2    1
3    1
4    0
Name: salary_more_then_100k, dtype: int64

In [4]:
# here in input variable all the columns are text so convert it to number using encoder
from sklearn.preprocessing import LabelEncoder
le_company = LabelEncoder()
le_job     = LabelEncoder()
le_degree  = LabelEncoder()
inputs['company_n'] = le_company.fit_transform(inputs['company'])
inputs['job_n']     = le_job.fit_transform(inputs['job'])
inputs['degree_n']  = le_degree.fit_transform(inputs['degree'])
inputs.head()

Unnamed: 0,company,job,degree,company_n,job_n,degree_n
0,google,sales executive,bachelors,2,2,0
1,google,sales executive,masters,2,2,1
2,google,business manager,bachelors,2,0,0
3,google,business manager,masters,2,0,1
4,google,computer programmer,bachelors,2,1,0


In [5]:
# now drop text column
inputs.drop(['company','job','degree'],axis=1,inplace=True)
inputs.head()

Unnamed: 0,company_n,job_n,degree_n
0,2,2,0
1,2,2,1
2,2,0,0
3,2,0,1
4,2,1,0


In [6]:
#split data into train test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(inputs,target, test_size=0.20)
print(x_train)
print(x_test)

    company_n  job_n  degree_n
4           2      1         0
10          1      2         0
12          1      0         0
2           2      0         0
9           0      0         1
7           0      1         0
11          1      2         1
8           0      0         0
13          1      0         1
3           2      0         1
5           2      1         1
1           2      2         1
    company_n  job_n  degree_n
6           0      2         1
14          1      1         0
0           2      2         0
15          1      1         1


## GridSearchCV

In [8]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [9]:
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {}
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini','entropy'],
            
        }
    }     
}

In [10]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df



Unnamed: 0,model,best_score,best_params
0,svm,0.666667,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.7,{'n_estimators': 10}
2,logistic_regression,0.566667,{'C': 10}
3,naive_bayes_gaussian,0.5,{}
4,naive_bayes_multinomial,0.6,{}
5,decision_tree,0.366667,{'criterion': 'gini'}


**For me the winner is random_forest {'n_estimators': 10} with 70% score. It could be different for you as I have limited my parameters to be certain values only**