In [1]:
# Colab-only setup: mount Google Drive under /content/gdrive, then switch the
# working directory to the exercise folder so that files opened later with a
# relative path (such as 'Iris.xls') are looked up there.
from google.colab import drive
drive.mount("/content/gdrive")
# NOTE(review): this path is specific to one Drive account; adjust it when the
# notebook is run elsewhere.
%cd '/content/gdrive/MyDrive/LDS6_S7N_TranHoangBach/BaiTap/Chapter_9_KyThuatBoSung'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS6_S7N_TranHoangBach/BaiTap/Chapter_9_KyThuatBoSung


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import pandas_profiling as pp
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [4]:
# Read the Iris sheet from the current working directory.
df = pd.read_excel('Iris.xls')

# Fit the encoder on the 'iris' column first, then overwrite that column with
# its integer codes. The fitted encoder stays available as `label_encoder`.
label_encoder = LabelEncoder().fit(df['iris'])
df['iris'] = label_encoder.transform(df['iris'])

# Preview the first rows of the encoded frame.
df.head()

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,iris
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [5]:
# Features are every column except the encoded target 'iris'.
X, y = df.drop(columns=['iris']), df['iris']

# Hold out a test split; test_size=0.25 is the library default written out,
# and the fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

# Candidate classifiers as (display name, unfitted estimator) pairs.
# The same list is passed to result_model_cv and to VotingClassifier below.
# The two tree ensembles draw random numbers while fitting, so they are given
# a fixed random_state; without it the CV table and the voting scores change
# on every Restart & Run All.
list_model = [
    ('Logistic Regression', LogisticRegression(solver='newton-cg')),
    ('Naive Bayes', GaussianNB()),
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
]


def result_model_cv(list_model, X, y, cv=10):
    """Cross-validate every candidate model and display them ranked by test accuracy.

    Parameters
    ----------
    list_model : iterable of (str, estimator)
        Display name and unfitted estimator for each candidate.
    X, y
        Features and target, forwarded unchanged to ``cross_validate``.
    cv : int or cross-validation splitter, default 10
        Forwarded unchanged to ``cross_validate``.

    Returns
    -------
    None
        The name of the best model is printed and the summary table is shown
        with ``display``. The table holds the mean train/test accuracy, their
        absolute difference, and the standard deviation of each across folds.

    Raises
    ------
    ValueError
        If ``list_model`` contains no entries.
    """
    # Materialise once so a generator can be both checked and iterated.
    models = list(list_model)
    if not models:
        # Previously this surfaced as an IndexError from result.iloc[0, 0].
        raise ValueError('list_model is empty: nothing to cross-validate')

    from sklearn.model_selection import cross_validate

    rows = []
    for name, model in models:
        cv_result = cross_validate(model, X, y, cv=cv, return_train_score=True, scoring=['accuracy'])
        train_acc = cv_result['train_accuracy']
        test_acc = cv_result['test_accuracy']
        train_mean = train_acc.mean()
        test_mean = test_acc.mean()
        rows.append([
            name,
            train_mean,
            test_mean,
            np.abs(train_mean - test_mean),  # train/test gap
            train_acc.std(),
            test_acc.std(),
        ])

    result = pd.DataFrame(
        rows,
        columns=['model', 'train_accuracy', 'test_accuracy', 'diff', 'train_acc_std', 'test_acc_std'],
    ).sort_values('test_accuracy', ascending=False)

    # After sorting, the first row holds the highest mean test accuracy.
    model_best = result.iloc[0, 0]
    print('Best method using cv is: ', model_best)
    print('Note: Result just from simple model, NOT from fine-tuning best model')
    display(result)
# Compare the candidates on the training split only, with cv=15.
result_model_cv(list_model=list_model, X=X_train, y=y_train, cv=15)

Best method using cv is:  Logistic Regression
Note: Result just from simple model, NOT from fine-tuning best model


Unnamed: 0,model,train_accuracy,test_accuracy,diff,train_acc_std,test_acc_std
0,Logistic Regression,0.967479,0.965476,0.002002,0.006805,0.073328
2,KNN,0.950904,0.955952,0.005049,0.009741,0.062497
1,Naive Bayes,0.943877,0.946429,0.002552,0.005919,0.080125
3,Random Forest,1.0,0.938095,0.061905,0.0,0.080619
4,Gradient Boosting,1.0,0.938095,0.061905,0.0,0.080619


In [24]:
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble over every candidate in list_model; fit() returns the
# fitted estimator, so construction and training are chained.
voting_clf = VotingClassifier(estimators=list_model, voting='soft').fit(X_train, y_train)

# Score on the training split and on the held-out split, then report both.
train_accuracy = voting_clf.score(X_train, y_train)
test_accuracy = voting_clf.score(X_test, y_test)
print('Accuracy Train: {:.4f}'.format(train_accuracy))
print('Accuracy Test: {:.4f}'.format(test_accuracy))

Accuracy Train: 1.0000
Accuracy Test: 1.0000


In [48]:
from sklearn.model_selection import GridSearchCV
import time
# Exhaustive search over 6 * 3 * 3 = 54 random-forest settings, each scored
# with 5-fold cross-validation on the training split.
params = {
    'n_estimators' : [10, 20, 30, 50, 100, 200],
    'max_features' : ['sqrt', 'log2', None],
    'max_depth': [4, 5, 6],
}

# The forest is seeded so that best_params_ and the scores below are the same
# on every run; an unseeded forest can pick a different winner each time.
grid = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=params, cv=5)

# Time only the search itself, with a monotonic clock meant for intervals.
tic = time.perf_counter()
grid.fit(X_train, y_train)
toc = time.perf_counter()

print('Best parameters: ', grid.best_params_)
print('Total time running: {:.2f} seconds'.format(toc-tic))

# best_estimator_ is the winning configuration taken from the fitted search.
best_model = grid.best_estimator_
print('Accuracy train: ', best_model.score(X_train, y_train))
print('Accuracy test: ', best_model.score(X_test, y_test))

Best parameters:  {'max_depth': 4, 'max_features': 'log2', 'n_estimators': 200}
Total time running: 26.85 seconds
Accuracy train:  0.9821428571428571
Accuracy test:  1.0


In [49]:
from sklearn.model_selection import RandomizedSearchCV
import time
# Same search space as the exhaustive grid search, but only n_iter randomly
# drawn combinations are evaluated (10 is the library default, written out).
params = {
    'n_estimators' : [10, 20, 30, 50, 100, 200],
    'max_features' : ['sqrt', 'log2', None],
    'max_depth': [4, 5, 6],
}

# Both sources of randomness are seeded: the forest itself and the sampler
# that draws parameter combinations. Otherwise best_params_ differs per run.
# NOTE: the names `grid` and `best_model` are kept for compatibility with the
# rest of the notebook, although this object is a RandomizedSearchCV.
grid = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=params,
    n_iter=10,
    cv=5,
    random_state=42,
)

# Time only the search itself, with a monotonic clock meant for intervals.
tic = time.perf_counter()
grid.fit(X_train, y_train)
toc = time.perf_counter()

print('Best parameters: ', grid.best_params_)
print('Total time running: {:.2f} seconds'.format(toc-tic))

# best_estimator_ is the winning configuration taken from the fitted search.
best_model = grid.best_estimator_
print('Accuracy train: ', best_model.score(X_train, y_train))
print('Accuracy test: ', best_model.score(X_test, y_test))

Best parameters:  {'n_estimators': 30, 'max_features': 'sqrt', 'max_depth': 4}
Total time running: 2.87 seconds
Accuracy train:  0.9821428571428571
Accuracy test:  1.0
