In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, get_scorer_names, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from statistics import mode as md
from matplotlib import pyplot as plt
from IPython.display import clear_output, display_html 
from collections import Counter

In [3]:
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

In [4]:
testing = pd.read_csv("tanzanian_water_wells/X_test.csv")
X = pd.read_csv("tanzanian_water_wells/X_train.csv")
y = pd.read_csv("tanzanian_water_wells/y_train.csv")['status_group'].map({'functional': 2, 'functional needs repair': 0, 'non functional': 1})

In [5]:
strategy = {1: 20000} #optional strategy
# smote=SMOTE(sampling_strategy=strategy)
smote = SMOTE()

In [6]:
#Dropping unecessary columns
X = X.drop(['id', 'wpt_name', 'subvillage', 'installer', 'funder', 'scheme_name', 'ward', 'date_recorded', 'recorded_by'], axis=1)
testing = testing.drop(['id', 'wpt_name', 'subvillage', 'installer', 'funder', 'scheme_name', 'ward', 'date_recorded', 'recorded_by'], axis=1)

# Eliminating null values from X_train
X.scheme_management.fillna("None", inplace=True)
X.permit.fillna('Unknown', inplace=True)
X.public_meeting.fillna('Unknown', inplace=True)

# Turning certain dtypes into others
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['gps_height'] = X['gps_height'].astype('float64')
X['population'] = X['population'].astype('float64')
X['construction_year'] = X['construction_year'].astype('int64')
X['region_code'] = X['region_code'].astype('str')
X['district_code'] = X['district_code'].astype('str')

#Defining X_cat
X_cat = X.select_dtypes(exclude=['float64', 'int64'])
X_cat = X_cat.astype('str')
X_cat = pd.get_dummies(X_cat)

#Defining X_numeric
X_numeric = X.select_dtypes(['float64', 'int64'])
scaler = StandardScaler()
scaler.fit(X_numeric)
X_numeric = pd.DataFrame(scaler.transform(X_numeric), columns = X_numeric.columns, index = X_numeric.index)

#Defining X
X = pd.concat([X_numeric, X_cat], axis=1)

#Defining df
df = pd.concat([X, y], axis=1)

#Creating a train-test-split for X and y
X_train, X_test, y_train, y_test = train_test_split(X, y)

#Resampling dataframes for model creation
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
X_train_resampled = X_train_resampled.reset_index(drop=True)
y_train_resampled = y_train_resampled.reset_index(drop=True)
    
df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)

#Separating concatenated dataframe into each status group
f = df_resampled[df_resampled.status_group == 0].reset_index(drop=True).copy()
nf = df_resampled[df_resampled.status_group == 2].reset_index(drop=True).copy()
fnr = df_resampled[df_resampled.status_group == 1].reset_index(drop=True).copy()

#Shuffling all the records
f = f.sample(frac=1)
nf = nf.sample(frac=1)
fnr = fnr.sample(frac=1)

#Splitting each status group into arrays of approximately 1,000 records
fs = [pd.DataFrame(i) for i in np.array_split(f, len(f)//1000)]
nfs = [pd.DataFrame(i) for i in np.array_split(nf, len(nf)//1000)]
fnrs = [pd.DataFrame(i) for i in np.array_split(fnr, len(fnr)//1000)]

#Creating our individual samples for models to train on
resamples = []
for i in range(len(fnrs)):
    resample = pd.concat([fs[i], nfs[i], fnrs[i]])
    resamples.append(resample)

In [7]:
estimators = {'Logistic Regression': LogisticRegression(solver='liblinear', fit_intercept=False), 
              'Decision Tree Classifier': DecisionTreeClassifier(), 
              'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=3), 
              'Bagging Classifier': BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50), 
              'Random Forest': RandomForestClassifier(), 
              'XG-Boost': XGBClassifier(), 
              'Adaboost': AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42), 
              'Gradient Boosted Trees': GradientBoostingClassifier(random_state=42, n_estimators=200, max_features=50), 
              'Extra Randomized Trees': ExtraTreesClassifier(n_estimators=100, random_state=42), 
              'Stacking Classifier': StackingClassifier(
        estimators = [('logistic_regression', LogisticRegression(solver='liblinear', fit_intercept=False)), 
                      ('bagging_classifier', BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50))], 
        final_estimator = XGBClassifier())}

In [None]:
dtc = DecisionTreeClassifier()

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 2, 5, 10],
    'min_samples_split': [5, 10, 20, 40],
    'min_samples_leaf': [5, 10, 20],
    'splitter': ['best', 'random']
}

gs_tree = GridSearchCV(dtc, param_grid, cv=3, verbose=10)
gs_tree.best_params_

In [40]:
estimator = DecisionTreeClassifier()

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 2, 5, 10],
    'min_samples_split': [5, 10, 20, 40],
    'min_samples_leaf': [5, 10, 20],
    'splitter': ['best', 'random']
}

estimator = GridSearchCV(estimator, param_grid, cv=5, verbose=10)
estimator.fit(X_train, y_train)
estimator.best_params_

Fitting 5 folds for each of 192 candidates, totalling 960 fits
[CV 1/5; 1/192] START criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=5, splitter=best
[CV 1/5; 1/192] END criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=5, splitter=best;, score=0.639 total time=   0.1s
[CV 2/5; 1/192] START criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=5, splitter=best
[CV 2/5; 1/192] END criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=5, splitter=best;, score=0.644 total time=   0.1s
[CV 3/5; 1/192] START criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=5, splitter=best
[CV 3/5; 1/192] END criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=5, splitter=best;, score=0.643 total time=   0.1s
[CV 4/5; 1/192] START criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=5, splitter=best
[CV 4/5; 1/192] END criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=5, splitter=bes

[CV 5/5; 7/192] END criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=40, splitter=best;, score=0.644 total time=   0.1s
[CV 1/5; 8/192] START criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=40, splitter=random
[CV 1/5; 8/192] END criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=40, splitter=random;, score=0.639 total time=   0.1s
[CV 2/5; 8/192] START criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=40, splitter=random
[CV 2/5; 8/192] END criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=40, splitter=random;, score=0.644 total time=   0.1s
[CV 3/5; 8/192] START criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=40, splitter=random
[CV 3/5; 8/192] END criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=40, splitter=random;, score=0.643 total time=   0.1s
[CV 4/5; 8/192] START criterion=gini, max_depth=1, min_samples_leaf=5, min_samples_split=40, splitter=random
[CV 4/

[CV 4/5; 14/192] END criterion=gini, max_depth=1, min_samples_leaf=10, min_samples_split=20, splitter=random;, score=0.642 total time=   0.1s
[CV 5/5; 14/192] START criterion=gini, max_depth=1, min_samples_leaf=10, min_samples_split=20, splitter=random
[CV 5/5; 14/192] END criterion=gini, max_depth=1, min_samples_leaf=10, min_samples_split=20, splitter=random;, score=0.644 total time=   0.1s
[CV 1/5; 15/192] START criterion=gini, max_depth=1, min_samples_leaf=10, min_samples_split=40, splitter=best
[CV 1/5; 15/192] END criterion=gini, max_depth=1, min_samples_leaf=10, min_samples_split=40, splitter=best;, score=0.639 total time=   0.1s
[CV 2/5; 15/192] START criterion=gini, max_depth=1, min_samples_leaf=10, min_samples_split=40, splitter=best
[CV 2/5; 15/192] END criterion=gini, max_depth=1, min_samples_leaf=10, min_samples_split=40, splitter=best;, score=0.644 total time=   0.1s
[CV 3/5; 15/192] START criterion=gini, max_depth=1, min_samples_leaf=10, min_samples_split=40, splitter=bes

[CV 2/5; 21/192] END criterion=gini, max_depth=1, min_samples_leaf=20, min_samples_split=20, splitter=best;, score=0.644 total time=   0.1s
[CV 3/5; 21/192] START criterion=gini, max_depth=1, min_samples_leaf=20, min_samples_split=20, splitter=best
[CV 3/5; 21/192] END criterion=gini, max_depth=1, min_samples_leaf=20, min_samples_split=20, splitter=best;, score=0.643 total time=   0.1s
[CV 4/5; 21/192] START criterion=gini, max_depth=1, min_samples_leaf=20, min_samples_split=20, splitter=best
[CV 4/5; 21/192] END criterion=gini, max_depth=1, min_samples_leaf=20, min_samples_split=20, splitter=best;, score=0.642 total time=   0.1s
[CV 5/5; 21/192] START criterion=gini, max_depth=1, min_samples_leaf=20, min_samples_split=20, splitter=best
[CV 5/5; 21/192] END criterion=gini, max_depth=1, min_samples_leaf=20, min_samples_split=20, splitter=best;, score=0.644 total time=   0.1s
[CV 1/5; 22/192] START criterion=gini, max_depth=1, min_samples_leaf=20, min_samples_split=20, splitter=random
[C

[CV 5/5; 27/192] END criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=10, splitter=best;, score=0.699 total time=   0.1s
[CV 1/5; 28/192] START criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=10, splitter=random
[CV 1/5; 28/192] END criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=10, splitter=random;, score=0.691 total time=   0.1s
[CV 2/5; 28/192] START criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=10, splitter=random
[CV 2/5; 28/192] END criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=10, splitter=random;, score=0.695 total time=   0.1s
[CV 3/5; 28/192] START criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=10, splitter=random
[CV 3/5; 28/192] END criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=10, splitter=random;, score=0.689 total time=   0.1s
[CV 4/5; 28/192] START criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=10, splitter=rando

[CV 3/5; 34/192] END criterion=gini, max_depth=2, min_samples_leaf=10, min_samples_split=5, splitter=random;, score=0.689 total time=   0.1s
[CV 4/5; 34/192] START criterion=gini, max_depth=2, min_samples_leaf=10, min_samples_split=5, splitter=random
[CV 4/5; 34/192] END criterion=gini, max_depth=2, min_samples_leaf=10, min_samples_split=5, splitter=random;, score=0.694 total time=   0.1s
[CV 5/5; 34/192] START criterion=gini, max_depth=2, min_samples_leaf=10, min_samples_split=5, splitter=random
[CV 5/5; 34/192] END criterion=gini, max_depth=2, min_samples_leaf=10, min_samples_split=5, splitter=random;, score=0.699 total time=   0.1s
[CV 1/5; 35/192] START criterion=gini, max_depth=2, min_samples_leaf=10, min_samples_split=10, splitter=best
[CV 1/5; 35/192] END criterion=gini, max_depth=2, min_samples_leaf=10, min_samples_split=10, splitter=best;, score=0.691 total time=   0.1s
[CV 2/5; 35/192] START criterion=gini, max_depth=2, min_samples_leaf=10, min_samples_split=10, splitter=best

[CV 1/5; 41/192] END criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=5, splitter=best;, score=0.691 total time=   0.1s
[CV 2/5; 41/192] START criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=5, splitter=best
[CV 2/5; 41/192] END criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=5, splitter=best;, score=0.695 total time=   0.1s
[CV 3/5; 41/192] START criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=5, splitter=best
[CV 3/5; 41/192] END criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=5, splitter=best;, score=0.689 total time=   0.1s
[CV 4/5; 41/192] START criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=5, splitter=best
[CV 4/5; 41/192] END criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=5, splitter=best;, score=0.693 total time=   0.1s
[CV 5/5; 41/192] START criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=5, splitter=best
[CV 5/5; 41/

[CV 5/5; 47/192] END criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=40, splitter=best;, score=0.699 total time=   0.1s
[CV 1/5; 48/192] START criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=40, splitter=random
[CV 1/5; 48/192] END criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=40, splitter=random;, score=0.691 total time=   0.1s
[CV 2/5; 48/192] START criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=40, splitter=random
[CV 2/5; 48/192] END criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=40, splitter=random;, score=0.695 total time=   0.1s
[CV 3/5; 48/192] START criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=40, splitter=random
[CV 3/5; 48/192] END criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=40, splitter=random;, score=0.689 total time=   0.1s
[CV 4/5; 48/192] START criterion=gini, max_depth=2, min_samples_leaf=20, min_samples_split=40, splitt

[CV 3/5; 54/192] END criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=20, splitter=random;, score=0.701 total time=   0.2s
[CV 4/5; 54/192] START criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=20, splitter=random
[CV 4/5; 54/192] END criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=20, splitter=random;, score=0.707 total time=   0.2s
[CV 5/5; 54/192] START criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=20, splitter=random
[CV 5/5; 54/192] END criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=20, splitter=random;, score=0.719 total time=   0.2s
[CV 1/5; 55/192] START criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=40, splitter=best
[CV 1/5; 55/192] END criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=40, splitter=best;, score=0.709 total time=   0.2s
[CV 2/5; 55/192] START criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=40, splitter=best
[C

[CV 1/5; 61/192] END criterion=gini, max_depth=5, min_samples_leaf=10, min_samples_split=20, splitter=best;, score=0.709 total time=   0.2s
[CV 2/5; 61/192] START criterion=gini, max_depth=5, min_samples_leaf=10, min_samples_split=20, splitter=best
[CV 2/5; 61/192] END criterion=gini, max_depth=5, min_samples_leaf=10, min_samples_split=20, splitter=best;, score=0.709 total time=   0.2s
[CV 3/5; 61/192] START criterion=gini, max_depth=5, min_samples_leaf=10, min_samples_split=20, splitter=best
[CV 3/5; 61/192] END criterion=gini, max_depth=5, min_samples_leaf=10, min_samples_split=20, splitter=best;, score=0.709 total time=   0.2s
[CV 4/5; 61/192] START criterion=gini, max_depth=5, min_samples_leaf=10, min_samples_split=20, splitter=best
[CV 4/5; 61/192] END criterion=gini, max_depth=5, min_samples_leaf=10, min_samples_split=20, splitter=best;, score=0.706 total time=   0.2s
[CV 5/5; 61/192] START criterion=gini, max_depth=5, min_samples_leaf=10, min_samples_split=20, splitter=best
[CV 

[CV 4/5; 67/192] END criterion=gini, max_depth=5, min_samples_leaf=20, min_samples_split=10, splitter=best;, score=0.707 total time=   0.2s
[CV 5/5; 67/192] START criterion=gini, max_depth=5, min_samples_leaf=20, min_samples_split=10, splitter=best
[CV 5/5; 67/192] END criterion=gini, max_depth=5, min_samples_leaf=20, min_samples_split=10, splitter=best;, score=0.721 total time=   0.2s
[CV 1/5; 68/192] START criterion=gini, max_depth=5, min_samples_leaf=20, min_samples_split=10, splitter=random
[CV 1/5; 68/192] END criterion=gini, max_depth=5, min_samples_leaf=20, min_samples_split=10, splitter=random;, score=0.704 total time=   0.2s
[CV 2/5; 68/192] START criterion=gini, max_depth=5, min_samples_leaf=20, min_samples_split=10, splitter=random
[CV 2/5; 68/192] END criterion=gini, max_depth=5, min_samples_leaf=20, min_samples_split=10, splitter=random;, score=0.710 total time=   0.2s
[CV 3/5; 68/192] START criterion=gini, max_depth=5, min_samples_leaf=20, min_samples_split=10, splitter=r

[CV 2/5; 74/192] END criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=5, splitter=random;, score=0.743 total time=   0.3s
[CV 3/5; 74/192] START criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=5, splitter=random
[CV 3/5; 74/192] END criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=5, splitter=random;, score=0.738 total time=   0.3s
[CV 4/5; 74/192] START criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=5, splitter=random
[CV 4/5; 74/192] END criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=5, splitter=random;, score=0.731 total time=   0.3s
[CV 5/5; 74/192] START criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=5, splitter=random
[CV 5/5; 74/192] END criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=5, splitter=random;, score=0.740 total time=   0.3s
[CV 1/5; 75/192] START criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=10, splitter=be

[CV 5/5; 80/192] END criterion=gini, max_depth=10, min_samples_leaf=5, min_samples_split=40, splitter=random;, score=0.740 total time=   0.3s
[CV 1/5; 81/192] START criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=5, splitter=best
[CV 1/5; 81/192] END criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=5, splitter=best;, score=0.735 total time=   0.4s
[CV 2/5; 81/192] START criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=5, splitter=best
[CV 2/5; 81/192] END criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=5, splitter=best;, score=0.742 total time=   0.4s
[CV 3/5; 81/192] START criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=5, splitter=best
[CV 3/5; 81/192] END criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=5, splitter=best;, score=0.737 total time=   0.4s
[CV 4/5; 81/192] START criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=5, splitter=best
[C

[CV 3/5; 87/192] END criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=40, splitter=best;, score=0.736 total time=   0.4s
[CV 4/5; 87/192] START criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=40, splitter=best
[CV 4/5; 87/192] END criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=40, splitter=best;, score=0.739 total time=   0.4s
[CV 5/5; 87/192] START criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=40, splitter=best
[CV 5/5; 87/192] END criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=40, splitter=best;, score=0.742 total time=   0.4s
[CV 1/5; 88/192] START criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=40, splitter=random
[CV 1/5; 88/192] END criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=40, splitter=random;, score=0.736 total time=   0.4s
[CV 2/5; 88/192] START criterion=gini, max_depth=10, min_samples_leaf=10, min_samples_split=40, splitt

[CV 1/5; 94/192] END criterion=gini, max_depth=10, min_samples_leaf=20, min_samples_split=20, splitter=random;, score=0.737 total time=   0.3s
[CV 2/5; 94/192] START criterion=gini, max_depth=10, min_samples_leaf=20, min_samples_split=20, splitter=random
[CV 2/5; 94/192] END criterion=gini, max_depth=10, min_samples_leaf=20, min_samples_split=20, splitter=random;, score=0.735 total time=   0.3s
[CV 3/5; 94/192] START criterion=gini, max_depth=10, min_samples_leaf=20, min_samples_split=20, splitter=random
[CV 3/5; 94/192] END criterion=gini, max_depth=10, min_samples_leaf=20, min_samples_split=20, splitter=random;, score=0.731 total time=   0.3s
[CV 4/5; 94/192] START criterion=gini, max_depth=10, min_samples_leaf=20, min_samples_split=20, splitter=random
[CV 4/5; 94/192] END criterion=gini, max_depth=10, min_samples_leaf=20, min_samples_split=20, splitter=random;, score=0.723 total time=   0.3s
[CV 5/5; 94/192] START criterion=gini, max_depth=10, min_samples_leaf=20, min_samples_split=

[CV 2/5; 101/192] END criterion=entropy, max_depth=1, min_samples_leaf=5, min_samples_split=20, splitter=best;, score=0.644 total time=   0.1s
[CV 3/5; 101/192] START criterion=entropy, max_depth=1, min_samples_leaf=5, min_samples_split=20, splitter=best
[CV 3/5; 101/192] END criterion=entropy, max_depth=1, min_samples_leaf=5, min_samples_split=20, splitter=best;, score=0.643 total time=   0.1s
[CV 4/5; 101/192] START criterion=entropy, max_depth=1, min_samples_leaf=5, min_samples_split=20, splitter=best
[CV 4/5; 101/192] END criterion=entropy, max_depth=1, min_samples_leaf=5, min_samples_split=20, splitter=best;, score=0.642 total time=   0.1s
[CV 5/5; 101/192] START criterion=entropy, max_depth=1, min_samples_leaf=5, min_samples_split=20, splitter=best
[CV 5/5; 101/192] END criterion=entropy, max_depth=1, min_samples_leaf=5, min_samples_split=20, splitter=best;, score=0.644 total time=   0.1s
[CV 1/5; 102/192] START criterion=entropy, max_depth=1, min_samples_leaf=5, min_samples_spli

[CV 1/5; 108/192] END criterion=entropy, max_depth=1, min_samples_leaf=10, min_samples_split=10, splitter=random;, score=0.639 total time=   0.1s
[CV 2/5; 108/192] START criterion=entropy, max_depth=1, min_samples_leaf=10, min_samples_split=10, splitter=random
[CV 2/5; 108/192] END criterion=entropy, max_depth=1, min_samples_leaf=10, min_samples_split=10, splitter=random;, score=0.644 total time=   0.1s
[CV 3/5; 108/192] START criterion=entropy, max_depth=1, min_samples_leaf=10, min_samples_split=10, splitter=random
[CV 3/5; 108/192] END criterion=entropy, max_depth=1, min_samples_leaf=10, min_samples_split=10, splitter=random;, score=0.643 total time=   0.1s
[CV 4/5; 108/192] START criterion=entropy, max_depth=1, min_samples_leaf=10, min_samples_split=10, splitter=random
[CV 4/5; 108/192] END criterion=entropy, max_depth=1, min_samples_leaf=10, min_samples_split=10, splitter=random;, score=0.642 total time=   0.1s
[CV 5/5; 108/192] START criterion=entropy, max_depth=1, min_samples_lea

[CV 3/5; 114/192] END criterion=entropy, max_depth=1, min_samples_leaf=20, min_samples_split=5, splitter=random;, score=0.643 total time=   0.1s
[CV 4/5; 114/192] START criterion=entropy, max_depth=1, min_samples_leaf=20, min_samples_split=5, splitter=random
[CV 4/5; 114/192] END criterion=entropy, max_depth=1, min_samples_leaf=20, min_samples_split=5, splitter=random;, score=0.642 total time=   0.1s
[CV 5/5; 114/192] START criterion=entropy, max_depth=1, min_samples_leaf=20, min_samples_split=5, splitter=random
[CV 5/5; 114/192] END criterion=entropy, max_depth=1, min_samples_leaf=20, min_samples_split=5, splitter=random;, score=0.644 total time=   0.1s
[CV 1/5; 115/192] START criterion=entropy, max_depth=1, min_samples_leaf=20, min_samples_split=10, splitter=best
[CV 1/5; 115/192] END criterion=entropy, max_depth=1, min_samples_leaf=20, min_samples_split=10, splitter=best;, score=0.639 total time=   0.1s
[CV 2/5; 115/192] START criterion=entropy, max_depth=1, min_samples_leaf=20, min

[CV 2/5; 121/192] END criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, splitter=best;, score=0.695 total time=   0.1s
[CV 3/5; 121/192] START criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, splitter=best
[CV 3/5; 121/192] END criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, splitter=best;, score=0.689 total time=   0.1s
[CV 4/5; 121/192] START criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, splitter=best
[CV 4/5; 121/192] END criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, splitter=best;, score=0.694 total time=   0.1s
[CV 5/5; 121/192] START criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, splitter=best
[CV 5/5; 121/192] END criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, splitter=best;, score=0.699 total time=   0.1s
[CV 1/5; 122/192] START criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, sp

[CV 5/5; 127/192] END criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=40, splitter=best;, score=0.699 total time=   0.1s
[CV 1/5; 128/192] START criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=40, splitter=random
[CV 1/5; 128/192] END criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=40, splitter=random;, score=0.691 total time=   0.1s
[CV 2/5; 128/192] START criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=40, splitter=random
[CV 2/5; 128/192] END criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=40, splitter=random;, score=0.695 total time=   0.1s
[CV 3/5; 128/192] START criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=40, splitter=random
[CV 3/5; 128/192] END criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=40, splitter=random;, score=0.689 total time=   0.1s
[CV 4/5; 128/192] START criterion=entropy, max_depth=2, min_samples_leaf=5, min_

[CV 3/5; 134/192] END criterion=entropy, max_depth=2, min_samples_leaf=10, min_samples_split=20, splitter=random;, score=0.689 total time=   0.1s
[CV 4/5; 134/192] START criterion=entropy, max_depth=2, min_samples_leaf=10, min_samples_split=20, splitter=random
[CV 4/5; 134/192] END criterion=entropy, max_depth=2, min_samples_leaf=10, min_samples_split=20, splitter=random;, score=0.694 total time=   0.1s
[CV 5/5; 134/192] START criterion=entropy, max_depth=2, min_samples_leaf=10, min_samples_split=20, splitter=random
[CV 5/5; 134/192] END criterion=entropy, max_depth=2, min_samples_leaf=10, min_samples_split=20, splitter=random;, score=0.699 total time=   0.1s
[CV 1/5; 135/192] START criterion=entropy, max_depth=2, min_samples_leaf=10, min_samples_split=40, splitter=best
[CV 1/5; 135/192] END criterion=entropy, max_depth=2, min_samples_leaf=10, min_samples_split=40, splitter=best;, score=0.691 total time=   0.1s
[CV 2/5; 135/192] START criterion=entropy, max_depth=2, min_samples_leaf=10

[CV 1/5; 141/192] END criterion=entropy, max_depth=2, min_samples_leaf=20, min_samples_split=20, splitter=best;, score=0.691 total time=   0.1s
[CV 2/5; 141/192] START criterion=entropy, max_depth=2, min_samples_leaf=20, min_samples_split=20, splitter=best
[CV 2/5; 141/192] END criterion=entropy, max_depth=2, min_samples_leaf=20, min_samples_split=20, splitter=best;, score=0.695 total time=   0.1s
[CV 3/5; 141/192] START criterion=entropy, max_depth=2, min_samples_leaf=20, min_samples_split=20, splitter=best
[CV 3/5; 141/192] END criterion=entropy, max_depth=2, min_samples_leaf=20, min_samples_split=20, splitter=best;, score=0.689 total time=   0.1s
[CV 4/5; 141/192] START criterion=entropy, max_depth=2, min_samples_leaf=20, min_samples_split=20, splitter=best
[CV 4/5; 141/192] END criterion=entropy, max_depth=2, min_samples_leaf=20, min_samples_split=20, splitter=best;, score=0.693 total time=   0.1s
[CV 5/5; 141/192] START criterion=entropy, max_depth=2, min_samples_leaf=20, min_samp

[CV 3/5; 147/192] END criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, splitter=best;, score=0.707 total time=   0.2s
[CV 4/5; 147/192] START criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, splitter=best
[CV 4/5; 147/192] END criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, splitter=best;, score=0.705 total time=   0.2s
[CV 5/5; 147/192] START criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, splitter=best
[CV 5/5; 147/192] END criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, splitter=best;, score=0.717 total time=   0.2s
[CV 1/5; 148/192] START criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, splitter=random
[CV 1/5; 148/192] END criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, splitter=random;, score=0.698 total time=   0.2s
[CV 2/5; 148/192] START criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_

[CV 5/5; 153/192] END criterion=entropy, max_depth=5, min_samples_leaf=10, min_samples_split=5, splitter=best;, score=0.717 total time=   0.2s
[CV 1/5; 154/192] START criterion=entropy, max_depth=5, min_samples_leaf=10, min_samples_split=5, splitter=random
[CV 1/5; 154/192] END criterion=entropy, max_depth=5, min_samples_leaf=10, min_samples_split=5, splitter=random;, score=0.698 total time=   0.2s
[CV 2/5; 154/192] START criterion=entropy, max_depth=5, min_samples_leaf=10, min_samples_split=5, splitter=random
[CV 2/5; 154/192] END criterion=entropy, max_depth=5, min_samples_leaf=10, min_samples_split=5, splitter=random;, score=0.704 total time=   0.2s
[CV 3/5; 154/192] START criterion=entropy, max_depth=5, min_samples_leaf=10, min_samples_split=5, splitter=random
[CV 3/5; 154/192] END criterion=entropy, max_depth=5, min_samples_leaf=10, min_samples_split=5, splitter=random;, score=0.698 total time=   0.2s
[CV 4/5; 154/192] START criterion=entropy, max_depth=5, min_samples_leaf=10, min

[CV 2/5; 160/192] END criterion=entropy, max_depth=5, min_samples_leaf=10, min_samples_split=40, splitter=random;, score=0.704 total time=   0.2s
[CV 3/5; 160/192] START criterion=entropy, max_depth=5, min_samples_leaf=10, min_samples_split=40, splitter=random
[CV 3/5; 160/192] END criterion=entropy, max_depth=5, min_samples_leaf=10, min_samples_split=40, splitter=random;, score=0.699 total time=   0.2s
[CV 4/5; 160/192] START criterion=entropy, max_depth=5, min_samples_leaf=10, min_samples_split=40, splitter=random
[CV 4/5; 160/192] END criterion=entropy, max_depth=5, min_samples_leaf=10, min_samples_split=40, splitter=random;, score=0.699 total time=   0.2s
[CV 5/5; 160/192] START criterion=entropy, max_depth=5, min_samples_leaf=10, min_samples_split=40, splitter=random
[CV 5/5; 160/192] END criterion=entropy, max_depth=5, min_samples_leaf=10, min_samples_split=40, splitter=random;, score=0.710 total time=   0.2s
[CV 1/5; 161/192] START criterion=entropy, max_depth=5, min_samples_lea

[CV 4/5; 166/192] END criterion=entropy, max_depth=5, min_samples_leaf=20, min_samples_split=20, splitter=random;, score=0.699 total time=   0.2s
[CV 5/5; 166/192] START criterion=entropy, max_depth=5, min_samples_leaf=20, min_samples_split=20, splitter=random
[CV 5/5; 166/192] END criterion=entropy, max_depth=5, min_samples_leaf=20, min_samples_split=20, splitter=random;, score=0.709 total time=   0.2s
[CV 1/5; 167/192] START criterion=entropy, max_depth=5, min_samples_leaf=20, min_samples_split=40, splitter=best
[CV 1/5; 167/192] END criterion=entropy, max_depth=5, min_samples_leaf=20, min_samples_split=40, splitter=best;, score=0.704 total time=   0.2s
[CV 2/5; 167/192] START criterion=entropy, max_depth=5, min_samples_leaf=20, min_samples_split=40, splitter=best
[CV 2/5; 167/192] END criterion=entropy, max_depth=5, min_samples_leaf=20, min_samples_split=40, splitter=best;, score=0.711 total time=   0.2s
[CV 3/5; 167/192] START criterion=entropy, max_depth=5, min_samples_leaf=20, mi

[CV 1/5; 173/192] END criterion=entropy, max_depth=10, min_samples_leaf=5, min_samples_split=20, splitter=best;, score=0.743 total time=   0.5s
[CV 2/5; 173/192] START criterion=entropy, max_depth=10, min_samples_leaf=5, min_samples_split=20, splitter=best
[CV 2/5; 173/192] END criterion=entropy, max_depth=10, min_samples_leaf=5, min_samples_split=20, splitter=best;, score=0.740 total time=   0.5s
[CV 3/5; 173/192] START criterion=entropy, max_depth=10, min_samples_leaf=5, min_samples_split=20, splitter=best
[CV 3/5; 173/192] END criterion=entropy, max_depth=10, min_samples_leaf=5, min_samples_split=20, splitter=best;, score=0.730 total time=   0.5s
[CV 4/5; 173/192] START criterion=entropy, max_depth=10, min_samples_leaf=5, min_samples_split=20, splitter=best
[CV 4/5; 173/192] END criterion=entropy, max_depth=10, min_samples_leaf=5, min_samples_split=20, splitter=best;, score=0.730 total time=   0.5s
[CV 5/5; 173/192] START criterion=entropy, max_depth=10, min_samples_leaf=5, min_samp

[CV 3/5; 179/192] END criterion=entropy, max_depth=10, min_samples_leaf=10, min_samples_split=10, splitter=best;, score=0.729 total time=   0.5s
[CV 4/5; 179/192] START criterion=entropy, max_depth=10, min_samples_leaf=10, min_samples_split=10, splitter=best
[CV 4/5; 179/192] END criterion=entropy, max_depth=10, min_samples_leaf=10, min_samples_split=10, splitter=best;, score=0.732 total time=   0.4s
[CV 5/5; 179/192] START criterion=entropy, max_depth=10, min_samples_leaf=10, min_samples_split=10, splitter=best
[CV 5/5; 179/192] END criterion=entropy, max_depth=10, min_samples_leaf=10, min_samples_split=10, splitter=best;, score=0.741 total time=   0.5s
[CV 1/5; 180/192] START criterion=entropy, max_depth=10, min_samples_leaf=10, min_samples_split=10, splitter=random
[CV 1/5; 180/192] END criterion=entropy, max_depth=10, min_samples_leaf=10, min_samples_split=10, splitter=random;, score=0.730 total time=   0.3s
[CV 2/5; 180/192] START criterion=entropy, max_depth=10, min_samples_leaf=

[CV 5/5; 185/192] END criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=5, splitter=best;, score=0.739 total time=   0.5s
[CV 1/5; 186/192] START criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=5, splitter=random
[CV 1/5; 186/192] END criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=5, splitter=random;, score=0.730 total time=   0.3s
[CV 2/5; 186/192] START criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=5, splitter=random
[CV 2/5; 186/192] END criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=5, splitter=random;, score=0.730 total time=   0.3s
[CV 3/5; 186/192] START criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=5, splitter=random
[CV 3/5; 186/192] END criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=5, splitter=random;, score=0.730 total time=   0.3s
[CV 4/5; 186/192] START criterion=entropy, max_depth=10, min_samples_leaf

[CV 2/5; 192/192] END criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=40, splitter=random;, score=0.730 total time=   0.3s
[CV 3/5; 192/192] START criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=40, splitter=random
[CV 3/5; 192/192] END criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=40, splitter=random;, score=0.731 total time=   0.3s
[CV 4/5; 192/192] START criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=40, splitter=random
[CV 4/5; 192/192] END criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=40, splitter=random;, score=0.727 total time=   0.3s
[CV 5/5; 192/192] START criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=40, splitter=random
[CV 5/5; 192/192] END criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=40, splitter=random;, score=0.736 total time=   0.3s


{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 5,
 'min_samples_split': 20,
 'splitter': 'best'}

In [41]:
estimator.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 5,
 'min_samples_split': 20,
 'splitter': 'best'}

In [52]:
estimator = DecisionTreeClassifier()

In [53]:
estimator.fit(X_train, y_train)

In [54]:
predictions = estimator.predict(X_test)

In [55]:
pd.DataFrame(confusion_matrix(y_test, predictions))

Unnamed: 0,0,1,2
0,404,181,517
1,204,4331,1186
2,454,1192,6381


In [56]:
pd.DataFrame(classification_report(y_test, predictions, output_dict=True))

Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.380414,0.759292,0.789337,0.748552,0.643014,0.747416
recall,0.366606,0.757035,0.794942,0.748552,0.639528,0.748552
f1-score,0.373383,0.758162,0.79213,0.748552,0.641225,0.747969
support,1102.0,5721.0,8027.0,0.748552,14850.0,14850.0


In [None]:
estimator = KNeighborsClassifier()

param_grid = {
    'metric': ['euclidean', 'minkowski', 'manhattan'],
    'n_neighbors': [3,4,5,6,7,8,9,10],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    
}

estimator = GridSearchCV(estimator, param_grid, cv=5, verbose=10)
estimator.fit(X_train, y_train)
estimator.best_params_

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV 1/5; 1/96] START algorithm=auto, metric=euclidean, n_neighbors=3............
[CV 1/5; 1/96] END algorithm=auto, metric=euclidean, n_neighbors=3;, score=0.768 total time=   1.9s
[CV 2/5; 1/96] START algorithm=auto, metric=euclidean, n_neighbors=3............
[CV 2/5; 1/96] END algorithm=auto, metric=euclidean, n_neighbors=3;, score=0.765 total time=   1.9s
[CV 3/5; 1/96] START algorithm=auto, metric=euclidean, n_neighbors=3............
[CV 3/5; 1/96] END algorithm=auto, metric=euclidean, n_neighbors=3;, score=0.763 total time=   1.9s
[CV 4/5; 1/96] START algorithm=auto, metric=euclidean, n_neighbors=3............
[CV 4/5; 1/96] END algorithm=auto, metric=euclidean, n_neighbors=3;, score=0.754 total time=   1.9s
[CV 5/5; 1/96] START algorithm=auto, metric=euclidean, n_neighbors=3............
[CV 5/5; 1/96] END algorithm=auto, metric=euclidean, n_neighbors=3;, score=0.762 total time=   1.9s
[CV 1/5; 2/96] START algorithm=au

[CV 1/5; 10/96] END algorithm=auto, metric=minkowski, n_neighbors=4;, score=0.758 total time=   2.5s
[CV 2/5; 10/96] START algorithm=auto, metric=minkowski, n_neighbors=4...........
[CV 2/5; 10/96] END algorithm=auto, metric=minkowski, n_neighbors=4;, score=0.757 total time=   2.5s
[CV 3/5; 10/96] START algorithm=auto, metric=minkowski, n_neighbors=4...........
[CV 3/5; 10/96] END algorithm=auto, metric=minkowski, n_neighbors=4;, score=0.759 total time=   3.9s
[CV 4/5; 10/96] START algorithm=auto, metric=minkowski, n_neighbors=4...........
[CV 4/5; 10/96] END algorithm=auto, metric=minkowski, n_neighbors=4;, score=0.744 total time=   4.5s
[CV 5/5; 10/96] START algorithm=auto, metric=minkowski, n_neighbors=4...........
[CV 5/5; 10/96] END algorithm=auto, metric=minkowski, n_neighbors=4;, score=0.760 total time=   2.6s
[CV 1/5; 11/96] START algorithm=auto, metric=minkowski, n_neighbors=5...........
[CV 1/5; 11/96] END algorithm=auto, metric=minkowski, n_neighbors=5;, score=0.767 total ti

[CV 1/5; 19/96] END algorithm=auto, metric=manhattan, n_neighbors=5;, score=0.770 total time=  18.4s
[CV 2/5; 19/96] START algorithm=auto, metric=manhattan, n_neighbors=5...........
[CV 2/5; 19/96] END algorithm=auto, metric=manhattan, n_neighbors=5;, score=0.768 total time=  19.0s
[CV 3/5; 19/96] START algorithm=auto, metric=manhattan, n_neighbors=5...........
[CV 3/5; 19/96] END algorithm=auto, metric=manhattan, n_neighbors=5;, score=0.773 total time=  20.7s
[CV 4/5; 19/96] START algorithm=auto, metric=manhattan, n_neighbors=5...........
[CV 4/5; 19/96] END algorithm=auto, metric=manhattan, n_neighbors=5;, score=0.760 total time=  19.7s
[CV 5/5; 19/96] START algorithm=auto, metric=manhattan, n_neighbors=5...........
[CV 5/5; 19/96] END algorithm=auto, metric=manhattan, n_neighbors=5;, score=0.771 total time=  19.1s
[CV 1/5; 20/96] START algorithm=auto, metric=manhattan, n_neighbors=6...........
[CV 1/5; 20/96] END algorithm=auto, metric=manhattan, n_neighbors=6;, score=0.770 total ti