In [19]:
import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [3]:
# List the files available in the local dataset directory.
os.listdir('./smoker_set')

['test.csv', 'train.csv', 'sample_submission.csv']

In [8]:
# Load the training split; the first CSV column is used as the row index.
df = pd.read_csv('./smoker_set/train.csv', index_col=0)

In [11]:
# Preview the first rows to sanity-check the columns and the `smoking` label.
df.head()

Unnamed: 0_level_0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,55,165,60,81.0,0.5,0.6,1,1,135,87,...,40,75,16.5,1,1.0,22,25,27,0,1
1,70,165,65,89.0,0.6,0.7,2,2,146,83,...,57,126,16.2,1,1.1,27,23,37,1,0
2,20,170,75,81.0,0.4,0.5,1,1,118,75,...,45,93,17.4,1,0.8,27,31,53,0,1
3,35,180,95,105.0,1.5,1.2,1,1,131,88,...,38,102,15.9,1,1.0,20,27,30,1,0
4,30,165,60,80.5,1.5,1.0,1,1,121,76,...,44,93,15.4,1,0.8,19,13,17,0,1


In [14]:
# Print per-column non-null counts and dtypes for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 159256 entries, 0 to 159255
Data columns (total 23 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   age                  159256 non-null  int64  
 1   height(cm)           159256 non-null  int64  
 2   weight(kg)           159256 non-null  int64  
 3   waist(cm)            159256 non-null  float64
 4   eyesight(left)       159256 non-null  float64
 5   eyesight(right)      159256 non-null  float64
 6   hearing(left)        159256 non-null  int64  
 7   hearing(right)       159256 non-null  int64  
 8   systolic             159256 non-null  int64  
 9   relaxation           159256 non-null  int64  
 10  fasting blood sugar  159256 non-null  int64  
 11  Cholesterol          159256 non-null  int64  
 12  triglyceride         159256 non-null  int64  
 13  HDL                  159256 non-null  int64  
 14  LDL                  159256 non-null  int64  
 15  hemoglobin           1

In [20]:
# Every column except the last is a feature; the last column is the label.
trainSet = df.iloc[:, :-1].to_numpy()
trainSetY = df.iloc[:, -1].to_numpy()

# Hold out 10% of the rows for validation, with a fixed seed for a repeatable split.
trainX, valX, trainY, valY = train_test_split(
    trainSet,
    trainSetY,
    test_size=0.10,
    random_state=42,
)

In [21]:
# Baseline forest with hand-picked hyper-parameters.
# random_state pins the per-split feature sampling so the reported accuracy is
# reproducible on re-run (17 matches the seed used by the search cell below).
# n_jobs=-1 builds the trees on all available cores; per the scikit-learn docs
# this affects only wall-clock time, not the fitted model, once the seed is fixed.
rf = RandomForestClassifier(n_estimators=100,
                            max_depth=30,
                            min_samples_split=5,
                            min_samples_leaf=3,
                            bootstrap=False,
                            random_state=17,
                            n_jobs=-1)

In [22]:
# Fit the baseline forest on the training portion of the split.
rf.fit(trainX, trainY)

In [23]:
# Score the baseline on the held-out rows: accuracy is the share of exact label matches.
prediction = rf.predict(valX)
is_correct = prediction == valY
print(f'Accuracy: {is_correct.sum() / len(valY):.3f}')

Accuracy: 0.773


In [30]:
# Candidate values for each random-forest hyper-parameter (stop values are exclusive).
# Grid sizes: 22 x 8 x 9 x 7 = 11,088 combinations in total.
estimators = np.arange(start=10, stop=120, step=5)        # 10, 15, ..., 115
max_depths = np.arange(start=3, stop=40, step=5)          # 3, 8, ..., 38
min_samples_split = np.arange(start=3, stop=20, step=2)   # 3, 5, ..., 19
min_samples_leaf = np.arange(start=3, stop=10, step=1)    # 3, 4, ..., 9

In [32]:
# Exhaustive hold-out search over the hyper-parameter grids defined in the
# previous cell. Every combination is fitted on (trainX, trainY) and scored by
# accuracy on (valX, valY); the first combination reaching the highest accuracy
# is kept in best_model / best_accuracy / best_states.
#
# NOTE: this is one forest fit per grid combination, so the cell is very slow
# on the full training set. Narrow the grids above for a quick run.
best_model = None  # no model selected yet
best_accuracy = 0.
best_states = {}

# itertools.product iterates in the same order as the equivalent nested loops
# (the right-most grid varies fastest).
for estimator, depth, sample_split, sample_leaf in itertools.product(
        estimators, max_depths, min_samples_split, min_samples_leaf):
    # Built once so the recorded configuration cannot drift from the arguments
    # actually passed to the model; plain ints keep it free of numpy scalars.
    params = {'n_estimators': int(estimator),
              'max_depth': int(depth),
              'min_samples_split': int(sample_split),
              'min_samples_leaf': int(sample_leaf),
              'random_state': 17,
              'bootstrap': False}
    # n_jobs=-1 grows the trees on all cores; with random_state fixed this
    # changes only the wall-clock time, not the fitted forest.
    _model = RandomForestClassifier(n_jobs=-1, **params)
    _model.fit(trainX, trainY)
    prediction = _model.predict(valX)
    accuracy = np.mean(prediction == valY)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = _model
        best_states = params

KeyboardInterrupt: 