In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Notebook-wide configuration: tweak these values here rather than in later cells.
ROOT_PATH  = "../../data/"   # directory prefix (trailing slash required: paths are built by concatenation)
SEED       = 42              # random_state passed to the random-forest estimators
TARGET_COL = "Outcome"       # name of the label column in both CSV files

In [None]:
# Read the train and test CSV files into DataFrames.
train_path = f"{ROOT_PATH}diabetes_train.csv"
train_set = pd.read_csv(train_path)

test_path = f"{ROOT_PATH}diabetes_test.csv"
test_set = pd.read_csv(test_path)

In [None]:
# Separate the label column from the feature columns for each split.
y_train = train_set[TARGET_COL]
X_train = train_set.drop(columns=[TARGET_COL])

y_test = test_set[TARGET_COL]
X_test = test_set.drop(columns=[TARGET_COL])

In [None]:
# Preview the first rows of the training features as loaded (before any scaling).
X_train.head()

In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# then reused, unchanged, on the test split so that no test statistics leak
# into preprocessing.
scaler = StandardScaler()
features_names = X_train.columns

# Rebuild labelled DataFrames from the scaled arrays, keeping the original
# row index and column names.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,
    columns=features_names,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=features_names,
)

In [None]:
# Preview the training features again, now that they have been standardized.
X_train.head()

In [None]:
# Baseline model: a random forest with library-default hyperparameters and a
# fixed seed, fitted on the scaled training data.
rf = RandomForestClassifier(random_state=SEED)
rf.fit(X_train, y_train);  # trailing ';' keeps the cell from echoing the estimator

In [None]:
# Predict test-set labels with the baseline random forest.
y_preds = rf.predict(X_test)

In [None]:
# Test-set accuracy of the baseline model; keyword arguments make the
# (truth, prediction) order explicit.
accuracy_score(y_true=y_test, y_pred=y_preds)

In [None]:
# Label the baseline forest's feature importances with the column names and
# list them from most to least important.
feature_imp = (
    pd.Series(rf.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)
feature_imp

In [None]:
# Hyperparameter search over the number of trees, scored with 5-fold
# cross-validation on the training data only.
#
# The unfitted template estimator gets its own name instead of reusing `rf`:
# GridSearchCV clones the estimator it is given, so the template itself is
# never fitted. Rebinding `rf` to it would replace the fitted baseline model
# and make re-running the baseline predict / feature-importance cells fail.
rf_template = RandomForestClassifier(random_state = SEED)

# Candidate values for the number of trees in the forest.
parameters = {'n_estimators': [10, 100, 300]}

clf = GridSearchCV(rf_template, parameters, cv = 5).fit(X_train, y_train)

In [None]:
# Take the best model found by the grid search. GridSearchCV is used with its
# default refit behaviour, so this estimator is already fitted and ready to predict.
tunned_rf = clf.best_estimator_

In [None]:
# Display the selected model's hyperparameters, including the chosen n_estimators.
tunned_rf.get_params()

In [None]:
# Predict test-set labels with the tuned model (overwrites the baseline y_preds).
y_preds = tunned_rf.predict(X_test)

In [None]:
# Test-set accuracy of the tuned model, for comparison with the baseline score.
accuracy_score(y_true=y_test, y_pred=y_preds)