In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import folium

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 
from sklearn.metrics import plot_confusion_matrix

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from imblearn.over_sampling import SMOTE



In [None]:
# Feature table and label table are stored separately; both carry an 'id' column.
df_values = pd.read_csv('data/values.csv')
df_labels = pd.read_csv('data/labels.csv')

# Join the two tables on the shared 'id' key so each row has its features and its status.
df = df_values.merge(df_labels, on='id')


# Business Problem

A nonprofit wants to improve access to water across Tanzania. To do so, the nonprofit wants to be able to send alerts to the entities in charge of an individual well, asking them to check the status of that well. These alerts will be sent when a well is deemed 'in need of repair' or 'non-functional'. With this goal in mind, the nonprofit needs an algorithm that can accurately predict when a well should be checked. 

# SCRUBBING

In [None]:
# 'id' was only needed as the join key for the merge.
df = df.drop(columns='id')

In [None]:
df.isna().sum()

#### Null Values

Dropping null values results in the loss of approx 10,000 entries

I will one-hot-encode 'null columns' for the features that contain these null values during preprocessing (Modeling Phase)

#### Dropping Columns

In [None]:
# Columns removed before modelling; the markdown list that follows gives the reason for each.
df.drop(
    columns=[
        'scheme_name',
        'subvillage',
        'waterpoint_type_group',
        'quantity_group',
        'water_quality',
        'payment',
    ],
    inplace=True,
)

1. **Dropped Columns:** 
    - scheme_name (over half of values are null)
    - subvillage (too many unique string values. Better geographical data exists)
    - waterpoint_type_group (redundant)
    - quantity_group (redundant)
    - payment (redundant)
    - water_quality (redundant)

#### Grouping Low-Frequency Values 

In [None]:
# Treat 'Ministry Of Water' as the same funder as 'Government Of Tanzania'
# (the reasoning is given in the markdown note further down).
funder = df['funder'].replace('Ministry Of Water', 'Government Of Tanzania')

# Pool every funder that accounts for less than 1.75% of the non-null rows into 'Other'.
# Series.value_counts replaces the deprecated top-level pd.value_counts; missing values
# are not counted and are left untouched by the pooling.
funder_share = funder.value_counts(normalize=True) * 100
rare_funders = funder_share[funder_share.lt(1.75)].index
funder = funder.where(~funder.isin(rare_funders), 'Other')

# '0' looks like a placeholder rather than a real funder name, so it is pooled as well.
df['funder'] = funder.replace('0', 'Other')

In [None]:
df['funder'].value_counts()

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='funder', hue='status_group', data=df)

In [None]:
# groupby_test = df.groupby('funder').agg('count').reset_index()
# groupby_test.head()
# sns.barplot(x='status_group', y='funder', data=groupby_test[groupby_test['status_group'] > 1000])


I have strong reason to believe that 'Government of Tanzania' and 'Ministry of Water' can be lumped into one: https://en.wikipedia.org/wiki/Ministry_of_Water_and_Irrigation

In [None]:
# Variant spellings of an installer, each mapped onto one canonical name.
installer_aliases = {
    'hesawa': 'HESAWA',                  # casing variant
    'Central government': 'Government',  # aggregated with 'Government'
    'Commu': 'Community',                # aggregated with 'Community'
    'DANID': 'DANIDA',                   # cleaned-up spelling
}
df = df.replace({'installer': installer_aliases})

In [None]:
# Pool every installer that accounts for less than 1.75% of the non-null rows into 'Other'.
# Series.value_counts replaces the deprecated top-level pd.value_counts, and the stray
# mid-cell value_counts() call (which displayed nothing) is gone; the next cell shows the counts.
installer_share = df['installer'].value_counts(normalize=True) * 100
rare_installers = installer_share[installer_share.lt(1.75)].index
installer = df['installer'].where(~df['installer'].isin(rare_installers), 'Other')

# '0' looks like a placeholder rather than a real installer name, so it is pooled as well.
df['installer'] = installer.replace('0', 'Other')

In [None]:
df['installer'].value_counts()

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='installer', hue='status_group', data=df)

The following values have been aggregated due to the belief that they were entered with spelling errors

In [None]:
# Fold the school-name variants into a single 'School' label.
wpt_name = df['wpt_name'].replace(['Shuleni', 'Shule', 'Shule Ya Misingi'], 'School')

# Pool every name that accounts for less than 1.55% of the non-null rows into 'Other'.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
wpt_share = wpt_name.value_counts(normalize=True) * 100
rare_names = wpt_share[wpt_share.lt(1.55)].index
wpt_name = wpt_name.where(~wpt_name.isin(rare_names), 'Other')

# The literal string 'none' carries no naming information, so it is pooled as well.
df['wpt_name'] = wpt_name.replace('none', 'Other')

In [None]:
df['wpt_name'].value_counts()

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='wpt_name', hue='status_group', data=df)

#### construction_year (missing values)

In [None]:
sorted(df['construction_year'].unique())

#### well_age

Before I bin the construction_year feature, I am going to make use of what information is given to calculate the age of each well 

In [None]:
# Age of each well relative to 2013. A construction_year of 0 (apparently the placeholder
# for an unknown year) yields an "age" of 2013, which is swapped for the sentinel value 100.
df['well_age'] = (2013 - df['construction_year']).replace(2013, 100)

#### binning construction_year

In [None]:
# Wells built before 1970. Rows with construction_year == 0 are excluded by the filter
# itself; the earlier version subtracted a hard-coded 20709 (presumably the number of
# zero-year rows at the time), which goes stale as soon as the frame is filtered differently.
df[(df['construction_year'] < 1970) & (df['construction_year'] > 0)]['construction_year'].value_counts().sum()

In [None]:
df[(df['construction_year'] < 1980) & (df['construction_year'] > 1969)]['construction_year'].value_counts().sum()

In [None]:
df[(df['construction_year'] < 1990) & (df['construction_year'] > 1979)]['construction_year'].value_counts().sum()

In [None]:
df[(df['construction_year'] < 2000) & (df['construction_year'] > 1989)]['construction_year'].value_counts().sum()

In [None]:
# Wells built 2000-2009 inclusive. The lower bound is '> 1999', matching the pattern of the
# other decade cells; the earlier '> 2000' silently left out wells built in the year 2000.
df[(df['construction_year'] < 2010) & (df['construction_year'] > 1999)]['construction_year'].value_counts().sum()

In [None]:
df[(df['construction_year'] >= 2010)]['construction_year'].value_counts().sum()

**OBSERVATION**
1. Nearly half of the entries are missing a construction year
2. Construction year goes back consistently to 1960
    - I will bin construction years and create a 'null' column during preprocessing
    - BINS = 1960-1989, 1990-1999, 2000-2009, 2010+

In [None]:
# bins=[-1,1959,1989,1999,2009,2014]

# df['year_constructed']= pd.cut(df['construction_year'],
#                               bins=bins,
#                               labels= ['Unknown','60s - 80s', '1990s', '2000s', '2010+'])

# df.drop('construction_year', axis=1, inplace=True)

#### Assessing Geographic Data

- subvillage 
- region 
- region_code 
- district_code
- lga
- ward 

In [None]:
df['region'].value_counts()

In [None]:
df['region_code'].value_counts()

In [None]:
len(df['region'].unique()) - len(df['region_code'].unique())

In [None]:
df['district_code'].value_counts()

In [None]:
len(df['district_code'].unique()) - len(df['region'].unique())

In [None]:
df['lga'].value_counts()

In [None]:
df['ward'].value_counts()

Based upon value counts, it would appear that 'region' will be the cleanest geographical data to work with 

#### Continuous Data

In [None]:
df.describe()

#### Dealing with Population

In [None]:
df[(df['population'] == 0)]

In [None]:
df['population'].value_counts()

Nearly half the data for 'population' is 0 and 1 

My best option is to bin 

In [None]:
#df[(df['population'] < 100) & (df['population'] > 1)]

In [None]:
#df[(df['population'] < 500) & (df['population'] > 1)]

In [None]:
#df[(df['population'] < 200) & (df['population'] > 1)]

In [None]:
#df[(df['population'] > 200) ]

In [None]:
# bins = [-1,1,199,30501]
# df['population_grouped'] = pd.cut(df['population'],
#                                  bins=bins,
#                                  labels=['Unknown', '<200', '>200'])

#### gps_height (Altitude)

In [None]:
# Move 'gps_height' to a new trailing column called 'altitude'
# (pop removes the old column and hands back its values in one step).
df['altitude'] = df.pop('gps_height')

In [None]:
sns.histplot(df['altitude'])

**OBSERVATION**
1. While 0 is an acceptable value for altitude, I do not believe that there are this many wells at an altitude of 0

#### Month Recorded 

Tanzania experiences its wettest season in March/April, and its driest during June/July/August 

The month the status of the well was recorded could be revealing 

In [None]:
# Parse the recording date once, then split out the calendar month and year.
recorded_on = pd.to_datetime(df['date_recorded'])
df['month_recorded'] = recorded_on.dt.month
df['year_recorded'] = recorded_on.dt.year


In [None]:
df.drop('date_recorded', axis=1, inplace=True)

In [None]:
df = df[(df['year_recorded']) > 2005]

## Revisiting Null Values 

In [None]:
df.info()

In [None]:
# Discard every row that still has a missing value in any remaining column.
df = df.dropna()

In [None]:
df.info()

In [None]:
df['source_type'].value_counts()

In [None]:
df['source_class'].value_counts()

In [None]:
df['source'].value_counts()

In [None]:
df.drop('source_type', axis=1, inplace=True)

In [None]:
df['payment_type'].value_counts()

In [None]:
df['management_group'].value_counts()

In [None]:
df['extraction_type'].value_counts()

In [None]:
df['extraction_type_group'].value_counts()

In [None]:
df.drop('extraction_type', axis=1, inplace=True)

In [None]:
df['extraction_type_class'].value_counts()

In [None]:
df['scheme_management'].value_counts()

In [None]:
df['recorded_by'].value_counts()

In [None]:
df.drop('recorded_by', axis=1, inplace=True)

In [None]:
len(df['ward'].unique())

In [None]:
df.drop('ward', axis=1, inplace=True)

In [None]:
df['lga'].value_counts()

In [None]:
df.drop('lga', axis=1, inplace=True)

# EDA

In [None]:
df.describe()

In [None]:
sns.countplot(x='status_group', data=df)

**OBSERVATION**
1. There are far more functional wells than non-functional / needs repair
2. I will need to address the class imbalances during preprocessing / modeling 

In [None]:
sns.scatterplot(x='status_group', y='amount_tsh', data=df)

In [None]:
tsh_df = df

In [None]:
tsh_df = tsh_df[(tsh_df['amount_tsh'] < 150000)]

In [None]:
sns.scatterplot(x='status_group', y='amount_tsh', data=tsh_df)

**OBSERVATION**
1. It would appear that higher amounts of static head correlates with being functional; however it is very likely that this is due to the class imbalance


In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='funder', hue='status_group', data=df)

In [None]:
df.drop('funder', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='installer', hue='status_group', data=df)

In [None]:
df.drop('installer', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='management_group', hue='status_group', data=df)

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='management', hue='status_group', data=df)

In [None]:
df.drop('management',axis=1, inplace=True)

In [None]:
sns.scatterplot(x='status_group', y='population', data=df)

In [None]:
df.groupby('status_group').population.median()

**OBSERVATION**
1. small populations may correlate with non-functional wells, as it may be more imperative that resources are spent on more populous wells 
2. functional wells may have a slightly smaller associated population due to less frequent use 

In [None]:
sns.scatterplot(x='status_group', y='altitude', data=df)

In [None]:
age_df = df[(df['well_age'] < 100)]

In [None]:
sns.scatterplot(x='status_group', y='well_age', data=age_df)

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='well_age', hue='status_group', data=df)

**OBSERVATION**
1. there are typically more non-functional wells than functional wells for wells 28 years and over 

In [None]:
df[(df['population'] > 20000)]['region']

Tanga is a large port city in the NorthEast of Tanzania, this datapoint could make sense
https://en.wikipedia.org/wiki/Tanga,_Tanzania

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='month_recorded', hue='status_group', data=df)

**OBSERVATION**
1. most of the data collected is collected in the first quarter of the year (Jan - March)
2. May, July, September, and December seem to have very few recordings 
3. June - October are the driest seasons: it would appear that there is a spike in recordings halfway through the dry season

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='year_recorded', hue='status_group', data=df)

In [None]:
df.drop('year_recorded', axis=1, inplace=True)

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='quality_group', hue='status_group', data=df)

In [None]:
quality_df = df[(df['quality_group'] != 'good')]

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='quality_group', hue='status_group', data=quality_df)

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='quantity', hue='status_group', data=df)

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='basin', hue='status_group', data=df)

In [None]:
df.info()

In [None]:
plt.figure(figsize=(10,10))
sns.countplot(y='scheme_management', hue='status_group', data=df)

In [None]:
plt.figure(figsize=(20,15))
sns.scatterplot(x='longitude', y='latitude', hue='status_group', data=df)

In [None]:
df.describe()

In [None]:
df[df['longitude'] == 0]

In [None]:
df = df[df['longitude'] > 0]

In [None]:
map_df = df

In [None]:
map_df = map_df[(map_df['longitude'] > 0)]

In [None]:
map_df[(map_df['longitude'] == 0)]

In [None]:
plt.figure(figsize=(20,15))
sns.scatterplot(x='longitude', y='latitude', hue='status_group', data=map_df)

In [None]:
# 'population_grouped' is never added to the frame (the cell that would bin population is
# commented out), so plotting it directly fails with a missing-column error. Build the
# grouping here, on a copy used only for this plot, with the same cut points and labels
# as the commented-out binning: 0-1 -> 'Unknown', 2-199 -> '<200', 200 and up -> '>200'.
population_bins = [-1, 1, 199, np.inf]
population_labels = ['Unknown', '<200', '>200']
population_map_df = map_df.assign(
    population_grouped=pd.cut(map_df['population'],
                              bins=population_bins,
                              labels=population_labels)
)

plt.figure(figsize=(20,15))
sns.scatterplot(x='longitude', y='latitude', hue='population_grouped', data=population_map_df)

In [None]:
plt.figure(figsize=(20,15))
sns.scatterplot(x='longitude', y='latitude', hue='region', data=map_df)

In [None]:
plt.figure(figsize=(20,15))
sns.scatterplot(x='longitude', y='latitude', hue='basin', data=map_df)

In [None]:
df

# Modeling 


In [None]:
# Integer codes for the target: 1 = functional, 2 = non functional, 3 = functional needs repair.
status_codes = {'functional': 1, 'non functional': 2, 'functional needs repair': 3}

# One pass over the target column instead of three passes over the whole frame.
# replace() leaves already-encoded values untouched, so re-running the cell is harmless.
# astype(int) pins an integer dtype instead of relying on replace() to downcast the column,
# and raises if a label outside the three known ones is still present.
df = df.assign(status_group=df['status_group'].replace(status_codes).astype(int))

### Logistic Regression - Baseline 

In [None]:
# Target: the encoded well status.
y = df['status_group']

# Features: every other column, with categorical columns expanded into dummy indicators.
# drop_first=True omits the first level of each categorical column.
X = pd.get_dummies(df.drop(columns='status_group'), drop_first=True)

In [None]:
#creating training and testing data
# 25% of the rows are held out for testing; random_state=2 fixes the shuffle so the
# same split is produced on every run.
# NOTE(review): the split is not stratified on y, so class proportions in the two
# splits can differ slightly from the full data - confirm this is acceptable given
# the class imbalance noted earlier.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state=2)

In [None]:
# Learn the scaling statistics from the training split only, then apply that same
# transform to both splits so the test data never influences the scaler.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))

In [None]:
# #fitting
# lr_base = LogisticRegression(solver='lbfgs', random_state=2)
# lr_base_model = lr_base.fit(X_train, y_train)

In [None]:
# lr_base = LogisticRegression(solver='newton-cg', random_state=2)
# lr_base_model = lr_base.fit(X_train, y_train)

In [None]:
lr_base = LogisticRegression(solver='sag', random_state=2, max_iter=1000)
lr_base_model = lr_base.fit(X_train_scaled, y_train)

In [None]:
# lr_base = LogisticRegression(solver='saga', random_state=2)
# lr_base_model = lr_base.fit(X_train, y_train)

In [None]:
print(confusion_matrix(y_test, lr_base.predict(X_test_scaled)))

In [None]:
plot_confusion_matrix(lr_base, X_test_scaled, y_test,
                     cmap=plt.cm.Blues)
plt.show()

In [None]:
y_test.value_counts()

In [None]:
print(classification_report(y_test, lr_base.predict(X_test_scaled)))

**OBSERVATION** 
1. As expected, class imbalance has made the baseline model guess functional (1) very frequently, and functional in need of repair (3) very little 

### Logistic Regression - Gridsearch Optimization

In [None]:
sns.countplot(x='status_group', data=df)

In [None]:
y.value_counts()

In [None]:
18017/26127

In [None]:
26127/18017

In [None]:
3160/26127

In [None]:
26127/3160

In [None]:
# Same solver and seed as the baseline, with a larger iteration budget.
lr_v2 = LogisticRegression(solver='sag', random_state=2, max_iter=4000)

# Manual weights for the two minority classes; they appear to be rounded from the
# class-count ratios computed in the cells above (26127/18017 and 26127/3160).
# NOTE(review): class 1 is not listed - presumably it falls back to a weight of 1; confirm.
weight_dict = {2: 1.5, 3:8}

# Candidates: no weighting, the 'balanced' option, or the manual weights,
# each combined with three values of C.
param_grid = {
    'class_weight': [None, 'balanced', weight_dict],
    'C': [1.0, 500, 1e12]
}

# 2-fold cross-validated search over the 3 x 3 grid, fitted on the scaled training split.
lr_v2_model = GridSearchCV(estimator = lr_v2, param_grid=param_grid, cv=2)
lr_v2_model.fit(X_train_scaled, y_train)

In [None]:
lr_v2_model.best_params_

In [None]:
param_grid

## Decision Tree Baseline 

In [None]:
dt_base = DecisionTreeClassifier()

dt_base_model = dt_base.fit(X_train_scaled, y_train)

In [None]:
# fig, axes = plt.subplots(nrows = 1,ncols = 1, figsize = (3,3), dpi=300)
# tree.plot_tree(dt_base,
#                feature_names = X.columns, 
#                class_names=np.unique(y).astype('str'),
#                filled = True)
# plt.show()

In [None]:
print(classification_report(y_train, dt_base.predict(X_train_scaled)))

**INTERPRETATION**
1. Extremely Overfit. The model knows the training data completely 

In [None]:
fig, axes = plt.subplots(nrows = 1,ncols = 1, figsize = (20,15), dpi=300)
tree.plot_tree(dt_base,
               feature_names = X.columns, 
               class_names=np.unique(y).astype('str'),
               filled = True)
plt.show()

**OBSERVATION**
1. The long tails indicate the overfitting

In [None]:
plot_confusion_matrix(dt_base, X_test_scaled, y_test,
                     cmap=plt.cm.Blues)
plt.show()

**INTERPRETATION**
1. 2/3 is worst case scenario (predicted non functional, but is actually in need of repair)
2. 3/1 is pretty bad (predicted in need of repair, but is actually working) 

In [None]:
print(classification_report(y_test, dt_base.predict(X_test_scaled)))

**INTERPRETATION**
1. comparing this test report to the train report, Accuracy = 0.76 (a drop of 23%) tells me that the model is very overfit 

In [None]:
print(dt_base.tree_.max_depth)

## Decision Tree - Gridsearch

In [None]:
# random_state pins the tree's internal randomness so the search is repeatable
# (same seed as the other seeded estimators in this notebook).
dt_v2 = DecisionTreeClassifier(random_state=2)

# Three manual weightings that progressively favour the minority classes (2 and 3).
weight_dict = {1: 1, 2: 1.5, 3: 8}
weight_dict2 = {1: 0.5, 2: 3, 3: 10}
weight_dict3 = {1: 0.25, 2: 6, 3: 12}

param_grid = {
    'criterion' : ['gini', 'entropy'],
    'class_weight': [weight_dict, weight_dict2, weight_dict3, 'balanced'],
    'max_depth' : [10, 20, 30],
    # Integer sample counts. The first entry used to be the float 1.0, which scikit-learn
    # interprets as a fraction (100% of the samples needed to split a node), so those
    # candidates could never grow past a single split. 2 is the smallest valid count.
    'min_samples_split' : [2, 3, 5],
    'min_samples_leaf' : [10, 15, 30]
}

# 3-fold cross-validated search, fitted on the scaled training split.
dt_v2_model = GridSearchCV(estimator = dt_v2, param_grid=param_grid, cv=3)
dt_v2_model.fit(X_train_scaled, y_train)

In [None]:
print(dt_v2_model.best_params_)

In [None]:
print(classification_report(y_train, dt_v2_model.predict(X_train_scaled)))

In [None]:
plot_confusion_matrix(dt_v2_model, X_test_scaled, y_test,
                     cmap=plt.cm.Blues)
plt.show()

In [None]:
print(classification_report(y_test, dt_v2_model.predict(X_test_scaled)))

In [None]:
# Second tree search: the split criterion is now fixed to entropy and only the
# remaining hyperparameters are searched.
dt_v3 = DecisionTreeClassifier(criterion= 'entropy')

# Two manual weightings that differ only in the weight given to class 1.
weight_dict = {1: 1, 2: 1.5, 3: 8}
weight_dict2 = {1: 0.5, 2: 1.5, 3: 8}

# NOTE(review): the ranges look like a refinement around the previous search's best
# values - confirm against that search's best_params_.
param_grid = {
    'class_weight': [weight_dict, weight_dict2],
    'max_depth' : [25, 30, 35],
    'min_samples_split' : [3, 5],
    'min_samples_leaf' : [9, 10, 11]
}

# 3-fold cross-validated search, fitted on the scaled training split.
dt_v3_model = GridSearchCV(estimator = dt_v3, param_grid=param_grid, cv=3)
dt_v3_model.fit(X_train_scaled, y_train)

In [None]:
print(dt_v3_model.best_params_)

In [None]:
plot_confusion_matrix(dt_v3_model, X_test_scaled, y_test,
                     cmap=plt.cm.Blues)
plt.show()

In [None]:
print(classification_report(y_test, dt_v3_model.predict(X_test_scaled)))

In [None]:
# Third tree search: criterion and max_depth are now fixed (entropy, depth 30);
# only the class weights and the two min_samples_* settings are searched.
dt_v4 = DecisionTreeClassifier(criterion= 'entropy', max_depth=30)

# Two manual weightings; the second gives more weight to classes 2 and 3.
weight_dict = {1: 1, 2: 1.5, 3: 8}
weight_dict2 = {1: 1, 2: 3, 3: 10}

param_grid = {
    'class_weight': [weight_dict, weight_dict2],
    'min_samples_split' : [2, 3],
    'min_samples_leaf' : [7, 8, 9]
}

# 3-fold cross-validated search, fitted on the scaled training split.
dt_v4_model = GridSearchCV(estimator = dt_v4, param_grid=param_grid, cv=3)
dt_v4_model.fit(X_train_scaled, y_train)

In [None]:
print(dt_v4_model.best_params_)

In [None]:
print(classification_report(y_test, dt_v4_model.predict(X_test_scaled)))

### Decision Tree - SMOTE

Thus far I have only been dealing with class imbalance by adjusting 'class_weight' in gridsearch. While this is a valid technique, I am seeing little improvement. Using SMOTE to create new synthetic data, rather than simply giving more weight to the minority classes (2 and 3), might yield useful insight.

In [None]:
#creating training and testing data
# X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, test_size= .25, random_state=2)

In [None]:
# Seeded so the synthetic samples are the same on every run.
sm = SMOTE(random_state=2)

# Only the training split is resampled; X_test / y_test are left as they were.
# The input here is the unscaled X_train - scaling is applied afterwards.
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)


In [None]:
#scaling 
# A new scaler is fitted on the SMOTE-resampled training data.
# NOTE: this rebinds both `scaler` and `X_test_scaled`, replacing the objects created by
# the earlier scaling cell. From here on X_test_scaled is scaled with the SMOTE-fitted
# statistics, while X_train_scaled still holds the output of the original scaler - any
# later cell that uses the two together is mixing two different scalings.
scaler = StandardScaler()

X_train_scaled_sm = scaler.fit_transform(X_train_sm)
X_test_scaled = scaler.transform(X_test)

In [None]:
dt_smote = DecisionTreeClassifier(max_depth=30)
dt_smote_model = dt_smote.fit(X_train_scaled_sm, y_train_sm)

In [None]:
print(classification_report(y_train_sm, dt_smote_model.predict(X_train_scaled_sm)))

In [None]:
print(dt_smote.tree_.max_depth)

In [None]:
print(classification_report(y_test, dt_smote_model.predict(X_test_scaled)))

### Decision Tree - Smote(2)

In [None]:
# random_state pins the tree's internal randomness so the search is repeatable
# (same seed as the other seeded estimators in this notebook).
dt_smote_v2 = DecisionTreeClassifier(random_state=2)


param_grid = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [10, 20, 25],
    # Integer sample counts. The first entry used to be the float 1.0, which scikit-learn
    # interprets as a fraction (100% of the samples needed to split a node), so those
    # candidates could never grow past a single split. 2 is the smallest valid count.
    'min_samples_split' : [2, 10, 20, 30],
    'min_samples_leaf' : [5, 10, 15]
}

# 3-fold cross-validated search on the SMOTE-resampled, scaled training data.
# No class_weight is searched here because the resampling already balanced the classes.
dt_smote_v2_model = GridSearchCV(estimator = dt_smote_v2, param_grid=param_grid, cv=3)
dt_smote_v2_model.fit(X_train_scaled_sm, y_train_sm)

In [None]:
print(dt_smote_v2_model.best_params_)

In [None]:
print(classification_report(y_train_sm, dt_smote_v2_model.predict(X_train_scaled_sm)))

In [None]:
print(classification_report(y_test, dt_smote_v2_model.predict(X_test_scaled)))

In [None]:
dt_smote_v4 = DecisionTreeClassifier(max_depth=25, min_samples_leaf=5, min_samples_split=10)

In [None]:
dt_smote_v4.fit(X_train_scaled_sm, y_train_sm)

In [None]:
print(classification_report(y_train_sm, dt_smote_v4.predict(X_train_scaled_sm)))

In [None]:
print(classification_report(y_test, dt_smote_v4.predict(X_test_scaled)))

**INTERPRETATION**
1. 3 is still extremely overfit, but other classes are performing okay (still some overfitting)
2. Looking at overall performance across all classes, this is my best Decision Tree

## Random Forest - Baseline 

In [None]:
# The SMOTE section re-fitted `scaler` on the resampled data and overwrote X_test_scaled
# with its output, while X_train_scaled still comes from the original scaler. Re-derive
# both splits from a single scaler fitted on X_train so that every forest below is trained
# and evaluated on identically scaled features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Default forest as a baseline; random_state makes the fitted forest repeatable.
forest_base = RandomForestClassifier(random_state=2)
forest_base_model = forest_base.fit(X_train_scaled, y_train)

In [None]:
print(classification_report(y_train, forest_base_model.predict(X_train_scaled)))

In [None]:
print(classification_report(y_test, forest_base_model.predict(X_test_scaled)))

In [None]:
# A random forest has no single `tree_` attribute (that lookup raises AttributeError);
# report the depth of the deepest tree among its fitted members instead.
print(max(member.tree_.max_depth for member in forest_base.estimators_))

**INTERPRETATION**
1. Completely overfit, just like the Decision Tree basemodel 

In [None]:
# Forest settings chosen by hand: entropy criterion, manual class weights favouring the
# minority classes, and depth / leaf-size limits to rein in the overfitting seen in the baseline.
forest_v2_settings = {
    'criterion': 'entropy',
    'class_weight': {1: 1, 2: 1.5, 3: 8},
    'max_depth': 35,
    'min_samples_leaf': 15,
    'min_samples_split': 7,
}
forest_v2 = RandomForestClassifier(**forest_v2_settings)

forest_v2_model = forest_v2.fit(X_train_scaled, y_train)

In [None]:
print(classification_report(y_train, forest_v2_model.predict(X_train_scaled)))

In [None]:
print(classification_report(y_test, forest_v2_model.predict(X_test_scaled)))

In [None]:
plot_confusion_matrix(forest_v2_model, X_test_scaled, y_test,
                     cmap=plt.cm.Blues)
plt.show()

In [None]:
# Same fixed settings as forest_v2; only the two parameters in param_grid are searched.
forest_v3 = RandomForestClassifier(criterion='entropy', 
                                   class_weight= {1: 1, 2: 1.5, 3: 8}, 
                                   max_depth=35, 
                                   min_samples_leaf= 15, 
                                   min_samples_split= 7,)

# max_features: number of features considered at each split; n_estimators: number of trees.
# NOTE(review): the integer max_features values assume the dummy-encoded feature matrix
# has at least 100 columns - confirm against X.shape.
param_grid = {
    'max_features': [10, 50, 100],
    'n_estimators': [20, 50, 100]
}

# 3-fold cross-validated search over the 3 x 3 grid, fitted on the scaled training split.
forest_v3_model = GridSearchCV(estimator = forest_v3, param_grid=param_grid, cv=3)
forest_v3_model.fit(X_train_scaled, y_train)

In [None]:
print(forest_v3_model.best_params_)

In [None]:
print(classification_report(y_train, forest_v3_model.predict(X_train_scaled)))

In [None]:
print(classification_report(y_test, forest_v3_model.predict(X_test_scaled)))

In [None]:
plot_confusion_matrix(forest_v3_model, X_test_scaled, y_test,
                     cmap=plt.cm.Blues)
plt.show()