# Setup and load preprocessed data

In [None]:
# Name of the classifier to train; passed to createModel(), which handles the
# four names listed in the form field on this line.
option="Random Forest" #@param['XGBoost','Random Forest','Ada Boost','Logistic Regression']

In [None]:
# Render matplotlib figures inside the notebook output cells.
%matplotlib inline

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import random
# Register pandas' date/time converters with matplotlib.
pd.plotting.register_matplotlib_converters()
# NOTE(review): matplotlib.pyplot is already imported a few lines above.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
# Default figure size (width, height) used by every plot in this notebook.
plt.rcParams["figure.figsize"] = (20,10)

In [None]:
# Mount Google Drive at /content/drive; the CSV paths defined below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Drive folder holding the input CSVs (note the trailing slash: file names
# are appended to it directly).
base_path = '/content/drive/My Drive/ML/'
train_features_file = f'{base_path}preprocessed_train.csv'
train_labels_file = f'{base_path}train_set_labels.csv'
test_features_file = f'{base_path}preprocessed_test.csv'

In [None]:
# Read the training features, the training labels and the test features.
train_df, labels_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_features_file, train_labels_file, test_features_file)
)

# Target column used for fitting.
labels = labels_df['status_group']

# Keep the test ids for the submission file, then remove them from the
# feature matrix handed to the model.
results_df = pd.DataFrame({'id': test_df['id']})
test_df = test_df.drop(columns=['id'])

In [None]:
# Inspect the label table (id -> status_group).
# NOTE(review): the 'Unnamed: 0' column in the output is presumably an index
# that was written into the CSV — TODO confirm.
labels_df

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional
...,...,...
59395,60739,functional
59396,27263,functional
59397,37057,functional
59398,31282,functional


In [None]:
# Summarise row count, column count, dtypes and memory use of the training features.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Columns: 103 entries, amount_tsh to waterpoint_type_other
dtypes: bool(2), float64(5), int64(96)
memory usage: 45.9 MB


# Model Creation

In [None]:
# Random integer between 1 and 32000 (inclusive), presumably meant as a seed.
# NOTE(review): the `random` module is not seeded here, so the value changes on
# every kernel run; use a fixed value if runs must be comparable.
rnd = random.randint(1, 32000) 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb

def createModel(option, random_state=None):
  """Build a new, unfitted classifier for the requested algorithm.

  Args:
    option: Algorithm name. One of 'Logistic Regression', 'Random Forest',
      'Ada Boost' or 'XGBoost'.
    random_state: Optional seed forwarded to the estimator. When None, each
      estimator keeps the seeding it had before this parameter existed
      (unseeded, except XGBoost which stays at 0).

  Returns:
    A freshly constructed estimator; nothing is fitted here.

  Raises:
    ValueError: If `option` is not one of the supported names.
  """
  if option == 'Logistic Regression':
    return LogisticRegression(multi_class='multinomial', random_state=random_state)

  if option == 'Random Forest':
    # max_features='sqrt' is what 'auto' meant for classifiers, and
    # min_impurity_split was only ever passed as its default (None); both old
    # spellings are rejected by newer scikit-learn releases.
    return RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=6,
                                  min_samples_leaf=1, max_features='sqrt',
                                  min_impurity_decrease=0.0, bootstrap=True,
                                  warm_start=True, random_state=random_state)

  if option == 'Ada Boost':
    return AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=random_state)

  if option == 'XGBoost':
    return xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bytree=0.4, gamma=0.0,
                             importance_type='gain', learning_rate=0.05, max_depth=3,
                             min_child_weight=7, n_estimators=100, n_jobs=1, num_class=3,
                             objective='multi:softmax',
                             random_state=0 if random_state is None else random_state,
                             reg_lambda=1, scale_pos_weight=1, subsample=1, verbosity=1)

  # Previously an unknown name fell through every `if` and failed on an
  # unbound local; fail with a message that names the bad value instead.
  raise ValueError(
      "Unknown model option %r; expected one of 'Logistic Regression', "
      "'Random Forest', 'Ada Boost', 'XGBoost'" % (option,))

# Train sample model

In [None]:
# Hold out 20% of the training rows for evaluation. The shuffle is seeded with
# `rnd` (drawn in the "Model Creation" section) so that re-running this cell in
# the same session reproduces the same split.
X_train, X_eval, y_train, y_eval = train_test_split(
    train_df, labels, test_size=0.2, shuffle=True, random_state=rnd)

In [None]:
# Fit the selected classifier on the training split only.
model = createModel(option).fit(X_train, y_train)

# Score it on the held-out rows; average=None returns one F1 value per class.
predictions = model.predict(X_eval)
f1_score(y_true=y_eval, y_pred=predictions, average=None)

array([0.84749232, 0.42773723, 0.80693297])

# Train final model

In [None]:
# Rebuild the selected classifier and fit it on every training row.
# `labels` must still be the status_group column bound in the data-loading
# cell; the PDP section further down rebinds that name.
model = createModel(option)
model.fit(X=train_df, y=labels)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=True)

In [None]:
# Predict a status_group for every row of the test features (the id column
# was dropped from test_df when the data was loaded).
final_set = model.predict(test_df)

In [None]:
# Quick look at the predicted labels.
final_set

array(['functional', 'functional', 'functional', ..., 'functional',
       'functional', 'non functional'], dtype=object)

In [None]:
# Attach the predictions to the saved test ids and write the submission file.
# The array is assigned directly (positionally): wrapping it in a DataFrame
# made the assignment align on index labels, which is only correct while
# results_df happens to carry a 0..n-1 index.
results_df['status_group'] = final_set
results_df.to_csv('results.csv', index=False)

# Partial dependence plots (PDP)

In [None]:
from sklearn.inspection import plot_partial_dependence
from sklearn.datasets import make_friedman1
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
from tensorflow.keras.utils import to_categorical
# Integer-encode the three classes. A new Series is built instead of calling
# replace(..., inplace=True) on a column selection: that chained in-place
# update does not reach labels_df under pandas copy-on-write, and it made the
# cell depend on having mutated a frame that was displayed earlier.
status_to_int = {'functional': 2, 'functional needs repair': 1, 'non functional': 0}
labels = labels_df['status_group'].map(status_to_int)
# .map leaves NaN for any value missing from the mapping; fail loudly if so.
assert labels.notna().all(), 'unexpected status_group value'

In [None]:
# One-hot encode the integer class ids into 3 columns.
# NOTE(review): this rebinds `labels` from its own previous value, so running
# the cell a second time would encode the already one-hot array again; re-run
# the preceding label cell first.
labels = to_categorical(labels, 3)

In [None]:
# Sanity check: one row per training sample, one column per class.
labels.shape

(59400, 3)

In [None]:
# Column of the one-hot labels to explain; the label mapping above encodes
# 'functional' as class 2.
pdp_target = 2
# Positions (in train_df's column order) of the features to plot.
pdp_features = [1, 5]

# Fit two regressors on the one-hot labels, i.e. one output per class.
est1 = LinearRegression().fit(train_df, labels)
est2 = RandomForestRegressor().fit(train_df, labels)

# With several outputs plot_partial_dependence cannot pick one on its own and
# raises ValueError unless `target` says which output to plot.
disp1 = plot_partial_dependence(est1, train_df, pdp_features, target=pdp_target)
# Draw the random-forest curves on the same axes for direct comparison.
disp2 = plot_partial_dependence(est2, train_df, pdp_features, target=pdp_target,
                                ax=disp1.axes_)