In [1]:
# Import Libraries
import pandas as pd
import numpy as np

# plotting
#import matplotlib.pyplot as plt
#import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

## Model Building

### Training Data

In [2]:
# Load Data:
train = pd.read_pickle("train_clean.pickle")
train.head()

Unnamed: 0,amount_tsh,gps_height,num_private,basin,region,district_code_recoded,population,public_meeting,recorded_by,scheme_management_recoded,...,management_group_recoded,payment_recoded,water_quality_recoded,source_recoded,source_type_recoded,source_class_recoded,waterpoint_type_recoded,waterpoint_type_group_recoded,quantity_recoded,status_group
0,6000.0,1390,0,Lake Nyasa,Iringa,Other Districts,109,True,GeoData Consultants Ltd,VWC,...,user-group,other,soft,spring,spring,groundwater,communal standpipe,communal standpipe,enough,functional
1,0.0,1399,0,Lake Victoria,Mara,District Codes 1-4,280,True,GeoData Consultants Ltd,Other,...,user-group,never pay,soft,other,other,surface,communal standpipe,communal standpipe,insufficient,functional
2,25.0,686,0,Pangani,Manyara,District Codes 1-4,250,True,GeoData Consultants Ltd,VWC,...,user-group,other,soft,other,other,surface,other,communal standpipe,enough,functional
3,0.0,263,0,Ruvuma / Southern Coast,Mtwara,Other Districts,58,True,GeoData Consultants Ltd,VWC,...,user-group,never pay,soft,other,borehole,groundwater,other,communal standpipe,other,non functional
4,0.0,0,0,Lake Victoria,Kagera,District Codes 1-4,0,True,GeoData Consultants Ltd,Other,...,other,never pay,soft,other,other,surface,communal standpipe,communal standpipe,other,functional


In [3]:
# Any missing values?
train.columns[train.isnull().any()]

Index([], dtype='object')

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   amount_tsh                     59400 non-null  float64
 1   gps_height                     59400 non-null  int64  
 2   num_private                    59400 non-null  int64  
 3   basin                          59400 non-null  object 
 4   region                         59400 non-null  object 
 5   district_code_recoded          59400 non-null  object 
 6   population                     59400 non-null  int64  
 7   public_meeting                 59400 non-null  bool   
 8   recorded_by                    59400 non-null  object 
 9   scheme_management_recoded      59400 non-null  object 
 10  permit                         59400 non-null  bool   
 11  construction_year              59400 non-null  int64  
 12  extraction_type_recoded        59400 non-null 

In [18]:
# Change object data type into character data type
X_numerical = train.select_dtypes(exclude=['object','bool'])

print("X_numerical - Rows & Columns: ", X_numerical.shape)
X_numerical.head(1)

X_numerical - Rows & Columns:  (59400, 5)


Unnamed: 0,amount_tsh,gps_height,num_private,population,construction_year
0,6000.0,1390,0,109,1999


In [47]:
# Change object data type into character data type
X_categorical = train.select_dtypes(include=['object','bool'])
X_categorical = X_categorical.drop('status_group', axis=1)

print("X_categorical - Rows & Columns: ", X_categorical.shape)
X_categorical.head(1)

X_categorical - Rows & Columns:  (59400, 20)


Unnamed: 0,basin,region,district_code_recoded,public_meeting,recorded_by,scheme_management_recoded,permit,extraction_type_recoded,extraction_type_group,extraction_type_class,management_recoded,management_group_recoded,payment_recoded,water_quality_recoded,source_recoded,source_type_recoded,source_class_recoded,waterpoint_type_recoded,waterpoint_type_group_recoded,quantity_recoded
0,Lake Nyasa,Iringa,Other Districts,True,GeoData Consultants Ltd,VWC,False,gravity,gravity,gravity,vwc,user-group,other,soft,spring,spring,groundwater,communal standpipe,communal standpipe,enough


In [20]:
# Change object data type into character data type
Y_categorical = train['status_group']

print("Y_categorical - Rows & Columns: ", Y_categorical.shape)
Y_categorical.head()

Y_categorical - Rows & Columns:  (59400,)


0        functional
1        functional
2        functional
3    non functional
4        functional
Name: status_group, dtype: object

### Option 1: Using Get Dummies

In [67]:
#X_categorical.permit = (X_categorical.permit.astype(str)).str.get_dummies().replace({1:'true', 0:'false'})
#X_categorical.public_meeting = (X_categorical.public_meeting.astype(str)).str.get_dummies().replace({1:'true', 0:'false'})

X_categorical_tf = pd.get_dummies(X_categorical, sparse=True, drop_first = True, prefix="dmy", prefix_sep="_")
X_categorical_tf.head()

Unnamed: 0,public_meeting,permit,dmy_Lake Nyasa,dmy_Lake Rukwa,dmy_Lake Tanganyika,dmy_Lake Victoria,dmy_Pangani,dmy_Rufiji,dmy_Ruvuma / Southern Coast,dmy_Wami / Ruvu,...,dmy_shallow well,dmy_spring,dmy_other,dmy_surface,dmy_hand pump,dmy_other.1,dmy_hand pump.1,dmy_other.2,dmy_insufficient,dmy_other.3
0,True,False,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,True,True,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,True,True,0,0,0,0,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,True,True,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1
4,True,True,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [68]:
X_categorical_tf.shape

(59400, 69)

Some machine learning models have trouble when the variables are of different size (0-100, vs 0-1000000). To deal with that we can scale the data. Here we will use scikit learn's Standard Scaler which removes the mean and scales to unit variance. Here I will create a scaler using all the training numerical fields.

In [69]:
X_numerical.columns

Index(['amount_tsh', 'gps_height', 'num_private', 'population',
       'construction_year'],
      dtype='object')

In [70]:
from sklearn.preprocessing import StandardScaler

scaler  = StandardScaler()
scaler.fit(X_numerical)

StandardScaler(copy=True, with_mean=True, with_std=True)

We will need this scaler for the test data, so let's save it using a package called `pickle`.

In [71]:
import pickle
scalerfile = 'scaler.sav' #'scaler.csv'
pickle.dump(scaler, open(scalerfile, 'wb'))

In [72]:
X_numerical_tf = scaler.transform(X_numerical)

In [73]:
X_numerical_tf.shape

(59400, 5)

In [74]:
X_numerical_tf

array([[ 1.89566509,  1.04125207, -0.03874931, -0.15039928,  0.52666618],
       [-0.10597003,  1.05423701, -0.03874931,  0.21228981,  1.49940173],
       [-0.09762988,  0.02554104, -0.03874931,  0.14866014,  1.41097123],
       ...,
       [-0.10597003, -0.96420011, -0.03874931, -0.38158706, -0.62293038],
       [-0.10597003, -0.96420011, -0.03874931, -0.38158706, -0.62293038],
       [-0.10597003, -0.68863079, -0.03874931, -0.06343874,  0.7919577 ]])

In [75]:
X_numerical_tf = pd.DataFrame(X_numerical_tf, columns=X_numerical.columns)

In [76]:
X_numerical_tf.head()

Unnamed: 0,amount_tsh,gps_height,num_private,population,construction_year
0,1.895665,1.041252,-0.038749,-0.150399,0.526666
1,-0.10597,1.054237,-0.038749,0.21229,1.499402
2,-0.09763,0.025541,-0.038749,0.14866,1.410971
3,-0.10597,-0.584751,-0.038749,-0.25857,-0.62293
4,-0.10597,-0.9642,-0.038749,-0.381587,-0.62293


In [77]:
#concat with original data
df_X_transform = pd.concat([X_numerical_tf, X_categorical_tf], axis=1)

In [78]:
df_X_transform.shape

(59400, 74)

In [79]:
df_X_transform.head()

Unnamed: 0,amount_tsh,gps_height,num_private,population,construction_year,public_meeting,permit,dmy_Lake Nyasa,dmy_Lake Rukwa,dmy_Lake Tanganyika,...,dmy_shallow well,dmy_spring,dmy_other,dmy_surface,dmy_hand pump,dmy_other.1,dmy_hand pump.1,dmy_other.2,dmy_insufficient,dmy_other.3
0,1.895665,1.041252,-0.038749,-0.150399,0.526666,True,False,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,-0.10597,1.054237,-0.038749,0.21229,1.499402,True,True,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,-0.09763,0.025541,-0.038749,0.14866,1.410971,True,True,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,-0.10597,-0.584751,-0.038749,-0.25857,-0.62293,True,True,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,-0.10597,-0.9642,-0.038749,-0.381587,-0.62293,True,True,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [96]:
# Save the column names
df_X_transform_columns = list(df_X_transform.columns)
df_X_transform_columns

['amount_tsh',
 'gps_height',
 'num_private',
 'population',
 'construction_year',
 'public_meeting',
 'permit',
 'dmy_Lake Nyasa',
 'dmy_Lake Rukwa',
 'dmy_Lake Tanganyika',
 'dmy_Lake Victoria',
 'dmy_Pangani',
 'dmy_Rufiji',
 'dmy_Ruvuma / Southern Coast',
 'dmy_Wami / Ruvu',
 'dmy_Dar es Salaam',
 'dmy_Dodoma',
 'dmy_Iringa',
 'dmy_Kagera',
 'dmy_Kigoma',
 'dmy_Kilimanjaro',
 'dmy_Lindi',
 'dmy_Manyara',
 'dmy_Mara',
 'dmy_Mbeya',
 'dmy_Morogoro',
 'dmy_Mtwara',
 'dmy_Mwanza',
 'dmy_Pwani',
 'dmy_Rukwa',
 'dmy_Ruvuma',
 'dmy_Shinyanga',
 'dmy_Singida',
 'dmy_Tabora',
 'dmy_Tanga',
 'dmy_Other Districts',
 'dmy_VWC',
 'dmy_other',
 'dmy_gravity',
 'dmy_india mark ii',
 'dmy_india mark iii',
 'dmy_mono',
 'dmy_nira/tanira',
 'dmy_other',
 'dmy_other handpump',
 'dmy_other motorpump',
 'dmy_rope pump',
 'dmy_submersible',
 'dmy_swn 80',
 'dmy_wind-powered',
 'dmy_handpump',
 'dmy_motorpump',
 'dmy_other',
 'dmy_rope pump',
 'dmy_submersible',
 'dmy_wind-powered',
 'dmy_vwc',
 'dmy_use

In [99]:
from sklearn.preprocessing import LabelEncoder

# one hot encode target variable
label_encoder = LabelEncoder()
df_y_transform = label_encoder.fit_transform(Y_categorical)

# summarize the transformed data
print('Input All Xs', df_X_transform.shape)
print('')

print('Target Y', df_y_transform.shape)
print(df_y_transform[:,])
print('')

df_Y_categorical['status_group_cat'] = labelencoder.fit_transform(df_Y_categorical['status_group'])
print(df_Y_categorical.groupby(['status_group_cat','status_group'])['status_group'].count())

Input All Xs (59400, 74)

Target Y (59400,)
[0 0 0 ... 0 0 0]

status_group_cat  status_group           
0                 functional                 32259
1                 functional needs repair     4317
2                 non functional             22824
Name: status_group, dtype: int64


### Option 2: Using One-Hot Encoding

In [90]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# one hot encode input variables
onehot_encoder = OneHotEncoder(sparse=False)
df_X_transform2 = onehot_encoder.fit_transform(X_categorical)

# one hot encode target variable
label_encoder = LabelEncoder()
df_y_transform2 = label_encoder.fit_transform(Y_categorical)

# summarize the transformed data
print('Input Categorical Xs', df_X_transform2.shape)
print(df_X_transform2[:5, :])

print('Target Y', df_y_transform2.shape)
print(df_y_transform2[:,])

Input Categorical Xs (59400, 89)
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1.
  0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0.
  0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 0.
  0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 

In [91]:
# Converting the array into a pandas dataframe
# Create a Pandas DataFrame of the hot encoded columns
df_X_transform2_ = pd.DataFrame(df_X_transform2, columns=onehot_encoder.get_feature_names())

#concat with original data
df_X_transform_onehot = pd.concat([X_numerical_tf, df_X_transform2_], axis=1)

In [92]:
df_X_transform2_.shape

(59400, 89)

In [93]:
print('Input All Xs', df_X_transform_onehot.shape)
df_X_transform_onehot.head()

Input All Xs (59400, 94)


Unnamed: 0,amount_tsh,gps_height,num_private,population,construction_year,x0_Internal,x0_Lake Nyasa,x0_Lake Rukwa,x0_Lake Tanganyika,x0_Lake Victoria,...,x16_surface,x17_communal standpipe,x17_hand pump,x17_other,x18_communal standpipe,x18_hand pump,x18_other,x19_enough,x19_insufficient,x19_other
0,1.895665,1.041252,-0.038749,-0.150399,0.526666,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,-0.10597,1.054237,-0.038749,0.21229,1.499402,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-0.09763,0.025541,-0.038749,0.14866,1.410971,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
3,-0.10597,-0.584751,-0.038749,-0.25857,-0.62293,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
4,-0.10597,-0.9642,-0.038749,-0.381587,-0.62293,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [94]:
# summarize the dataset
from collections import Counter

print(df_X_transform_onehot.shape, df_y_transform2.shape)
print(Counter(df_y_transform2))

(59400, 94) (59400,)
Counter({0: 32259, 2: 22824, 1: 4317})


In [95]:
# Save the column names
df_X_transform_columns = list(df_X_transform_onehot.columns)
df_X_transform_columns

['amount_tsh',
 'gps_height',
 'num_private',
 'population',
 'construction_year',
 'x0_Internal',
 'x0_Lake Nyasa',
 'x0_Lake Rukwa',
 'x0_Lake Tanganyika',
 'x0_Lake Victoria',
 'x0_Pangani',
 'x0_Rufiji',
 'x0_Ruvuma / Southern Coast',
 'x0_Wami / Ruvu',
 'x1_Arusha',
 'x1_Dar es Salaam',
 'x1_Dodoma',
 'x1_Iringa',
 'x1_Kagera',
 'x1_Kigoma',
 'x1_Kilimanjaro',
 'x1_Lindi',
 'x1_Manyara',
 'x1_Mara',
 'x1_Mbeya',
 'x1_Morogoro',
 'x1_Mtwara',
 'x1_Mwanza',
 'x1_Pwani',
 'x1_Rukwa',
 'x1_Ruvuma',
 'x1_Shinyanga',
 'x1_Singida',
 'x1_Tabora',
 'x1_Tanga',
 'x2_District Codes 1-4',
 'x2_Other Districts',
 'x3_False',
 'x3_True',
 'x4_GeoData Consultants Ltd',
 'x5_Other',
 'x5_VWC',
 'x6_False',
 'x6_True',
 'x7_gravity',
 'x7_other',
 'x8_afridev',
 'x8_gravity',
 'x8_india mark ii',
 'x8_india mark iii',
 'x8_mono',
 'x8_nira/tanira',
 'x8_other',
 'x8_other handpump',
 'x8_other motorpump',
 'x8_rope pump',
 'x8_submersible',
 'x8_swn 80',
 'x8_wind-powered',
 'x9_gravity',
 'x9_ha

In [100]:
# Assigning numerical values and storing in another column
from sklearn.preprocessing import LabelEncoder

# creating initial dataframe 
df_Y_categorical = pd.DataFrame(Y_categorical, columns=['status_group'])

# creating instance of labelencoder
labelencoder = LabelEncoder()

df_Y_categorical['status_group_cat'] = labelencoder.fit_transform(df_Y_categorical['status_group'])
print(df_Y_categorical.groupby(['status_group_cat','status_group'])['status_group'].count())
print('')

# summarize the dataset
#from collections import Counter

print('Input X: ',df_X_transform_onehot.shape)
print('Target Y: ', df_y_transform2.shape)
#print(Counter(df_y_transform))

status_group_cat  status_group           
0                 functional                 32259
1                 functional needs repair     4317
2                 non functional             22824
Name: status_group, dtype: int64

Input X:  (59400, 94)
Target Y:  (59400,)


### Multinomial Logistic Regression

In [None]:
# evaluate multinomial logistic regression model
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression

# define dataset
#X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, n_classes=3, random_state=1)

# define the multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# we will evaluate the model using repeated k-fold cross-validation, with three repeats and 10 folds.
# define the model evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=1)

# evaluate the model and collect the scores
n_scores = cross_val_score(model, df_X_transform, df_y_transform, scoring='accuracy', cv=cv, n_jobs=-1)

# report the model performance
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# fit the model on the whole dataset
model.fit(df_X_transform, df_y_transform)

# define a single row of input data
# need to recode some variables
row = df_X_transform.head(1)
#row = df_test_clean_transform.head(1)

# predict the class label
yhat = model.predict(row)

# summarize the predicted class
print('Actual Class: %d' % df_y_transform[0])
print('Predicted Class: %d' % yhat[0])

# predict a multinomial probability distribution
yhat = model.predict_proba(row)

# summarize the predicted probabilities
print('Predicted Probabilities: %s' % yhat[0])

In [105]:
# Checking Model Validation
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

print('F1 score: ', f1_score(df_y_transform[0], yhat[0]))
print('Accuracy: ', accuracy_score(df_y_transform[0], yhat[0]))

TypeError: Singleton array 0 cannot be considered a valid collection.

## Random Forest for Classification

In [88]:
# evaluate random forest ensemble for regression
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestClassifier

# define dataset
#X, y = make_regression(n_samples=1000, n_features=20, n_informative=15, noise=0.1, random_state=2)

# define the model
model = RandomForestClassifier()

# evaluate the model
#We will evaluate the model using repeated k-fold cross-validation, with three repeats and 10 folds.
cv = RepeatedKFold(n_splits=2, n_repeats=1, random_state=1)

# evaluate the model and collect the scores
n_scores = cross_val_score(model, df_X_transform, df_y_transform, scoring='accuracy', cv=cv, n_jobs=-1)

# report the model performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# fit the model on the whole dataset
model.fit(df_X_transform, df_y_transform)

# define a single row of input data
# need to recode some variables
row = df_X_transform.head(1)
#row = df_test_clean_transform.head(1)

# predict the class label
yhat = model.predict(row)

# summarize the predicted class
print('Actual Class: %d' % df_y_transform[0])
print('Predicted Class: %d' % yhat[0])

# predict a multinomial probability distribution
yhat = model.predict_proba(row)

# summarize the predicted probabilities
print('Predicted Probabilities: %s' % yhat[0])

Accuracy: 0.775 (0.001)
Actual Class: 0
Predicted Class: 0
Predicted Probabilities: [0.98 0.02 0.  ]


## Decision Tree

In [89]:
# evaluate multioutput regression model with k-fold cross-validation
from numpy import absolute
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

# create datasets
#X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=2, random_state=1, noise=0.5)

# define model
model = DecisionTreeClassifier()

# define the evaluation procedure
#We will evaluate the model using repeated k-fold cross-validation, with three repeats and 10 folds.
cv = RepeatedKFold(n_splits=2, n_repeats=1, random_state=1)

# evaluate the model and collect the scores
n_scores = cross_val_score(model, df_X_transform, df_y_transform, scoring='accuracy', cv=cv, n_jobs=-1)

# force the scores to be positive
n_scores = absolute(n_scores)

# summarize performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# define model
model = DecisionTreeClassifier()

# fit the model on the whole dataset
model.fit(df_X_transform, df_y_transform)

# define a single row of input data
# need to recode some variables
row = df_X_transform.head(1)
#row = df_test_clean_transform.head(1)

# predict the class label
yhat = model.predict(row)

# summarize the predicted class
print('Actual Class: %d' % df_y_transform[0])
print('Predicted Class: %d' % yhat[0])

# predict a multinomial probability distribution
yhat = model.predict_proba(row)

# summarize the predicted probabilities
print('Predicted Probabilities: %s' % yhat[0])

Accuracy: 0.733 (0.002)
Actual Class: 0
Predicted Class: 0
Predicted Probabilities: [1. 0. 0.]


## References:

1. Get Dummies: https://www.dataindependent.com/pandas/pandas-get-dummies/
2. One-Hot Encoding: https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/
3. One-Hot Encoding on Categorical Data: https://towardsdatascience.com/categorical-encoding-using-label-encoding-and-one-hot-encoder-911ef77fb5bd
4. Multinomial Logistic Regression: https://machinelearningmastery.com/multinomial-logistic-regression-with-python/
5. Random Forest: https://machinelearningmastery.com/random-forest-ensemble-in-python/
6. Decision Tree: https://machinelearningmastery.com/multi-output-regression-models-with-python/
