In [1]:
# Import Libraries
import pandas as pd
import numpy as np

# plotting
#import matplotlib.pyplot as plt
#import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

### Test Data

In [2]:
#Test Data
df_test = pd.read_csv("PumpItUp/raw_data/702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv") 
df_test.head(1)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other


In [3]:
# Are there any columns with missing values?
missing_fields = df_test.columns[df_test.isnull().any()]
missing_fields

Index(['funder', 'installer', 'subvillage', 'public_meeting',
       'scheme_management', 'scheme_name', 'permit'],
      dtype='object')

In [4]:
construction_year_check = df_test[df_test['construction_year'] == 0]
construction_year_check.shape

(5260, 40)

In [5]:
print("Min value: ", df_test['construction_year'].min())
print("Max value: ", df_test['construction_year'].max())
print("Mean value: ", df_test['construction_year'].mean())
print("Median value: ", df_test['construction_year'].median())
print("Mode value: ", df_test['construction_year'].mode())
print("Std value: ", df_test['construction_year'].std())

Min value:  0
Max value:  2013
Mean value:  1289.70835016835
Median value:  1986.0
Mode value:  0    0
dtype: int64
Std value:  955.2410869700427


In [6]:
df_test['construction_year']=df_test['construction_year'].replace(0,df_test['construction_year'].median())

In [7]:
def impute_missing(df):
    
    df['public_meeting'] = df['public_meeting'].fillna(True)
    df['permit'] = df['permit'].fillna(True)
    
    return df

In [8]:
df_test = impute_missing(df_test)

In [9]:
# Recoding some of the categorical fields

def recode(df):
    # Regrouping specific independent variables
    df['district_code_recoded'] = np.where(df['district_code'] <= 4, 'District Codes 1-4', 'Other Districts')

    df['scheme_management_recoded'] = np.where(df['scheme_management'] == 'VWC', 'VWC', 'Other')

    df['extraction_type_recoded'] = np.where(df['extraction_type'] == 'gravity', 'gravity', 'other')
    
    df['management_recoded'] = np.where(df['management'] == 'vwc', 'vwc', 'other')
    df['management_group_recoded'] = np.where(df['management_group'] == 'user-group', 'user-group', 'other')

    df['payment_recoded'] = np.where(df['payment'] == 'never pay', 'never pay', 'other') #repetition ??
    df['payment_type_recoded'] = np.where(df['payment_type'] == 'never pay', 'never pay', 'other') #repetition ??

    df['water_quality_recoded'] = np.where(df['water_quality'] == 'soft', 'soft', 'other') #repetition ??
    df['quality_group_recoded'] = np.where(df['quality_group'] == 'good', 'good', 'other') #repetition ??

    df['source_recoded'] = np.where(~df['source'].isin(['shallow well','spring']), 'other', df['source']) #repetition ??
    df['source_type_recoded'] = np.where(~df['source_type'].isin(['shallow well','spring','borehole','river/lake']), 'other', df['source_type']) #repetition ??
    df['source_class_recoded'] = np.where(~df['source_class'].isin(['groundwater','surface']), 'other', df['source_class'])

    df['waterpoint_type_recoded'] = np.where(~df['waterpoint_type'].isin(['communal standpipe','hand pump']), 'other', df['waterpoint_type']) #repetition ??
    df['waterpoint_type_group_recoded'] = np.where(~df['waterpoint_type_group'].isin(['communal standpipe','hand pump']), 'other', df['waterpoint_type_group']) #repetition ??

    df['quantity_recoded'] = np.where(~df['quantity'].isin(['enough','insufficient']), 'other', df['quantity']) #repetition ??
    df['quantity_group_recoded'] = np.where(~df['quantity_group'].isin(['enough','insufficient']), 'other', df['quantity_group']) #repetition ??

    return df

In [10]:
# Run the function above
df_test=recode(df_test)

In [11]:
# Rearrange the columns prior to saving it
columnsTitles = ['amount_tsh','gps_height','num_private','basin','region',
           'district_code_recoded','population','public_meeting','recorded_by',
           'scheme_management_recoded','permit','construction_year',
           'extraction_type_recoded','extraction_type_group','extraction_type_class',
           'management_recoded','management_group_recoded',
           'payment_recoded',
           'water_quality_recoded',
           'source_recoded','source_type_recoded','source_class_recoded',
           'waterpoint_type_recoded','waterpoint_type_group_recoded',
           'quantity_recoded']

df_test_clean = df_test.reindex(columns=columnsTitles)

In [12]:
#Save Clean dataset as a pickle >> this reduces any issues that normally arise with csv files!
df_test_clean.to_pickle('test_clean.pickle')

In [13]:
# Load Test Data:
df_test_clean = pd.read_pickle("test_clean.pickle")
df_test_clean.head()

Unnamed: 0,amount_tsh,gps_height,num_private,basin,region,district_code_recoded,population,public_meeting,recorded_by,scheme_management_recoded,...,management_recoded,management_group_recoded,payment_recoded,water_quality_recoded,source_recoded,source_type_recoded,source_class_recoded,waterpoint_type_recoded,waterpoint_type_group_recoded,quantity_recoded
0,0.0,1996,0,Internal,Manyara,District Codes 1-4,321,True,GeoData Consultants Ltd,Other,...,other,other,never pay,soft,other,other,surface,other,other,other
1,0.0,1569,0,Pangani,Arusha,District Codes 1-4,300,True,GeoData Consultants Ltd,VWC,...,vwc,user-group,never pay,soft,spring,spring,groundwater,communal standpipe,communal standpipe,insufficient
2,0.0,1567,0,Internal,Singida,District Codes 1-4,500,True,GeoData Consultants Ltd,VWC,...,vwc,user-group,never pay,soft,other,other,surface,other,other,insufficient
3,0.0,267,0,Ruvuma / Southern Coast,Lindi,Other Districts,250,True,GeoData Consultants Ltd,VWC,...,vwc,user-group,other,soft,shallow well,shallow well,groundwater,other,other,other
4,500.0,1260,0,Ruvuma / Southern Coast,Ruvuma,District Codes 1-4,60,True,GeoData Consultants Ltd,Other,...,other,user-group,other,soft,spring,spring,groundwater,communal standpipe,communal standpipe,enough


## Model Building - using One-Hot Encoding

### Training Data

In [14]:
# Load Data:
df = pd.read_pickle("train_clean.pickle")
df.head()

Unnamed: 0,amount_tsh,gps_height,num_private,basin,region,district_code_recoded,population,public_meeting,recorded_by,scheme_management_recoded,...,management_group_recoded,payment_recoded,water_quality_recoded,source_recoded,source_type_recoded,source_class_recoded,waterpoint_type_recoded,waterpoint_type_group_recoded,quantity_recoded,status_group
0,6000.0,1390,0,Lake Nyasa,Iringa,Other Districts,109,True,GeoData Consultants Ltd,VWC,...,user-group,other,soft,spring,spring,groundwater,communal standpipe,communal standpipe,enough,functional
1,0.0,1399,0,Lake Victoria,Mara,District Codes 1-4,280,True,GeoData Consultants Ltd,Other,...,user-group,never pay,soft,other,other,surface,communal standpipe,communal standpipe,insufficient,functional
2,25.0,686,0,Pangani,Manyara,District Codes 1-4,250,True,GeoData Consultants Ltd,VWC,...,user-group,other,soft,other,other,surface,other,communal standpipe,enough,functional
3,0.0,263,0,Ruvuma / Southern Coast,Mtwara,Other Districts,58,True,GeoData Consultants Ltd,VWC,...,user-group,never pay,soft,other,borehole,groundwater,other,communal standpipe,other,non functional
4,0.0,0,0,Lake Victoria,Kagera,District Codes 1-4,0,True,GeoData Consultants Ltd,Other,...,other,never pay,soft,other,other,surface,communal standpipe,communal standpipe,other,functional


In [15]:
#df = df.drop('Unnamed: 0',axis=1)
#df.head()

In [16]:
# Any missing values?
df.columns[df.isnull().any()]

Index([], dtype='object')

In [17]:
# Change object data type into character data type
cols_num = ['amount_tsh','gps_height','num_private','population','construction_year']

cols_cat = ['basin', 'region', 'district_code_recoded','public_meeting','recorded_by',
               'scheme_management_recoded','permit','extraction_type_recoded','extraction_type_group',
               'extraction_type_class','management_recoded','management_group_recoded',
               'payment_recoded','water_quality_recoded','source_recoded','source_type_recoded',
               'source_class_recoded','waterpoint_type_recoded','waterpoint_type_group_recoded',
               'quantity_recoded','status_group']

df[cols_cat] = df[cols_cat].astype('str')

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   amount_tsh                     59400 non-null  float64
 1   gps_height                     59400 non-null  int64  
 2   num_private                    59400 non-null  int64  
 3   basin                          59400 non-null  object 
 4   region                         59400 non-null  object 
 5   district_code_recoded          59400 non-null  object 
 6   population                     59400 non-null  int64  
 7   public_meeting                 59400 non-null  object 
 8   recorded_by                    59400 non-null  object 
 9   scheme_management_recoded      59400 non-null  object 
 10  permit                         59400 non-null  object 
 11  construction_year              59400 non-null  int64  
 12  extraction_type_recoded        59400 non-null 

In [18]:
df[cols_cat].head()

Unnamed: 0,basin,region,district_code_recoded,public_meeting,recorded_by,scheme_management_recoded,permit,extraction_type_recoded,extraction_type_group,extraction_type_class,...,management_group_recoded,payment_recoded,water_quality_recoded,source_recoded,source_type_recoded,source_class_recoded,waterpoint_type_recoded,waterpoint_type_group_recoded,quantity_recoded,status_group
0,Lake Nyasa,Iringa,Other Districts,True,GeoData Consultants Ltd,VWC,False,gravity,gravity,gravity,...,user-group,other,soft,spring,spring,groundwater,communal standpipe,communal standpipe,enough,functional
1,Lake Victoria,Mara,District Codes 1-4,True,GeoData Consultants Ltd,Other,True,gravity,gravity,gravity,...,user-group,never pay,soft,other,other,surface,communal standpipe,communal standpipe,insufficient,functional
2,Pangani,Manyara,District Codes 1-4,True,GeoData Consultants Ltd,VWC,True,gravity,gravity,gravity,...,user-group,other,soft,other,other,surface,other,communal standpipe,enough,functional
3,Ruvuma / Southern Coast,Mtwara,Other Districts,True,GeoData Consultants Ltd,VWC,True,other,submersible,submersible,...,user-group,never pay,soft,other,borehole,groundwater,other,communal standpipe,other,non functional
4,Lake Victoria,Kagera,District Codes 1-4,True,GeoData Consultants Ltd,Other,True,gravity,gravity,gravity,...,other,never pay,soft,other,other,surface,communal standpipe,communal standpipe,other,functional


In [19]:
df_X_transform1 = df[cols_num]

print("Rows & Columns: ", df_X_transform1.shape)

df_X_transform1.head()

Rows & Columns:  (59400, 5)


Unnamed: 0,amount_tsh,gps_height,num_private,population,construction_year
0,6000.0,1390,0,109,1999
1,0.0,1399,0,280,2010
2,25.0,686,0,250,2009
3,0.0,263,0,58,1986
4,0.0,0,0,0,1986


In [20]:
df_X_transform1.describe()

Unnamed: 0,amount_tsh,gps_height,num_private,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0
mean,317.650385,668.297239,0.474141,179.909983,1993.044293
std,2997.574558,693.11635,12.23623,471.482176,11.30841
min,0.0,-90.0,0.0,0.0,1960.0
25%,0.0,0.0,0.0,0.0,1986.0
50%,0.0,369.0,0.0,25.0,1986.0
75%,20.0,1319.25,0.0,215.0,2004.0
max,350000.0,2770.0,1776.0,30500.0,2013.0


In [21]:
df_X_transform2 = df[cols_cat]
df_X_transform2 = df_X_transform2.drop('status_group', axis=1)

print("Rows & Columns: ", df_X_transform2.shape)

df_X_transform2.head()

Rows & Columns:  (59400, 20)


Unnamed: 0,basin,region,district_code_recoded,public_meeting,recorded_by,scheme_management_recoded,permit,extraction_type_recoded,extraction_type_group,extraction_type_class,management_recoded,management_group_recoded,payment_recoded,water_quality_recoded,source_recoded,source_type_recoded,source_class_recoded,waterpoint_type_recoded,waterpoint_type_group_recoded,quantity_recoded
0,Lake Nyasa,Iringa,Other Districts,True,GeoData Consultants Ltd,VWC,False,gravity,gravity,gravity,vwc,user-group,other,soft,spring,spring,groundwater,communal standpipe,communal standpipe,enough
1,Lake Victoria,Mara,District Codes 1-4,True,GeoData Consultants Ltd,Other,True,gravity,gravity,gravity,other,user-group,never pay,soft,other,other,surface,communal standpipe,communal standpipe,insufficient
2,Pangani,Manyara,District Codes 1-4,True,GeoData Consultants Ltd,VWC,True,gravity,gravity,gravity,vwc,user-group,other,soft,other,other,surface,other,communal standpipe,enough
3,Ruvuma / Southern Coast,Mtwara,Other Districts,True,GeoData Consultants Ltd,VWC,True,other,submersible,submersible,vwc,user-group,never pay,soft,other,borehole,groundwater,other,communal standpipe,other
4,Lake Victoria,Kagera,District Codes 1-4,True,GeoData Consultants Ltd,Other,True,gravity,gravity,gravity,other,other,never pay,soft,other,other,surface,communal standpipe,communal standpipe,other


In [22]:
df_y_transform = df[cols_cat]
df_y_transform = df_y_transform['status_group']

print("Rows & Columns: ", df_y_transform.shape)

df_y_transform.head()

Rows & Columns:  (59400,)


0        functional
1        functional
2        functional
3    non functional
4        functional
Name: status_group, dtype: object

In [23]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# one hot encode input variables
onehot_encoder = OneHotEncoder(sparse=False)
df_X_transform2 = onehot_encoder.fit_transform(df_X_transform2)

# one hot encode target variable
label_encoder = LabelEncoder()
df_y_transform = label_encoder.fit_transform(df_y_transform)

# summarize the transformed data
print('Input Categorical Xs', df_X_transform2.shape)
print(df_X_transform2[:5, :])

print('Target Y', df_y_transform.shape)
print(df_y_transform[:,])

Input Categorical Xs (59400, 89)
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1.
  0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0.
  0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 0.
  0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 

In [24]:
df_X_transform1.shape

(59400, 5)

In [25]:
df_X_transform2.shape

(59400, 89)

In [26]:
# Converting the array into a pandas dataframe
# Create a Pandas DataFrame of the hot encoded columns
df_X_transform2_ = pd.DataFrame(df_X_transform2, columns=onehot_encoder.get_feature_names())

#concat with original data
df_X_transform = pd.concat([df_X_transform1, df_X_transform2_], axis=1)

In [27]:
df_X_transform2_.shape

(59400, 89)

In [28]:
print('Input All Xs', df_X_transform.shape)
df_X_transform.head()

Input All Xs (59400, 94)


Unnamed: 0,amount_tsh,gps_height,num_private,population,construction_year,x0_Internal,x0_Lake Nyasa,x0_Lake Rukwa,x0_Lake Tanganyika,x0_Lake Victoria,...,x16_surface,x17_communal standpipe,x17_hand pump,x17_other,x18_communal standpipe,x18_hand pump,x18_other,x19_enough,x19_insufficient,x19_other
0,6000.0,1390,0,109,1999,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1399,0,280,2010,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,25.0,686,0,250,2009,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,263,0,58,1986,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0,0,0,1986,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [29]:
# summarize the dataset
from collections import Counter

print(df_X_transform.shape, df_y_transform.shape)
print(Counter(df_y_transform))

(59400, 94) (59400,)
Counter({0: 32259, 2: 22824, 1: 4317})


In [30]:
# Assigning numerical values and storing in another column
from sklearn.preprocessing import LabelEncoder

# creating initial dataframe
y_cat = df['status_group'] 
df_y_cat = pd.DataFrame(y_cat, columns=['status_group'])

# creating instance of labelencoder
labelencoder = LabelEncoder()

df_y_cat['status_group_cat'] = labelencoder.fit_transform(df_y_cat['status_group'])
print(df_y_cat.groupby(['status_group_cat','status_group'])['status_group'].count())
print('')

# summarize the dataset
#from collections import Counter

print('Input X: ',df_X_transform.shape)
print('Target Y: ', df_y_transform.shape)
#print(Counter(df_y_transform))

status_group_cat  status_group           
0                 functional                 32259
1                 functional needs repair     4317
2                 non functional             22824
Name: status_group, dtype: int64

Input X:  (59400, 94)
Target Y:  (59400,)


### Test Data - Transformed

In [31]:
# Transforming the test data...

# Separate the Numerical & Categorical fields
cols_num = ['amount_tsh','gps_height','num_private','population','construction_year']

cols_cat = ['basin', 'region', 'district_code_recoded','public_meeting','recorded_by',
               'scheme_management_recoded','permit','extraction_type_recoded','extraction_type_group',
               'extraction_type_class','management_recoded','management_group_recoded',
               'payment_recoded','water_quality_recoded','source_recoded','source_type_recoded',
               'source_class_recoded','waterpoint_type_recoded','waterpoint_type_group_recoded',
               'quantity_recoded']

df_test_clean1 = df_test_clean[cols_num]
df_test_clean2 = df_test_clean[cols_cat]

In [32]:
from sklearn.preprocessing import OneHotEncoder

# one hot encode input variables
onehot_encoder = OneHotEncoder(sparse=False)
df_test_clean2 = onehot_encoder.fit_transform(df_test_clean2)

In [33]:
# Converting the array into a pandas dataframe
# Create a Pandas DataFrame of the hot encoded columns
df_test_clean2_ = pd.DataFrame(df_test_clean2, columns=onehot_encoder.get_feature_names())
#concat with original data
df_test_clean_transform = pd.concat([df_test_clean1, df_test_clean2_], axis=1)

print('Input All Test Xs', df_test_clean_transform.shape)
df_test_clean_transform.head()

Input All Test Xs (14850, 94)


Unnamed: 0,amount_tsh,gps_height,num_private,population,construction_year,x0_Internal,x0_Lake Nyasa,x0_Lake Rukwa,x0_Lake Tanganyika,x0_Lake Victoria,...,x16_surface,x17_communal standpipe,x17_hand pump,x17_other,x18_communal standpipe,x18_hand pump,x18_other,x19_enough,x19_insufficient,x19_other
0,0.0,1996,0,321,2012,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,1569,0,300,2000,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1567,0,500,2010,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,267,0,250,1987,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,500.0,1260,0,60,2000,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


### Multinomial Logistic Regression

In [33]:
# evaluate multinomial logistic regression model
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression

# define dataset
#X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, n_classes=3, random_state=1)

# define the multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# we will evaluate the model using repeated k-fold cross-validation, with three repeats and 10 folds.
# define the model evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# evaluate the model and collect the scores
n_scores = cross_val_score(model, df_X_transform, df_y_transform, scoring='accuracy', cv=cv, n_jobs=-1)

# report the model performance
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Mean Accuracy: 0.669 (0.007)


In [34]:
# make a prediction with a multinomial logistic regression model
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# define dataset
#X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, n_classes=3, random_state=1)

# define the multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# fit the model on the whole dataset
model.fit(df_X_transform, df_y_transform)

# define a single row of input data
# need to recode some variables
row = df_test_clean_transform.head(1)

# predict the class label
yhat = model.predict(row)

# summarize the predicted class
print('Predicted Class: %d' % yhat[0])

# predict a multinomial probability distribution
yhat = model.predict_proba(row)

# summarize the predicted probabilities
print('Predicted Probabilities: %s' % yhat[0])

Predicted Class: 2
Predicted Probabilities: [0.07352647 0.02740003 0.8990735 ]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## Random Forest for Regression

We will evaluate the model using repeated k-fold cross-validation, with three repeats and 10 folds. We will report the **mean absolute error (MAE)** of the model across all repeats and folds. The scikit-learn library makes the MAE negative so that it is maximized instead of minimized. This means that larger negative MAE are better and a perfect model has a MAE of 0.

In [35]:
# evaluate random forest ensemble for regression
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestRegressor

# define dataset
#X, y = make_regression(n_samples=1000, n_features=20, n_informative=15, noise=0.1, random_state=2)

# define the model
model = RandomForestRegressor()

# evaluate the model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, df_X_transform, df_y_transform, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')

# report performance
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: -0.453 (0.006)


In [40]:
# random forest for making predictions for regression
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

# define dataset
#X, y = make_regression(n_samples=1000, n_features=20, n_informative=15, noise=0.1, random_state=2)

# define the model
model = RandomForestRegressor()

# fit the model on the whole dataset
model.fit(df_X_transform, df_y_transform)

# make a single prediction
row = df_test_clean_transform.head(1)

yhat = model.predict(row)
print('Prediction: %d' % yhat[0])

Prediction: 0


## Decision Tree

In [41]:
# evaluate multioutput regression model with k-fold cross-validation
from numpy import absolute
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

# create datasets
#X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=2, random_state=1, noise=0.5)

# define model
model = DecisionTreeRegressor()

# define the evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# evaluate the model and collect the scores
n_scores = cross_val_score(model, df_X_transform, df_y_transform, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

# force the scores to be positive
n_scores = absolute(n_scores)

# summarize performance
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: 0.447 (0.008)


In [43]:
# decision tree for multioutput regression
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor

# create datasets
#X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=2, random_state=1, noise=0.5)

# define model
model = DecisionTreeRegressor()

# fit model
model.fit(df_X_transform, df_y_transform)

# make a prediction
row = df_test_clean_transform.head(1)

yhat = model.predict(row)

# summarize prediction
print(yhat[0])

0.0


## References:

1. One-Hot Encoding: https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/
2. One-Hot Encoding on Categorical Data: https://towardsdatascience.com/categorical-encoding-using-label-encoding-and-one-hot-encoder-911ef77fb5bd
3. Multinomial Logistic Regression: https://machinelearningmastery.com/multinomial-logistic-regression-with-python/
4. Random Forest: https://machinelearningmastery.com/random-forest-ensemble-in-python/
5. Decision Tree: https://machinelearningmastery.com/multi-output-regression-models-with-python/
