In [399]:
#import relevant libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs                          
from catboost import CatBoostClassifier
from sklearn.preprocessing import RobustScaler
from category_encoders import WOEEncoder
import category_encoders as ce
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [400]:
#load data to dataframes
df_train_values=pd.read_csv("Data/train_set_values.csv") 
df_train_labels=pd.read_csv("Data/train_set_labels.csv") 
df_test_values=pd.read_csv("data/test_set_values.csv")

In [401]:
#concatinating train values and labels
df_train_values.set_index('id',inplace=True)
df_train_labels.set_index('id',inplace=True)
df_train = pd.merge(df_train_labels, df_train_values, how = 'inner', left_index = True,right_index=True)
df_train.reset_index(inplace=True)

In [402]:
#concatinating train and test values
df=pd.concat([df_train, df_test_values]).reset_index(drop=True)

In [403]:
df

Unnamed: 0,id,status_group,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,functional,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,functional,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,functional,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,non functional,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,functional,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74245,39307,,0.0,2011-02-24,Danida,34,Da,38.852669,-6.582841,Kwambwezi,...,never pay,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe
74246,18990,,1000.0,2011-03-21,Hiap,0,HIAP,37.451633,-5.350428,Bonde La Mkondoa,...,annually,salty,salty,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump
74247,28749,,0.0,2013-03-04,,1476,,34.739804,-4.585587,Bwawani,...,never pay,soft,good,insufficient,insufficient,dam,dam,surface,communal standpipe,communal standpipe
74248,33492,,0.0,2013-02-18,Germany,998,DWE,35.432732,-10.584159,Kwa John,...,never pay,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe


In [404]:
#selected features  after data preprocessing & feature selection
features = ['amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer',
       'longitude', 'latitude', 'basin', 'region',
       'region_code', 'district_code', 'lga', 'ward', 'population',
       'public_meeting', 'scheme_management', 'permit',
       'construction_year', 'extraction_type_group', 'management', 'payment', 'water_quality', 'quantity', 'source',
       'waterpoint_type']

In [405]:
df=df[features2]

In [406]:
df.isna().sum().sort_values(ascending=False)

installer                4532
funder                   4504
public_meeting           4155
permit                   3793
amount_tsh                  0
source                      0
quantity                    0
water_quality               0
payment                     0
management                  0
extraction_type_group       0
construction_year           0
population                  0
ward                        0
lga                         0
district_code               0
region                      0
basin                       0
latitude                    0
longitude                   0
gps_height                  0
waterpoint_type             0
dtype: int64

In [407]:
#get numerical columns
numerical_cols = list(df.select_dtypes('number').columns)
numerical_cols

['amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'district_code',
 'population',
 'construction_year']

In [408]:
#get categorical columns
categorical_cols = list(df.select_dtypes('object').columns)
categorical_cols

['funder',
 'installer',
 'basin',
 'region',
 'lga',
 'ward',
 'public_meeting',
 'permit',
 'extraction_type_group',
 'management',
 'payment',
 'water_quality',
 'quantity',
 'source',
 'waterpoint_type']

In [409]:
#fill missing values in the permit column
df['permit'].fillna(value=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [410]:
#fill missing values in the public meeting column
df['public_meeting'].fillna(value=True,inplace=True)

In [411]:
#fill missing values in the numerical columns using mean values
for col in numerical_cols:
    df[col].fillna(df[col].mean(), inplace=True)

In [412]:
#fill missing values in the categorical columns by using Unknown
for col in (categorical_cols):
    df[col].fillna(value='Unknown', inplace=True)

In [413]:
# Replacing spelling mistakes
df['installer'].replace(to_replace = ('Unisef','UNICEF'),value ='Unicef' , inplace=True)
df['installer'].replace(to_replace = 'DANID', value ='DANIDA' , inplace=True)
df['installer'].replace(to_replace = ( 'villager', 'Villagers','villigers', 'Villa', 'Village', 'Villi', 
                                      'Village Council','Village Counil',  'Vill','Villages', 'Village community', 
                                      'Villaers', 'Village Community', 'Villag','Villege Council', 'Village council',
                                      'Village Technician','Village  Council','Villagerd', 'Villager',
                                      'Village Office','Village community members'),
                                        value ='villagers' , inplace=True)
df['installer'].replace(to_replace =('COMMU','Commu','Communit','commu', 'COMMUNITY') ,
                                        value ='Community' , inplace=True)
df['installer'].replace(to_replace = ( 'GOVER', 'GOVERNME','GOVERNMENT', 'GOVERM','GOVERN','Gover','Gove',
                                      'Governme','Governmen' ) ,value ='Government' , inplace=True)
df['installer'].replace(to_replace = 'Hesawa' ,value ='HESAWA' , inplace=True)
df['installer'].replace(to_replace = ('Colonial Government') , value ='Colonial government' , inplace=True)
df['installer'].replace(to_replace = ('Government of Misri') , value ='Misri Government' , inplace=True)
df['installer'].replace(to_replace = ('Italy government') , value ='Italian government' , inplace=True)
df['installer'].replace(to_replace = ('British colonial government') , value ='British government' , inplace=True)
df['installer'].replace(to_replace = ('Concern /government') , value ='Concern/Government' , inplace=True)
df['installer'].replace(to_replace = ('District Water Department', 'District water depar','Distric Water Department'),
                        value ='District water department' , inplace=True)
df['installer'].replace(to_replace = ('FinW','Fini water','FINI WATER'), value ='Fini Water' , inplace=True)
df['installer'].replace(to_replace = 'JAICA', value ='Jaica' , inplace=True)
df['installer'].replace(to_replace = ( 'District COUNCIL','COUN', 'DISTRICT COUNCIL', 
                                      'District Council','District Counci','Council','Counc','District  Council','Distri'),
                                    value ='District council' , inplace=True)
df['installer'].replace(to_replace = ( 'RC Churc', 'RC CHURCH','RC','RC C','RC Ch', 'RC CH','RC church', 
                                      'RC CATHORIC',) , value ='RC Church' , inplace=True)
df['installer'].replace(to_replace = ('Tanzanian Government','Tanzania Government','Central Government',
                                       'central government','Cental Government','Cebtral Government',
                                       'Centra Government' ,'CENTRAL GOVERNMENT', 'TANZANIAN GOVERNMENT','Central govt', 'Centr', 
                                      'Tanzania government','Centra govt') , value ='Central government' , inplace=True)
df['installer'].replace(to_replace = ('World vision', 'World Division','World Vision'),
                                        value ='world vision' , inplace=True)

df['installer'].replace(to_replace = ('Cetral government /RC') , value ='RC church/Central Gover' , inplace=True)
df['installer'].replace(to_replace = ('Government /TCRS','Government/TCRS') , value ='TCRS /Government' , inplace=True)
df['installer'].replace(to_replace = ('Village Government') , value ='Village government' , inplace=True)
df['installer'].replace(to_replace = ('Government and Community') , value ='Government /Community' , inplace=True)
df['installer'].replace(to_replace = ('ADRA /Government') , value ='ADRA/Government' , inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [414]:
#check for missing values
df.isna().sum().sort_values(ascending=False)

amount_tsh               0
funder                   0
source                   0
quantity                 0
water_quality            0
payment                  0
management               0
extraction_type_group    0
construction_year        0
permit                   0
public_meeting           0
population               0
ward                     0
lga                      0
district_code            0
region                   0
basin                    0
latitude                 0
longitude                0
installer                0
gps_height               0
waterpoint_type          0
dtype: int64

In [415]:
#encoding categorical data using label encoding
labelencoder = LabelEncoder()
from category_encoders import OrdinalEncoder as oe
for col in (categorical_cols):
    df[col]=labelencoder.fit_transform(df[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col]=labelencoder.fit_transform(df[col])


In [416]:
#get training data to X & y dataframes 
X=df[:59400]
y=df_train['status_group']

In [417]:
X

Unnamed: 0,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,region,district_code,lga,...,public_meeting,permit,construction_year,extraction_type_group,management,payment,water_quality,quantity,source,waterpoint_type
0,6000.0,1548,1390,1649,34.938093,-9.856322,1,3,5,51,...,1,0,1999,1,7,2,6,1,8,1
1,0.0,522,1399,572,34.698766,-2.147466,4,9,2,103,...,1,1,2010,1,11,0,6,2,5,1
2,25.0,924,686,2318,37.460664,-3.821329,5,8,4,108,...,1,1,2009,1,7,4,6,1,0,2
3,0.0,1961,263,2035,38.486161,-11.155298,7,12,63,87,...,1,1,1986,10,7,0,6,0,3,2
4,0.0,20,0,132,31.130847,-1.825359,4,4,1,26,...,1,1,0,1,1,0,6,3,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,10.0,486,1210,226,37.169807,-3.253847,5,6,5,16,...,1,1,1999,1,9,4,6,1,8,1
59396,4700.0,196,1212,292,35.249991,-9.070629,6,3,4,91,...,1,1,1996,1,7,2,6,1,6,1
59397,0.0,1973,0,2036,34.017087,-8.750434,6,10,7,59,...,1,0,0,11,7,3,1,1,3,4
59398,0.0,992,0,1311,35.861315,-6.378573,6,2,4,11,...,1,1,0,5,7,0,6,2,7,4


In [418]:
y

0            functional
1            functional
2            functional
3        non functional
4            functional
              ...      
59395        functional
59396        functional
59397        functional
59398        functional
59399        functional
Name: status_group, Length: 59400, dtype: object

In [419]:
#train and get accuracy (using cross validation) for training set using Random Forest Classifier
def get_acc_rf(X,y):
    rf=RandomForestClassifier(criterion='gini',   
                          min_samples_split=5,
                          min_samples_leaf=1,
                          max_features='auto',
                          oob_score=False,
                          random_state=1,
                           n_jobs=-1
                          )
    score=cross_val_score(rf,X,y,cv=5)
    print("mean_score",score.mean())
    return score.mean()
get_acc_rf(X,y)

mean_score 0.8113131313131314


0.8113131313131314

In [109]:
#train and get accuracy (using cross validation) for training set using CatBoostClassifier
cat=CatBoostClassifier( verbose=False, iterations=1500,
    learning_rate=0.2053434310118264,
    random_strength=8,
    bagging_temperature=0,
    max_bin=20,
    grow_policy="Depthwise",
    min_data_in_leaf=10,
    max_depth=3,
    l2_leaf_reg=9.501510078266123e-06,
    one_hot_max_size=500,
    auto_class_weights="SqrtBalanced")

score=cross_val_score(cat,X,y,cv=5)
print("mean_score",score.mean())

mean_score 0.7792760942760942


In [420]:
#train dataset using Random Forest Classifier because it gave highest accuracy for the training set
rf=RandomForestClassifier(criterion='gini',   
                          min_samples_split=5,
                          min_samples_leaf=1,
                          max_features='auto',
                          oob_score=False,
                          random_state=1,
                           n_jobs=-1)
rf.fit(X, y)

RandomForestClassifier(min_samples_split=5, n_jobs=-1, random_state=1)

In [421]:
#evaluating using test data set
X_eval = df[59400:].copy()
pred=rf.predict(X_eval)
sub_df=pd.read_csv("Data/SubmissionFormat.csv",index_col="id")
sub_df['status_group']=pred
sub_df.to_csv("submission_file.csv")