In [86]:
# Importing necessary libraries
import pandas as pd
from myfunctions import set_importer, scores, data_preparation, data_cleaner
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix, train_test_split
from sklearn.ensemble import RandomForestClassifier

sns.set_style("whitegrid")
%matplotlib inline

# setting font size for all plots
font = {'size'   : 16}

plt.rc('font', **font)

# The best Model

After experimenting with various models, we found that a random forest classifier performed best.

In [75]:
class myRandomForestClassifier:
    """Random forest classifier bundled with the project's cleaning/preparation step.

    ``fit`` accepts raw features and labels and runs them through
    ``Xy_cleanprep`` before training. ``predict`` does NOT clean or encode its
    input: it only selects ``FEATURE_COLUMNS`` from the frame it is given, so
    callers must pass data that has already been through ``Xy_cleanprep``.
    """

    # Single source of truth for the model's input features. Both the training
    # path (Xy_cleanprep) and predict select exactly these columns, in this
    # order, so the two can no longer drift apart.
    FEATURE_COLUMNS = (
        'gps_height', 'longitude', 'latitude', 'region_code', 'district_code',
        'population', 'permit', 'construction_year', 'basin',
        'extraction_type_class', 'payment', 'quality_group', 'quantity_group',
        'source_class', 'waterpoint_type_group',
    )

    def __init__(self):
        """Create the underlying RandomForestClassifier with fixed hyperparameters."""
        self.rf_classifier = RandomForestClassifier(
            criterion='gini',
            random_state=21,
            min_samples_split=70,
            min_samples_leaf=100,
            n_estimators=300,
        )

    def Xy_cleanprep(self, X, y):
        """Clean and prepare features/labels, keeping only the model's feature columns.

        Parameters
        ----------
        X : features, passed to ``data_cleaner`` together with ``y``.
        y : labels, passed to ``data_cleaner`` together with ``X``.

        Returns
        -------
        tuple
            ``(X_selected, y_prepared)`` where ``X_selected`` is the
            ordinal-encoded output of ``data_preparation`` restricted to
            ``FEATURE_COLUMNS``.
        """
        # 1. data cleaning
        X_cleaned, y_cleaned = data_cleaner(X, y)

        # 2. data preparation; the one-hot encoded variant is returned by the
        # helper but is not used by this model
        X_ordinal_encoded, _X_onehotencoded, y_prepared = data_preparation(X_cleaned, y_cleaned)

        # A tuple would be treated as a single key, so index with a list
        return X_ordinal_encoded[list(self.FEATURE_COLUMNS)], y_prepared

    def fit(self, X, y):
        """Clean/prepare ``X`` and ``y`` via ``Xy_cleanprep`` and fit the forest.

        Returns whatever ``self.rf_classifier.fit`` returns.
        """
        X_ready, y_ready = self.Xy_cleanprep(X, y)

        return self.rf_classifier.fit(X_ready, y_ready)

    def predict(self, X_true):
        """Predict labels for already-prepared features.

        ``X_true`` must contain every column in ``FEATURE_COLUMNS``; any extra
        columns are ignored. No cleaning or encoding is applied here.
        """
        return self.rf_classifier.predict(X_true[list(self.FEATURE_COLUMNS)])



In [76]:
# Load the analysis data frame from CSV through the project's set_importer helper
df = set_importer('../analysis-dfs/df.csv')
# Preview the first ten rows as this cell's output
df.head(10)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
1,19816,0.0,2012-10-01,Dwsp,0,DWSP,33.36241,-3.766365,Kwa Ngomho,0,Internal,Ishinabulandi,Shinyanga,17,3,Shinyanga Rural,Samuye,0,True,GeoData Consultants Ltd,VWC,,True,0,swn 80,swn 80,handpump,vwc,user-group,never pay,never pay,soft,good,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump,non functional
2,54551,0.0,2012-10-09,Rwssp,0,DWE,32.620617,-4.226198,Tushirikiane,0,Lake Tanganyika,Nyawishi Center,Shinyanga,17,3,Kahama,Chambo,0,True,GeoData Consultants Ltd,,,True,0,nira/tanira,nira/tanira,handpump,wug,user-group,unknown,unknown,milky,milky,enough,enough,shallow well,shallow well,groundwater,hand pump,hand pump,non functional
3,53934,0.0,2012-11-03,Wateraid,0,Water Aid,32.7111,-5.146712,Kwa Ramadhan Musa,0,Lake Tanganyika,Imalauduki,Tabora,14,6,Tabora Urban,Itetemia,0,True,GeoData Consultants Ltd,VWC,,True,0,india mark ii,india mark ii,handpump,vwc,user-group,never pay,never pay,salty,salty,seasonal,seasonal,machine dbh,borehole,groundwater,hand pump,hand pump,non functional
4,48451,500.0,2011-07-04,Unicef,1703,DWE,34.642439,-9.106185,Kwa John Mtenzi,0,Rufiji,Kidudumo,Iringa,11,4,Njombe,Mdandu,35,True,GeoData Consultants Ltd,WUA,wanging'ombe water supply s,True,1978,gravity,gravity,gravity,wua,user-group,pay monthly,monthly,soft,good,dry,dry,river,river/lake,surface,communal standpipe,communal standpipe,non functional
5,58155,0.0,2011-09-04,Unicef,1656,DWE,34.569266,-9.085515,Kwa Rose Chaula,0,Rufiji,Yeriko,Iringa,11,4,Njombe,Usuka,50,True,GeoData Consultants Ltd,WUA,wanging'ombe water supply s,True,1978,gravity,gravity,gravity,wua,user-group,pay when scheme fails,on failure,soft,good,dry,dry,river,river/lake,surface,communal standpipe,communal standpipe,non functional
6,34169,0.0,2011-07-22,Hesawa,1162,DWE,32.920154,-1.947868,Ngomee,0,Lake Victoria,Center,Mwanza,19,1,Ukerewe,Ilangala,1000,,GeoData Consultants Ltd,,,True,1999,other,other,other,vwc,user-group,never pay,never pay,milky,milky,insufficient,insufficient,spring,spring,groundwater,other,other,functional needs repair
7,58500,0.0,2011-10-04,Unicef,1510,DWE,34.586901,-8.980014,Shuleni,0,Rufiji,Mkanivega,Iringa,11,4,Njombe,Usuka,350,True,GeoData Consultants Ltd,WUA,wanging'ombe water supply s,True,1978,gravity,gravity,gravity,wua,user-group,never pay,never pay,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,functional needs repair
8,22308,0.0,2013-02-25,Government Of Tanzania,1273,DWE,37.422751,-3.317536,Kwa Tukai,0,Pangani,Kitereni,Kilimanjaro,3,4,Moshi Rural,Kimochi,140,True,GeoData Consultants Ltd,VWC,Komaka mandaka,True,1974,gravity,gravity,gravity,vwc,user-group,unknown,unknown,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional needs repair
9,19685,0.0,2013-03-05,Government Of Tanzania,1443,District council,37.611126,-3.263526,Kwa Kibakaya,0,Pangani,Kiyao,Kilimanjaro,3,1,Rombo,Mengwe Manda,1,True,GeoData Consultants Ltd,Company,Kitukuni water supply,True,2000,gravity,gravity,gravity,company,commercial,pay monthly,monthly,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,non functional


In [77]:
# Separate the target column from the features, then hold out 30% of the rows for testing
y = df['status_group']
X = df.drop(columns='status_group')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21
)

In [78]:
# Build the wrapper and fit it on the raw training split;
# fit() runs X_train/y_train through Xy_cleanprep before training the forest
clf = myRandomForestClassifier()
clf.fit(X_train, y_train)

In [79]:
# predict() does not clean or encode its input, so run the test split through
# the same Xy_cleanprep step that fit() applied to the training split
# NOTE(review): this re-runs data_preparation on the test split alone; if that
# helper fits encoders on its input, confirm the encodings match training
X_test_ready, y_test_ready = clf.Xy_cleanprep(X_test, y_test)

In [85]:
# Predict labels for the prepared test features
y_test_rf_preds = clf.predict(X_test_ready)
# Report the random forest's metrics on the test set (true labels first, predictions second)
scores(y_test_ready, y_test_rf_preds)

The precision score is:	 0.8478688524590164
The recall score is:	 0.9998453448809156
The accuracy score is:	 0.8477576711250984
The f1 score is:	 0.9176069831807537


## Interpreting the final model

* Precision-Score - If the model assigns 1, there is an 85% chance that the water pump is actually non-functional.
* Recall-Score - If a given water pump is actually 1 (non-functional), there is a 99.98% chance that this model will label it as 1 (non-functional), and a 0.02% chance that it will incorrectly label it as 0 (functional but needs repair).
* Accuracy-Score - The model assigns the correct label to 84.8% of the water pumps in the test set. (0.8477576711250984)