# Ideas

Have histograms distributions for the other variables within each variable section.

Could lump together needs repair/non functional

Have variables like "installer = funder" and such. Those variables seem to be very similar.

Number of functional wells over the years, non functional wells over the years, etc.

Use SMOTE for data with no null values, all known, and no one-time value variables.

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt



In [2]:
X_test = pd.read_csv("tanzanian_water_wells/X_test.csv")
X_train = pd.read_csv("tanzanian_water_wells/X_train.csv")
y_train = pd.read_csv("tanzanian_water_wells/y_train.csv")

df = pd.concat([X_train, y_train], axis=1)

In [3]:
desc = {'amount_tsh': 'Total static head (amount water available to waterpoint)',
                    'date_recorded': 'The date the row was entered',
                    'funder': 'Who funded the well',
                    'gps_height': 'Altitude of the well',
                    'installer': 'Organization that installed the well',
                    'longitude': 'GPS coordinate',
                    'latitude': 'GPS coordinate',
                    'wpt_name': 'Name of the waterpoint if there is one',
                    'subvillage': 'Geographic location',
                    'region': 'Geographic location',
                    'region_code': 'Geographic location (coded)',
                    'district_code': 'Geographic location (coded)',
                    'lga': 'Geographic location',
                    'ward': 'Geographic location',
                    'population': 'Population around the well',
                    'public_meeting': 'True/False',
                    'recorded_by': 'Group entering this row of data',
                    'scheme_management': 'Who operates the waterpoint',
                    'scheme_name': 'Who operates the waterpoint',
                    'permit': 'If the waterpoint is permitted',
                    'construction_year': 'Year the waterpoint was constructed',
                    'extraction_type': 'The kind of extraction the waterpoint uses',
                    'extraction_type_group': 'The kind of extraction the waterpoint uses',
                    'extraction_type_class': 'The kind of extraction the waterpoint uses',
                    'management': 'How the waterpoint is managed',
                    'management_group': 'How the waterpoint is managed',
                    'payment': 'What the water costs',
                    'payment_type': 'What the water costs',
                    'water_quality': 'The quality of the water',
                    'quality_group': 'The quality of the water',
                    'quantity': 'The quantity of water',
                    'quantity_group': 'The quantity of water',
                    'source': 'The source of the water',
                    'source_type': 'The source of the water',
                    'source_class': 'The source of the water',
                    'waterpoint_type': 'The kind of waterpoint',
                    'waterpoint_type_group': 'The kind of waterpoint'}

In [4]:
# Eliminating null values

df.funder.fillna("Unknown", inplace=True)
df.installer.fillna("Unknown", inplace=True)
df.scheme_management.fillna("None", inplace=True)
df.permit.fillna('Unknown', inplace=True)
df.scheme_name.fillna('Unknown', inplace=True)
df.subvillage.fillna('Unknown', inplace=True)
df.public_meeting.fillna('Unknown', inplace=True)

In [5]:
# df['fundernum'] = df['funder'].map(df.funder.value_counts())

# df['funder_installer'] = df['funder'] == df['installer']
# df['funder_installer'] = df['funder_installer'].astype('int')

# df['permit'] = df['permit'].map({True: 1, False: 0, 'Unknown': 2})

# df['status_id'] = df['status_group'].map({'non functional': 0, 'functional needs repair': 1, 'functional': 2})

# Which variables will be good in a model?


* amount_tsh
* gps_height
* region (not region_code)
* district_code
* population
* public_meeting
* scheme_management
* Permit
* construction_year
* extraction_type_group or extraction_type_class
* payment
* management
* quality_group
* quantity
* source
* waterpoint_type

# Defining the train and test sets

In [6]:
X = df.copy()

columns = ['amount_tsh', 'gps_height', 'region', 
           'district_code', 'population', 'public_meeting', 
           'scheme_management', 'permit', 
           'extraction_type_group', 'payment', 'management', 
           'quality_group', 'quantity', 'source', 'waterpoint_type']

X = X[columns]

X['public_meeting'] = X['public_meeting'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['gps_height'] = X['gps_height'].astype('float64')
X['district_code'] = X['district_code'].astype('float64')
X['population'] = X['population'].astype('float64')
X['district_code'] = X['district_code'].astype('object')

X_cat = X.drop(list(X.select_dtypes(['float64']).columns), axis=1)
X_numeric = X[list(X.select_dtypes(['float64']).columns)]

y = df['status_group']

X_cat = pd.get_dummies(X_cat)

X = pd.concat([X_numeric, X_cat], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train),
                index = X_train.index,
                columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test),
                index = X_test.index,
                columns = X_test.columns)

  X_cat = pd.get_dummies(X_cat)


# Base Model – Logistic Regression, No Regularization

In [7]:
base_model = LogisticRegression(solver='liblinear', fit_intercept=False)

base_model.fit(X_train, y_train)

base_y_hat_train = base_model.predict(X_train)
base_y_hat_test = base_model.predict(X_test)

accuracy_score(y_train, base_y_hat_train)

0.6942087542087542

# Second Model – Decision Tree

In [34]:
dtc = DecisionTreeClassifier()

In [36]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 2, 5, 10],
    'min_samples_split': [5, 10, 20, 40],
    'min_samples_leaf': [5, 10, 20],
    'splitter': ['best', 'random']
}

gs_tree = GridSearchCV(dtc, param_grid, cv=3)
gs_tree.fit(X_train, y_train)

gs_tree.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'splitter': 'random'}

In [37]:
dtc = DecisionTreeClassifier(criterion= 'gini', max_depth= 10, min_samples_split= 10, min_samples_leaf=5, splitter='random')

In [38]:
dtc.fit(X_train, y_train)

In [39]:
dtc_y_hat_train = dtc.predict(X_train)
dtc_y_hat_test = dtc.predict(X_test)

accuracy_score(y_train, dtc_y_hat_train)

0.7468911335578002

In [13]:
# plt.figure(figsize=(12,12), dpi=500)
# tree.plot_tree(dtc, 
#                feature_names=X_train.columns,
#                class_names=np.unique(y).astype('str'),
#                filled=True, rounded=True)
# plt.show()

# Third Model – K Nearest Neighbors

In [14]:
clf = KNeighborsClassifier(n_neighbors=3)

clf.fit(X_train, y_train)

In [15]:
# clf_y_hat_train = clf.predict(X_train)
# clf_y_hat_test = clf.predict(X_test)

# accuracy_score(y_train, clf_y_hat_train)

In [16]:
# for sample in tqdm([X_train], desc="Predicting"):
#     clf_y_hat_train = clf.predict(sample)  # Your prediction step here

In [17]:
# for sample in tqdm([X_test], desc="Predicting"):
#     clf_y_hat_test = knn.predict(sample)  # Your prediction step here

In [18]:
testdf1 = X_train.iloc[:5]
testdf1

Unnamed: 0,amount_tsh,gps_height,population,region_Arusha,region_Dar es Salaam,region_Dodoma,region_Iringa,region_Kagera,region_Kigoma,region_Kilimanjaro,...,source_shallow well,source_spring,source_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
33449,-0.100679,-0.127923,-0.336825,-0.243002,-0.11694,-0.194568,-0.315583,-0.243365,-0.222585,-0.283998,...,1.59072,-0.634523,-0.03215,-0.043722,-0.961071,-0.338217,-0.009476,1.547264,-0.115152,-0.347011
52548,-0.100679,-0.966917,-0.378774,-0.243002,-0.11694,-0.194568,-0.315583,-0.243365,-0.222585,-0.283998,...,-0.628646,-0.634523,-0.03215,-0.043722,-0.961071,-0.338217,-0.009476,1.547264,-0.115152,-0.347011
59100,-0.025416,0.868202,4.759981,-0.243002,-0.11694,-0.194568,-0.315583,-0.243365,4.492663,-0.283998,...,-0.628646,-0.634523,-0.03215,-0.043722,1.040505,-0.338217,-0.009476,-0.646302,-0.115152,-0.347011
18730,-0.100679,-0.966917,-0.378774,-0.243002,-0.11694,-0.194568,-0.315583,4.109061,-0.222585,-0.283998,...,-0.628646,1.575987,-0.03215,-0.043722,-0.961071,-0.338217,-0.009476,-0.646302,8.684186,-0.347011
50136,0.049847,-0.530121,0.355334,-0.243002,-0.11694,-0.194568,-0.315583,-0.243365,-0.222585,-0.283998,...,1.59072,-0.634523,-0.03215,-0.043722,-0.961071,-0.338217,-0.009476,1.547264,-0.115152,-0.347011


In [19]:
testdf2 = y_train.iloc[:5]
testdf2

33449    non functional
52548    non functional
59100        functional
18730        functional
50136        functional
Name: status_group, dtype: object

In [20]:
testdf3 = X_test.iloc[:5]
testdf3

Unnamed: 0,amount_tsh,gps_height,population,region_Arusha,region_Dar es Salaam,region_Dodoma,region_Iringa,region_Kagera,region_Kigoma,region_Kilimanjaro,...,source_shallow well,source_spring,source_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
22883,-0.100679,-0.890513,-0.284389,-0.243002,-0.11694,-0.194568,-0.315583,-0.243365,-0.222585,-0.283998,...,-0.628646,-0.634523,-0.03215,-0.043722,1.040505,-0.338217,-0.009476,-0.646302,-0.115152,-0.347011
7704,-0.100679,1.222828,0.355334,4.115189,-0.11694,-0.194568,-0.315583,-0.243365,-0.222585,-0.283998,...,-0.628646,-0.634523,-0.03215,-0.043722,1.040505,-0.338217,-0.009476,-0.646302,-0.115152,-0.347011
42571,-0.091648,-0.701668,-0.169029,-0.243002,-0.11694,-0.194568,-0.315583,-0.243365,-0.222585,-0.283998,...,-0.628646,-0.634523,-0.03215,-0.043722,1.040505,-0.338217,-0.009476,-0.646302,-0.115152,-0.347011
27049,0.200372,-0.282171,-0.376676,-0.243002,-0.11694,-0.194568,-0.315583,-0.243365,-0.222585,-0.283998,...,-0.628646,-0.634523,-0.03215,-0.043722,-0.961071,-0.338217,-0.009476,1.547264,-0.115152,-0.347011
17698,-0.100679,-0.4739,0.820968,-0.243002,-0.11694,-0.194568,-0.315583,-0.243365,-0.222585,-0.283998,...,-0.628646,-0.634523,-0.03215,-0.043722,1.040505,-0.338217,-0.009476,-0.646302,-0.115152,-0.347011


In [21]:
clf = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
clf.fit(testdf1, testdf2)

In [22]:
# clf.predict(testdf3)

# Bagging Classifier

In [59]:
bagged_tree = BaggingClassifier(estimator=DecisionTreeClassifier(criterion= 'gini', max_depth= 10, min_samples_split= 10, min_samples_leaf=5, splitter='random'), n_estimators=1000)
bagged_tree.fit(X_train, y_train)

In [60]:
bagged_tree.score(X_train, y_train)

0.7557126823793491

# Random Forest

In [56]:
forest = RandomForestClassifier()

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 2, 5, 10],
    'min_samples_split': [5, 10, 20, 40],
    'min_samples_leaf': [5, 10, 20]
}

gs_forest = GridSearchCV(forest, param_grid, cv=3)
gs_forest.fit(X_train, y_train)

gs_forest.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 5,
 'min_samples_split': 10}

In [58]:
# Training accuracy score
gs_forest.score(X_train, y_train)

0.7477890011223345

# XGBoost

In [27]:
xgboost_y_train = y_train.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2})

In [28]:
# clf = XGBClassifier()
# clf.fit(X_train, xgboost_y_train)

In [29]:
# training_preds = clf.predict(X_train)
# test_preds = clf.predict(X_test)

In [30]:
# training_accuracy = accuracy_score(xgboost_y_train, training_preds)

In [31]:
# training_accuracy

# Bagging Classifier with GridSearchCV

In [None]:
bagged_tree = BaggingClassifier(DecisionTreeClassifier(), n_estimators=20)
bagged_tree.fit(X_train, y_train)

In [None]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 2, 5, 10],
    'min_samples_split': [5, 10, 20]
}

gs_tree = GridSearchCV(dtc, param_grid, cv=3)
gs_tree.fit(X_train, y_train)

gs_tree.best_params_