In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

In [55]:

DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge/master/data/'
# Merge train_features.csv & train_labels.csv
train = pd.merge(pd.read_csv(DATA_PATH+'waterpumps/train_features.csv'), 
                 pd.read_csv(DATA_PATH+'waterpumps/train_labels.csv'))
train = train.replace({'functional': 0, 'non functional': 2, 'functional needs repair': 1})
# Read test_features.csv & sample_submission.csv
test = pd.read_csv(DATA_PATH+'waterpumps/test_features.csv')
sample_submission = pd.read_csv(DATA_PATH+'waterpumps/sample_submission.csv')

In [56]:
train, val = train_test_split(train, train_size=0.80,
                              test_size=0.20, stratify=train['status_group'], random_state=42)
train.shape, val.shape, test.shape

((47520, 41), (11880, 41), (14358, 40))

In [57]:
train.select_dtypes(include='number').describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,47520.0,37037.915699,21412.099719,0.0,18482.75,36986.5,55450.25,74247.0
amount_tsh,47520.0,321.925261,3197.240487,0.0,0.0,0.0,25.0,350000.0
gps_height,47520.0,669.567656,693.005745,-63.0,0.0,372.5,1320.0,2770.0
longitude,47520.0,34.082431,6.552053,0.0,33.092142,34.91074,37.170578,40.3443
latitude,47520.0,-5.705946,2.941332,-11.64944,-8.528215,-5.021436,-3.327185,-2e-08
num_private,47520.0,0.477736,13.312977,0.0,0.0,0.0,0.0,1776.0
region_code,47520.0,15.258291,17.530228,1.0,5.0,12.0,17.0,99.0
district_code,47520.0,5.616751,9.62123,0.0,2.0,3.0,5.0,80.0
population,47520.0,179.805513,463.081564,0.0,0.0,25.0,215.0,15300.0
construction_year,47520.0,1302.768939,950.955437,0.0,0.0,1986.0,2004.0,2013.0


In [58]:
train.select_dtypes(exclude='number').describe().T.sort_values(by='unique')

Unnamed: 0,count,unique,top,freq
recorded_by,47520,1,GeoData Consultants Ltd,47520
public_meeting,44876,2,True,40838
permit,45077,2,True,31071
source_class,47520,3,groundwater,36638
management_group,47520,5,user-group,42027
quantity_group,47520,5,enough,26567
quantity,47520,5,enough,26567
waterpoint_type_group,47520,6,communal standpipe,27642
quality_group,47520,6,good,40598
payment_type,47520,7,never pay,20287


In [59]:
def wrangle(X):
    """Wrangle train, validate, and test sets in the same way"""
    
    # Prevent SettingWithCopyWarning (in acordince with lectuere)
    X = X.copy()
    
    # About 3% of the time, latitude has small values near zero
    X['latitude'] = X['latitude'].replace(-2e-08, 0)
    
    # replace the zeros with nulls.
    cols_with_zeros = ['longitude', 'latitude']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
            
    cols_with_zeros = ['population']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
    
    cols_with_zeros = ['construction_year']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)

    cols_with_zeros = ['amount_tsh']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
    
    # duplicates, so drop them
    X = X.drop(columns='quantity')
    X = X.drop(columns='quality_group')
    X = X.drop(columns='waterpoint_type_group')
    X = X.drop(columns='extraction_type_group')
    X = X.drop(columns='extraction_type_class')
    X = X.drop(columns='payment')
    X = X.drop(columns='source_type')
    X = X.drop(columns='scheme_management')
 
    X['year_inspected'] = pd.DatetimeIndex(X['date_recorded']).year
    X['years_since_inspection'] = X['year_inspected'] - X['construction_year']
    X = X.drop(columns='year_inspected')
    
    X['Permited_and_Public'] = X['public_meeting'] & X['permit']
    
      # return the wrangled dataframe
    return X

train = wrangle(train)
val = wrangle(val)
test = wrangle(test)

# creating target and features lsts
target = 'status_group'
train_features = train.drop(columns=[target, 'id'])
numeric_features = train_features.select_dtypes(include='number').columns.tolist()
cardinality = train_features.select_dtypes(exclude='number').nunique()
categorical_features = cardinality[cardinality <= 50].index.tolist()
features = numeric_features + categorical_features
print(features)

['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'region_code', 'district_code', 'population', 'construction_year', 'years_since_inspection', 'basin', 'region', 'public_meeting', 'recorded_by', 'permit', 'extraction_type', 'management', 'management_group', 'payment_type', 'water_quality', 'quantity_group', 'source', 'source_class', 'waterpoint_type', 'Permited_and_Public']


In [63]:
# creating Train, Validation, and Test vars
X_train = train.drop(columns=target)
y_train = train[target]
X_val = val.drop(columns=target)
y_val = val[target]
X_test = test

pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    SimpleImputer(strategy='most_frequent'), 
    RandomForestClassifier(n_estimators=350, random_state=42, n_jobs=-1)
)

# Fit on train, score on val
#pipeline.fit(X_train, y_train)
#print('Validation Accuracy', pipeline.score(X_val, y_val))
k=3
scores = cross_val_score(pipeline, X_train, y_train, cv=k, 
                         scoring='neg_mean_absolute_error')
print(f'MAE for {k} folds:', -scores)
pipeline.fit(X_train, y_train)
print('Validation Accuracy', pipeline.score(X_val, y_val))

MAE for 3 folds: [0.33676303 0.32571501 0.334112  ]
Validation Accuracy 0.8131313131313131


In [45]:
train.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,management_group,payment_type,water_quality,quantity_group,source,source_class,waterpoint_type,status_group,years_since_inspection,Permited_and_Public
43360,72938,,2011-07-27,,0,,33.542898,-9.174777,Kwa Mzee Noa,0,...,user-group,never pay,soft,insufficient,spring,groundwater,communal standpipe,functional,,False
7263,65358,500.0,2011-03-23,Rc Church,2049,ACRA,34.66576,-9.308548,Kwa Yasinta Ng'Ande,0,...,user-group,monthly,soft,enough,spring,groundwater,communal standpipe,functional,3.0,True
2486,469,25.0,2011-03-07,Donor,290,Do,38.238568,-6.179919,Kwasungwini,0,...,user-group,per bucket,salty,insufficient,shallow well,groundwater,hand pump,functional,1.0,False
313,1298,,2011-07-31,Government Of Tanzania,0,DWE,30.716727,-1.289055,Kwajovin 2,0,...,user-group,never pay,soft,enough,shallow well,groundwater,other,non functional,,True
52726,27001,,2011-03-10,Water,0,Gove,35.389331,-6.399942,Chama,0,...,user-group,per bucket,soft,enough,machine dbh,groundwater,communal standpipe,functional,,True
