<a href="https://colab.research.google.com/github/JimKing100/DS-Unit-2-Kaggle-Challenge/blob/master/Kaggle_Challenge_Test_%202.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Installs
%%capture
!pip install --upgrade category_encoders plotly

In [177]:
# Imports
import os, sys

os.chdir('/content')
!git init .
!git remote add origin https://github.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge.git
!git pull origin master

!pip install -r requirements.txt

os.chdir('module1')

Reinitialized existing Git repository in /content/.git/
fatal: remote origin already exists.
From https://github.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge
 * branch            master     -> FETCH_HEAD
Already up to date.


In [0]:
# Imports
import pandas as pd
import numpy as np
import math

import sklearn
sklearn.__version__
from sklearn.model_selection import train_test_split

# Import the models
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline

# Import encoder and scaler and imputer
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Import random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [193]:
# Wrangles train, validate, and test sets in the same way
def wrangle(X):
    
    X = X.copy()

    # Convert date_recorded to datetime
    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    
    # Extract components from date_recorded, then drop the original column
    X['year_recorded'] = X['date_recorded'].dt.year
    X['month_recorded'] = X['date_recorded'].dt.month
    X['day_recorded'] = X['date_recorded'].dt.day
    X = X.drop(columns='date_recorded')
    
    # Engineer new feature years,  year_recorded - construction_year
    X.loc[X['construction_year'] == 0, 'construction_year'] = np.nan
    X['years'] = X['year_recorded'] - X['construction_year']  
    
    # Engineer new feature pump_age 
    X['pump_age'] = 2013 - X['construction_year']
    
    # About 3% of the time, latitude has small values near zero,
    # outside Tanzania, so we'll treat these like null values
    X['latitude'] = X['latitude'].replace(-2e-08, np.nan)
    
    # When columns have zeros and shouldn't, they are like null values
    cols_with_zeros = ['construction_year', 'longitude', 'latitude', 'gps_height',
                       'population', 'amount_tsh']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
        
    # Impute mean for longitude and latitude based on region
    average_lat = X.groupby('region').latitude.mean().reset_index()
    average_long = X.groupby('region').longitude.mean().reset_index()

    shinyanga_lat = average_lat.loc[average_lat['region'] == 'Shinyanga', 'latitude']
    shinyanga_long = average_long.loc[average_lat['region'] == 'Shinyanga', 'longitude']

    X.loc[(X['region'] == 'Shinyanga') & (X['latitude'] > -1), ['latitude']] = shinyanga_lat[17]
    X.loc[(X['region'] == 'Shinyanga') & (X['longitude'].isna()), ['longitude']] = shinyanga_long[17]

    mwanza_lat = average_lat.loc[average_lat['region'] == 'Mwanza', 'latitude']
    mwanza_long = average_long.loc[average_lat['region'] == 'Mwanza', 'longitude']

    X.loc[(X['region'] == 'Mwanza') & (X['latitude'] > -1), ['latitude']] = mwanza_lat[13]
    X.loc[(X['region'] == 'Mwanza') & (X['longitude'].isna()) , ['longitude']] = mwanza_long[13]
    
    # Impute mean for tsh based on mean of source_class/basin/waterpoint_type_group
    def tsh_calc(tsh, source, base, waterpoint):
      if math.isnan(tsh):
        if (source, base, waterpoint) in tsh_dict:
          new_tsh = tsh_dict[source, base, waterpoint]
          return new_tsh
      else:
        return tsh
      return tsh
  
    temp = X[~X['amount_tsh'].isna()].groupby(['source_class',
                                               'basin',
                                               'waterpoint_type_group'])['amount_tsh'].mean()
    tsh_dict = dict(temp)
    X['amount_tsh'] = X.apply(lambda x: tsh_calc(x['amount_tsh'], x['source_class'], x['basin'], x['waterpoint_type_group']), axis=1)
    
  # Impute mean for the feature based on latitude and longitude
    def latlong_conversion(feature, pop, long, lat):
    
      radius = 0.1
      radius_increment = 0.3
    
      if math.isnan(pop):
        pop_temp = 0
        while pop_temp <= 1 and radius <= 2:
          lat_from = lat - radius
          lat_to = lat + radius
          long_from = long - radius
          long_to = long + radius
        
          df = X[(X['latitude'] >= lat_from) & 
                 (X['latitude'] <= lat_to) &
                 (X['longitude'] >= long_from) &
                 (X['longitude'] <= long_to)]
        
          pop_temp = df[feature].mean()
          
          radius = radius + radius_increment
      else:
        pop_temp = pop
      
      if np.isnan(pop_temp):
        new_pop = X_train[feature].mean()
      else:
        new_pop = pop_temp
   
      return new_pop
    
    # Impute gps_height based on location
    #X['population'] = X.apply(lambda x: latlong_conversion('population', x['population'], x['longitude'], x['latitude']), axis=1)
    
    # Clean installer
    X['installer'] = X['installer'].str.lower()
    X['installer'] = X['installer'].str[:3]
    X['installer'].value_counts(normalize=True)
    tops = X['installer'].value_counts()[:15].index
    X.loc[~X['installer'].isin(tops), 'installer'] = 'other'
    
    # Clean funder and bin
    X['funder'] = X['funder'].str.lower()
    X['funder'].value_counts(normalize=True)
    tops = X['funder'].value_counts()[:15].index
    X.loc[~X['funder'].isin(tops), 'funder'] = 'other'
    
    # Drop unneeded columns
    unusable_variance = ['recorded_by', 'id', 'num_private','wpt_name', 'extraction_type_class',
                         'quality_group', 'source_type', 'quantity_group', 'payment_type',
                         'extraction_type_group']
    X = X.drop(columns=unusable_variance)
      
    return X

# Merge train_features.csv & train_labels.csv
train = pd.merge(pd.read_csv('../data/tanzania/train_features.csv'), 
                 pd.read_csv('../data/tanzania/train_labels.csv'))

# Read test_features.csv & sample_submission.csv
test = pd.read_csv('../data/tanzania/test_features.csv')
sample_submission = pd.read_csv('../data/tanzania/sample_submission.csv')

# Split train into train & val. Make val the same size as test.
target = 'status_group'
train, val = train_test_split(train, train_size = 0.80, test_size = 0.20, 
                              stratify=train[target], random_state=42)

# Wrangle train, validate, and test sets in the same way
train = wrangle(train)
val = wrangle(val)
test = wrangle(test)

train.head(25)

Unnamed: 0,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,management,management_group,payment,water_quality,quantity,source,source_class,waterpoint_type,waterpoint_type_group,status_group,year_recorded,month_recorded,day_recorded,years,pump_age
43360,1415.482234,other,,other,33.542898,-9.174777,Lake Nyasa,Mpandapanda,Mbeya,12,4,Rungwe,Kiwira,,True,VWC,K,,,gravity,vwc,user-group,never pay,soft,insufficient,spring,groundwater,communal standpipe,communal standpipe,functional,2011,7,27,,
7263,500.0,other,2049.0,other,34.66576,-9.308548,Rufiji,Kitichi,Iringa,11,4,Njombe,Imalinyi,175.0,True,WUA,Tove Mtwango gravity Scheme,True,2008.0,gravity,wua,user-group,pay monthly,soft,enough,spring,groundwater,communal standpipe,communal standpipe,functional,2011,3,23,3.0,5.0
2486,25.0,other,290.0,other,38.238568,-6.179919,Wami / Ruvu,Kwedigongo,Pwani,6,1,Bagamoyo,Mbwewe,2300.0,True,VWC,,False,2010.0,india mark ii,vwc,user-group,pay per bucket,salty,insufficient,shallow well,groundwater,hand pump,hand pump,functional,2011,3,7,1.0,3.0
313,391.538462,government of tanzania,,dwe,30.716727,-1.289055,Lake Victoria,Kihanga,Kagera,18,1,Karagwe,Isingiro,,True,,,True,,other,vwc,user-group,never pay,soft,enough,shallow well,groundwater,other,other,non functional,2011,7,31,,
52726,1267.552189,other,,gov,35.389331,-6.399942,Internal,Mtakuj,Dodoma,1,6,Bahi,Nondwa,,True,VWC,Zeje,True,,mono,vwc,user-group,pay per bucket,soft,enough,machine dbh,groundwater,communal standpipe,communal standpipe,functional,2011,3,10,,
8558,585.603261,other,1295.0,dwe,31.214583,-8.431428,Lake Tanganyika,Kisumba Kati,Rukwa,15,2,Sumbawanga Rural,Kasanga,200.0,True,VWC,Kisumba water supply,True,1986.0,gravity,vwc,user-group,never pay,soft,insufficient,river,surface,communal standpipe,communal standpipe,functional,2011,8,7,25.0,27.0
2559,20000.0,other,1515.0,dwe,36.6967,-3.337926,Pangani,Oroirwa,Arusha,2,2,Arusha Rural,Oltroto,150.0,True,VWC,Nabaiye pipe line,True,1995.0,gravity,vwc,user-group,pay monthly,soft,insufficient,spring,groundwater,communal standpipe multiple,communal standpipe,functional,2013,9,3,18.0,18.0
54735,1267.552189,other,,dwe,36.292724,-5.177333,Internal,Polisi,Dodoma,1,1,Kondoa,Mrijo,,True,VWC,Mrij,False,,mono,vwc,user-group,pay per bucket,soft,enough,machine dbh,groundwater,communal standpipe,communal standpipe,functional,2011,4,17,,
25763,2900.425,danida,,dan,32.877248,-8.925921,Lake Rukwa,Bagamoyo,Mbeya,12,6,Mbozi,Halungu,,False,VWC,,False,,swn 80,vwc,user-group,never pay,soft,enough,machine dbh,groundwater,hand pump,hand pump,non functional,2011,8,3,,
44540,391.538462,other,,other,33.014412,-3.115869,Lake Victoria,Mwanzuki,Mwanza,19,7,Missungwi,Shilalo,,True,VWC,,True,,submersible,vwc,user-group,pay monthly,soft,enough,machine dbh,groundwater,other,other,non functional,2011,8,3,,


In [197]:
# Arrange data into X features matrix and y target vector
X_train = train.drop(columns=target)
y_train = train[target]
X_val = val.drop(columns=target)
y_val = val[target]
X_test = test

# Make pipeline!
pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    SimpleImputer(strategy='mean'), 
    RandomForestClassifier(n_estimators=1400, 
                           random_state=42,
                           min_samples_split=5,
                           min_samples_leaf=1,
                           max_features='auto',
                           max_depth=30,
                           bootstrap=True,
                           n_jobs=-1,
                           verbose = 1)
)

# Fit on train, score on val
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)
print('Validation Accuracy', accuracy_score(y_val, y_pred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:   59.7s
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed:  1.8min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    2.1s
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed:    3.3s
[Parallel(n_jobs=2)]: Done 1400 out of 1400 | elapsed:    3.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent worker

Pipeline Score 0.8167508417508418
Validation Accuracy 0.8167508417508418


[Parallel(n_jobs=2)]: Done 1400 out of 1400 | elapsed:    3.7s finished


In [196]:
pd.set_option('display.max_rows', 200)
model = pipeline.named_steps['randomforestclassifier']
encoder = pipeline.named_steps['ordinalencoder']
encoded_columns = encoder.transform(X_train).columns 
importances = pd.Series(model.feature_importances_, encoded_columns)
importances.sort_values(ascending=False)

quantity                 0.102912
longitude                0.081735
latitude                 0.080759
subvillage               0.053122
waterpoint_type          0.049363
gps_height               0.044913
ward                     0.039264
day_recorded             0.039024
waterpoint_type_group    0.034103
population               0.032443
extraction_type          0.032289
years                    0.030502
pump_age                 0.030280
construction_year        0.030059
amount_tsh               0.029708
payment                  0.027007
lga                      0.026850
scheme_name              0.026775
source                   0.020659
district_code            0.018522
installer                0.018065
funder                   0.017340
region                   0.016466
region_code              0.016266
month_recorded           0.015979
basin                    0.014294
scheme_management        0.013609
management               0.012457
water_quality            0.012262
public_meeting

In [181]:
assert all(X_test.columns == X_train.columns)

y_pred = pipeline.predict(X_test)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.4s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    2.5s
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed:    4.0s
[Parallel(n_jobs=2)]: Done 1400 out of 1400 | elapsed:    4.4s finished


In [0]:
submission = sample_submission.copy()
submission['status_group'] = y_pred
submission.to_csv('/content/submission-1a.csv', index=False)