<a href="https://colab.research.google.com/github/JimKing100/DS-Unit-2-Applied-Modeling/blob/master/Kaggle_Study_Guide.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Installs
%%capture
!pip install --upgrade category_encoders plotly

In [36]:
# Imports
import os, sys

os.chdir('/content')
!git init .
!git remote add origin https://github.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge.git
!git pull origin master

!pip install -r requirements.txt

os.chdir('module1')

Reinitialized existing Git repository in /content/.git/
fatal: remote origin already exists.
From https://github.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge
 * branch            master     -> FETCH_HEAD
Already up to date.
Collecting category_encoders==2.0.0 (from -r requirements.txt (line 1))
  Using cached https://files.pythonhosted.org/packages/6e/a1/f7a22f144f33be78afeb06bfa78478e8284a64263a3c09b1ef54e673841e/category_encoders-2.0.0-py2.py3-none-any.whl
Installing collected packages: category-encoders
  Found existing installation: category-encoders 2.1.0
    Uninstalling category-encoders-2.1.0:
      Successfully uninstalled category-encoders-2.1.0
Successfully installed category-encoders-2.0.0


In [0]:
# Imports
import pandas as pd
import numpy as np
import math

import sklearn
sklearn.__version__
from sklearn.model_selection import train_test_split

# Import the models
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline

# Import encoder and scaler and imputer
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Import random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [0]:
def wrangle(X):
  # Wrangles train, validate, and test sets
  X = X.copy()

  # Convert date_recorded to datetime
  X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    
  # Extract components from date_recorded and drop the original column
  X['year_recorded'] = X['date_recorded'].dt.year
  X['month_recorded'] = X['date_recorded'].dt.month
  X['day_recorded'] = X['date_recorded'].dt.day
  X = X.drop(columns='date_recorded')
    
  # Engineer new feature years - construction_year to date_recorded
  X.loc[X['construction_year'] == 0, 'construction_year'] = np.nan
  X['years'] = X['year_recorded'] - X['construction_year']
    
  # Remove latitude outliers
  X['latitude'] = X['latitude'].replace(-2e-08, np.nan)
    
  # Features with many zero's are likely nan's
  cols_with_zeros = ['construction_year', 'longitude', 'latitude', 'gps_height',
                     'population', 'amount_tsh']
  for col in cols_with_zeros:
      X[col] = X[col].replace(0, np.nan)
    
  # Impute mean for years
  X.loc[X['years'].isna(), 'years'] = X['years'].mean()
  #X.loc[X['pump_age'].isna(), 'pump_age'] = X['pump_age'].mean()
    
  # Impute mean for longitude and latitude based on region
  average_lat = X.groupby('region').latitude.mean().reset_index()
  average_long = X.groupby('region').longitude.mean().reset_index()

  shinyanga_lat = average_lat.loc[average_lat['region'] == 'Shinyanga', 'latitude']
  shinyanga_long = average_long.loc[average_long['region'] == 'Shinyanga', 'longitude']

  X.loc[(X['region'] == 'Shinyanga') & (X['latitude'] > -1), ['latitude']] = shinyanga_lat[17]
  X.loc[(X['region'] == 'Shinyanga') & (X['longitude'].isna()), ['longitude']] = shinyanga_long[17]

  mwanza_lat = average_lat.loc[average_lat['region'] == 'Mwanza', 'latitude']
  mwanza_long = average_long.loc[average_long['region'] == 'Mwanza', 'longitude']

  X.loc[(X['region'] == 'Mwanza') & (X['latitude'] > -1), ['latitude']] = mwanza_lat[13]
  X.loc[(X['region'] == 'Mwanza') & (X['longitude'].isna()) , ['longitude']] = mwanza_long[13]
    
  # Clean installer
  X['installer'] = X['installer'].str.lower()
  X['installer'] = X['installer'].str[:4]
  X['installer'].value_counts(normalize=True)
  tops = X['installer'].value_counts()[:15].index
  X.loc[~X['installer'].isin(tops), 'installer'] = 'other'
    
  # Bin subvillage
  tops = X['subvillage'].value_counts()[:25].index
  X.loc[~X['subvillage'].isin(tops), 'subvillage'] = 'Other'
    
  # Impute mean for a feature based on latitude and longitude
  def latlong_conversion(feature, pop, long, lat):
    
    radius = 0.1
    radius_increment = 0.3
    
    if math.isnan(pop):
      pop_temp = 0
      while pop_temp <= 1 and radius <= 2:
        lat_from = lat - radius
        lat_to = lat + radius
        long_from = long - radius
        long_to = long + radius
        
        df = X[(X['latitude'] >= lat_from) & 
               (X['latitude'] <= lat_to) &
               (X['longitude'] >= long_from) &
               (X['longitude'] <= long_to)]
        
        pop_temp = df[feature].mean()
          
        radius = radius + radius_increment
    else:
      pop_temp = pop
      
    if np.isnan(pop_temp):
      new_pop = X_train[feature].mean()
    else:
      new_pop = pop_temp
   
    return new_pop
    
  X.loc[X['population'].isna(), 'population'] = X['population'].mean()
    
  # Drop unneeded columns
  unusable_variance = ['recorded_by', 'id', 'num_private', 'wpt_name']
  X = X.drop(columns=unusable_variance)
        
  return X
  
# Merge train_features.csv & train_labels.csv
train = pd.merge(pd.read_csv('../data/waterpumps/train_features.csv'), 
                 pd.read_csv('../data/waterpumps/train_labels.csv'))

# Read test_features.csv & sample_submission.csv
test = pd.read_csv('../data/waterpumps/test_features.csv')
sample_submission = pd.read_csv('../data/waterpumps/sample_submission.csv')

# Split train into train & val. Make val the same size as test.
target = 'status_group'
train, val = train_test_split(train, train_size=0.80, test_size=0.20,  
                              stratify=train[target], random_state=42)

# Wrangle train, validate, and test sets in the same way
train = wrangle(train)
val = wrangle(val)
test = wrangle(test)

# Arrange data into X features matrix and y target vector
X_train = train.drop(columns=target)
y_train = train[target]
X_val = val.drop(columns=target)
y_val = val[target]
X_test = test

In [39]:
X_test.head()

Unnamed: 0,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,year_recorded,month_recorded,day_recorded,years
0,,Dmdd,1996.0,other,35.290799,-4.059696,Internal,Other,Manyara,21,3,Mbulu,Bashay,321.0,True,Parastatal,,True,2012.0,other,other,other,parastatal,parastatal,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other,2013,2,4,1.0
1,,Government Of Tanzania,1569.0,dwe,36.656709,-3.309214,Pangani,Other,Arusha,2,2,Arusha Rural,Kimnyaki,300.0,True,VWC,TPRI pipe line,True,2000.0,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,2013,2,4,13.0
2,,,1567.0,other,34.767863,-5.004344,Internal,Other,Singida,13,2,Singida Rural,Puma,500.0,True,VWC,P,,2010.0,other,other,other,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other,2013,2,1,3.0
3,,Finn Water,267.0,other,38.058046,-9.418672,Ruvuma / Southern Coast,Other,Lindi,80,43,Liwale,Mkutano,250.0,,VWC,,True,1987.0,other,other,other,vwc,user-group,unknown,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other,2013,1,22,26.0
4,500.0,Bruder,1260.0,other,35.006123,-10.950412,Ruvuma / Southern Coast,Other,Ruvuma,10,3,Mbinga,Mbinga Urban,60.0,,Water Board,BRUDER,True,2000.0,gravity,gravity,gravity,water board,user-group,pay monthly,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,2013,3,27,13.0


In [40]:
# Make pipeline!
pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    SimpleImputer(strategy='mean'), 
    RandomForestClassifier(n_estimators=500, 
                           random_state=42,
                           min_samples_split=5,
                           min_samples_leaf=1,
                           max_features='auto',
                           max_depth=30,
                           bootstrap=True,
                           n_jobs=-1,
                           verbose = 1)
)

# Fit on train, score on val
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)
print('Validation Accuracy', accuracy_score(y_val, y_pred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:   34.8s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   39.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.3s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    1.4s finished


Validation Accuracy 0.8176767676767677


In [41]:
pd.set_option('display.max_rows', 200)
model = pipeline.named_steps['randomforestclassifier']
encoder = pipeline.named_steps['ordinalencoder']
encoded_columns = encoder.transform(X_train).columns 
importances = pd.Series(model.feature_importances_, encoded_columns)
importances.sort_values(ascending=False)

longitude                0.084068
latitude                 0.083611
quantity                 0.066962
quantity_group           0.066737
gps_height               0.045444
waterpoint_type          0.040546
day_recorded             0.039013
ward                     0.038048
construction_year        0.036405
years                    0.035888
population               0.032702
funder                   0.032491
extraction_type_class    0.030487
waterpoint_type_group    0.027936
scheme_name              0.025065
lga                      0.024756
extraction_type          0.017666
payment_type             0.017650
payment                  0.017279
district_code            0.017116
installer                0.017114
extraction_type_group    0.016782
amount_tsh               0.016245
month_recorded           0.015606
region                   0.015513
region_code              0.015439
source                   0.015058
basin                    0.014976
source_type              0.014462
scheme_managem

In [42]:
import eli5
from eli5.sklearn import PermutationImportance

permuter = PermutationImportance(
    model, 
    scoring='accuracy',
    n_iter=2,
    random_state=42
)

encoder = pipeline.named_steps['ordinalencoder']
imputer = pipeline.named_steps['simpleimputer']
X_val_encoded = encoder.transform(X_val)
X_val_processed = imputer.transform(X_val_encoded)

permuter.fit(X_val_processed, y_val)
feature_names = X_val.columns.tolist()

eli5.show_weights(
    permuter,
    top=None,
    feature_names = feature_names
)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.3s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    1.4s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    1.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    1.2s finished
[

Weight,Feature
0.0309  ± 0.0019,quantity
0.0241  ± 0.0013,quantity_group
0.0107  ± 0.0003,waterpoint_type
0.0090  ± 0.0004,latitude
0.0084  ± 0.0034,extraction_type_class
0.0075  ± 0.0006,longitude
0.0056  ± 0.0017,population
0.0043  ± 0.0001,waterpoint_type_group
0.0028  ± 0.0009,construction_year
0.0027  ± 0.0024,lga


In [49]:
majority_class = y_train.mode()[0]
majority_class

'functional'

In [50]:
y_pred = np.full_like(y_val, fill_value=majority_class)
y_pred

array(['functional', 'functional', 'functional', ..., 'functional',
       'functional', 'functional'], dtype=object)

In [51]:
accuracy_score(y_val, y_pred)

0.5430976430976431

In [53]:
from sklearn.metrics import classification_report

print(classification_report(y_val, y_pred))

  'precision', 'predicted', average, warn_for)


                         precision    recall  f1-score   support

             functional       0.54      1.00      0.70      6452
functional needs repair       0.00      0.00      0.00       863
         non functional       0.00      0.00      0.00      4565

               accuracy                           0.54     11880
              macro avg       0.18      0.33      0.23     11880
           weighted avg       0.29      0.54      0.38     11880



In [54]:
y_pred_proba = model.predict_proba(X_val_processed)[:, 0]
y_pred_proba

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    1.3s finished


array([0.35244376, 0.63585714, 0.77597492, ..., 0.4076373 , 0.5754416 ,
       0.0171381 ])

In [55]:
from sklearn.metrics import roc_auc_score

print('Test ROC AUC:', roc_auc_score(y_val, y_pred_proba))

ValueError: ignored