Lambda School Data Science

*Unit 2, Sprint 2, Module 1*

---

In [1]:
import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge/master/data/'
    !pip install category_encoders==2.*
    !pip install pandas-profiling==2.*



In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import numpy as np

In [0]:

train = pd.merge(pd.read_csv(DATA_PATH+'waterpumps/train_features.csv'), 
                 pd.read_csv(DATA_PATH+'waterpumps/train_labels.csv'))
test = pd.read_csv(DATA_PATH+'waterpumps/test_features.csv')
sample_submission = pd.read_csv(DATA_PATH+'waterpumps/sample_submission.csv')


In [0]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(train, train_size=0.80, test_size=0.20,
                              stratify=train['status_group'], random_state=42)

In [0]:
# def wrangle(X):
#     """Wrangle train, validate, and test sets in the same way"""
    
#     # Prevent SettingWithCopyWarning
#     X = X.copy()
    
#     # About 3% of the time, latitude has small values near zero,
#     # outside Tanzania, so we'll treat these values like zero.
#     X['latitude'] = X['latitude'].replace(-2e-08, 0)
    
#     # When columns have zeros and shouldn't, they are like null values.
#     # So we will replace the zeros with nulls, and impute missing values later.
#     # Also create a "missing indicator" column, because the fact that
#     # values are missing may be a predictive signal.
#     cols_with_zeros = ['longitude', 'latitude', 'construction_year', 
#                        'gps_height', 'population']
#     for col in cols_with_zeros:
#         X[col] = X[col].replace(0, np.nan)
#         X[col+'_MISSING'] = X[col].isnull()
            
#     # Drop duplicate columns
#     duplicates = ['quantity_group', 'payment_type']
#     X = X.drop(columns=duplicates)
    
#     # Drop recorded_by (never varies) and id (always varies, random)
#     unusable_variance = ['recorded_by', 'id']
#     X = X.drop(columns=unusable_variance)
    
#     # Convert date_recorded to datetime
#     X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    
#     # Extract components from date_recorded, then drop the original column
#     X['year_recorded'] = X['date_recorded'].dt.year
#     X['month_recorded'] = X['date_recorded'].dt.month
#     X['day_recorded'] = X['date_recorded'].dt.day
#     X = X.drop(columns='date_recorded')
    
#     # Engineer feature: how many years from construction_year to date_recorded
#     X['years'] = X['year_recorded'] - X['construction_year']
#     X['years_MISSING'] = X['years'].isnull()
    
#     # return the wrangled dataframe
#     return X

# train = wrangle(train)
# val = wrangle(val)
# test = wrangle(test)

In [6]:
# The status_group column is the target
target = 'status_group'

# Get a dataframe with all train columns except the target & id
# take the train data frame and drop columns 'status_group' and 'id' then assign it to train_features
train_features = train.drop(columns=[target])
# , 'id'

# Get a list of the numeric features
# take the train_features data frame and only keep data types that are a number... 
# then convert the columns to a list and assign it to the variable numeric features
numeric_features = train_features.select_dtypes(include='number').columns.tolist()

# Get a series with the cardinality of the nonnumeric features
# take train_features and subset again - this time exclude data types of numbers
# then only keep the unique obs and assign it to cardinality
cardinality = train_features.select_dtypes(exclude='number').nunique()

# Get a list of all categorical features with cardinality <= 50
categorical_features = cardinality[cardinality <= 50].index.tolist()

# Combine the lists 
features = numeric_features + categorical_features
print(features)

['id', 'amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'region_code', 'district_code', 'population', 'construction_year', 'basin', 'region', 'public_meeting', 'recorded_by', 'scheme_management', 'permit', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group']


In [0]:
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]

In [9]:
# fit a decision tree classifier

import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, f_classif

pipeline = make_pipeline(
    # SelectKBest(f_classif, k=10),
    ce.OneHotEncoder(use_cat_names=True),
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    DecisionTreeClassifier(random_state=42, min_samples_leaf=10)
)

# Fit on training
pipeline.fit(X_train, y_train)

print ('Train Accuracy', pipeline.score(X_train, y_train))
print ('Validation Accuracy', pipeline.score(X_val, y_val))

Train Accuracy 0.839520202020202
Validation Accuracy 0.7668350168350169


In [0]:
# Graph feature importances

%matplotlib inline
import matplotlib.pyplot as plt

model = pipeline.named_steps['decisiontreeclassifier']

encoder = pipeline.named_steps['onehotencoder']
encoded_columns = encoder.transform(X_val).columns
importances = pd.Series( model.feature_importances_ , encoded_columns)

plt.figure(figsize=(10,30))
importances.sort_values().plot.barh()




In [0]:
importances.get_values

In [0]:
y_pred = pipeline.predict(X_test)

# Makes a dataframe with two columns, id and status_group, 
# and writes to a csv file, without the index

submission = sample_submission.copy()
submission['status_group'] = y_pred
submission.to_csv('kyle_tyler_sub4.csv', index=False)

In [0]:
from google.colab import files
files.download('kyle_tyler_sub4.csv')