In [10]:
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, Imputer, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

import numpy as np

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.2f}'.format)

In [19]:
train = pd.merge(pd.read_csv('https://raw.githubusercontent.com/WillHK/DS-Unit-2-Classification-1/master/module1-logistic-regression/train_features.csv')
                ,pd.read_csv('https://raw.githubusercontent.com/WillHK/DS-Unit-2-Classification-1/master/module1-logistic-regression/train_labels.csv'))
test = pd.read_csv('https://raw.githubusercontent.com/WillHK/DS-Unit-2-Classification-1/master/module1-logistic-regression/test_features.csv')

In [12]:
sample_submission = pd.read_csv('https://raw.githubusercontent.com/WillHK/DS-Unit-2-Classification-1/master/module1-logistic-regression/sample_submission.csv')

In [5]:
train.shape, test.shape

((59400, 41), (14358, 40))

In [20]:
train, val = train_test_split(train, train_size=0.80, test_size=0.20, 
                              stratify=train['status_group'], random_state=42)
# 'quantity_group', 'management', 'scheme_management', 'extraction_type', 'extraction_type_group', 'waterpoint_type'
def wrangle(X):
    """Wrangles train, validate, and test sets in the same way"""
    X = X.copy()

    # Convert date_recorded to datetime
    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    
    # Extract components from date_recorded, then drop the original column
    X['year_recorded'] = X['date_recorded'].dt.year
    X['month_recorded'] = X['date_recorded'].dt.month
    X['day_recorded'] = X['date_recorded'].dt.day
    X = X.drop(columns='date_recorded')
    
    # Engineer feature: how many years from construction_year to date_recorded
    X['years'] = X['year_recorded'] - X['construction_year']    
    
    # Drop recorded_by (never varies) and id (always varies, random)
    X = X.drop(columns=['recorded_by', 'id'])
    
    # Drop duplicate columns
    duplicate_columns = ['payment_type', 'management_group', 'source_class', 'quality_group', 'source_type', 'waterpoint_type_group', 'extraction_type_class', 'management', 'extraction_type_group']
    X = X.drop(columns=duplicate_columns)
    
    # About 3% of the time, latitude has small values near zero,
    # outside Tanzania, so we'll treat these like null values
    X['latitude'] = X['latitude'].replace(-2e-08, np.nan)
    
    # When columns have zeros and shouldn't, they are like null values
    cols_with_zeros = ['construction_year', 'longitude', 'latitude', 'gps_height', 'population', 'wpt_name']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
        
    # For categoricals with missing values, fill with the category 'MISSING'
    categoricals = X.select_dtypes(exclude='number').columns
    for col in categoricals:
        X[col] = X[col].fillna('MISSING')
    
    return X


train = wrangle(train)
val = wrangle(val)
test = wrangle(test)

In [7]:
train.describe(include='all')

Unnamed: 0,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,scheme_name,permit,construction_year,extraction_type_class,management_group,payment,payment_type,water_quality,quality_group,quantity,source,source_type,source_class,waterpoint_type_group,status_group,year_recorded,month_recorded,day_recorded,years
count,47520.0,47520,31215.0,47520,46078.0,46078.0,47520,47520.0,47520,47520,47520,47520.0,47520.0,47520,47520,30454.0,47520,47520,47520,31003.0,47520,47520,47520,47520,47520,47520,47520,47520,47520,47520,47520,47520,47520.0,47520.0,47520.0,47520.0
unique,,1717,,1930,,,30661,,9,17232,21,,,124,2082,,3,2564,3,,7,5,7,7,8,6,5,10,7,3,6,3,,,,
top,,Government Of Tanzania,,DWE,,,none,,Lake Victoria,Shuleni,Iringa,,,Njombe,Igosi,,True,MISSING,True,,gravity,user-group,never pay,never pay,soft,good,enough,spring,spring,groundwater,communal standpipe,functional,,,,
freq,,7321,,13978,,,2879,,8137,420,4250,,,2003,257,,40838,22532,31071,,21448,42027,20287,20287,40598,40598,26567,13620,13620,36638,27642,25807,,,,
mean,321.93,,1019.31,,35.15,-5.88,,0.48,,,,15.26,5.62,,,280.57,,,,1996.83,,,,,,,,,,,,,2011.92,4.37,15.63,709.15
std,3197.24,,612.06,,2.6,2.81,,13.31,,,,17.53,9.62,,,553.49,,,,12.5,,,,,,,,,,,,,0.96,3.03,8.69,950.62
min,0.0,,-63.0,,29.61,-11.65,,0.0,,,,1.0,0.0,,,1.0,,,,1960.0,,,,,,,,,,,,,2002.0,1.0,1.0,-7.0
25%,0.0,,395.5,,33.28,-8.63,,0.0,,,,5.0,2.0,,,40.0,,,,1988.0,,,,,,,,,,,,,2011.0,2.0,8.0,8.0
50%,0.0,,1167.0,,35.01,-5.17,,0.0,,,,12.0,3.0,,,150.0,,,,2000.0,,,,,,,,,,,,,2012.0,3.0,16.0,26.0
75%,25.0,,1497.0,,37.22,-3.38,,0.0,,,,17.0,5.0,,,321.0,,,,2008.0,,,,,,,,,,,,,2013.0,7.0,23.0,2011.0


In [14]:
# The status_group column is the target
target = 'status_group'

# Get a dataframe with all train columns except the target
train_features = train.drop(columns=[target])

# Get a list of the numeric features
numeric_features = train_features.select_dtypes(include='number').columns.tolist()

# Get a series with the cardinality of the nonnumeric features
cardinality = train_features.select_dtypes(exclude='number').nunique()

# Get a list of all categorical features with cardinality <= 50
categorical_features = cardinality[cardinality <= 50].index.tolist()
cardinal_features = cardinality[cardinality > 50].index.tolist()

# Combine the lists 
features = numeric_features + categorical_features

In [18]:
cardinality.sort_values(ascending=False)

wpt_name             30661
subvillage           17232
scheme_name           2564
ward                  2082
installer             1930
funder                1717
lga                    124
region                  21
extraction_type         18
scheme_management       13
source                  10
basin                    9
water_quality            8
payment                  7
waterpoint_type          7
quantity                 5
quantity_group           5
public_meeting           3
permit                   3
dtype: int64

In [21]:
# Arrange data into X features matrix and y target vector 
X_train = train_features
y_train = train[target]
X_val = val.drop(columns=[target])
y_val = val[target]
X_test = test

In [24]:

pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    RandomForestClassifier(n_estimators=1600, max_depth=40, n_jobs=-1)
)
pipeline.fit(X_train, y_train)
print(pipeline.score(X_val, y_val))

0.8122053872053872


In [None]:
import xgboost as xgb
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    xgb.XGBClassifier(learning_rate = 0.1,
                        n_estimators=1000,
                        max_depth=8,
                        min_chld_weight=1,
                        objective='binary:logistic',
                       )
)
pipeline.fit(X_train, y_train)
print(pipeline.score(X_val, y_val))

In [49]:
y_pred = pipeline.predict(X_test)

# Write submission csv file
submission = sample_submission.copy()
submission['status_group'] = y_pred
submission.to_csv('submission-07.csv', index=False)