In [1]:
import category_encoders as ce
from fancyimpute import IterativeImputer
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
import xgboost as xgb
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import KFold, cross_val_score
import numpy as np
from numpy import loadtxt
pd.set_option('display.float_format', '{:.2f}'.format)

Using TensorFlow backend.


## Read data and split into train/val

In [2]:
# Merge train_features.csv & train_labels.csv
train = pd.merge(pd.read_csv('https://drive.google.com/uc?export=download&id=14ULvX0uOgftTB2s97uS8lIx1nHGQIB0P'), 
                 pd.read_csv('https://drive.google.com/uc?export=download&id=1r441wLr7gKGHGLyPpKauvCuUOU556S2f'))

# Read test_features.csv & sample_submission.csv
test = pd.read_csv('https://drive.google.com/uc?export=download&id=1wvsYl9hbRbZuIuoaLWCsW_kbcxCdocHz')
sample_submission = pd.read_csv('https://drive.google.com/uc?export=download&id=1kfJewnmhowpUo381oSn3XqsQ6Eto23XV')

In [3]:
# split data into train and validation sets
train, val = train_test_split(train, train_size=0.80, test_size=0.20, 
                              stratify=train['status_group'])

train.shape, val.shape, test.shape, sample_submission.shape

((47520, 41), (11880, 41), (14358, 40), (14358, 2))

## Wrangle data

In [4]:
def wrangle(X):
    """Wrangles train, validate, and test sets in the same way"""
    X = X.copy()

    # Convert date_recorded to datetime
    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    
    # Extract components from date_recorded, then drop the original column
    X['year_recorded'] = X['date_recorded'].dt.year
    X['month_recorded'] = X['date_recorded'].dt.month
    X['day_recorded'] = X['date_recorded'].dt.day
    X = X.drop(columns='date_recorded')
    
    # Engineer feature: how many years from construction_year to date_recorded
    X['years'] = X['year_recorded'] - X['construction_year']    
    
    # Drop recorded_by (never varies) and id (always varies, random)
    unusable_variance = ['recorded_by', 'id']
    X = X.drop(columns=unusable_variance)
    
    # Drop duplicate columns
    duplicate_columns = ['quantity_group']
    X = X.drop(columns=duplicate_columns)
    
    # About 3% of the time, latitude has small values near zero,
    # outside Tanzania, so we'll treat these like null values
    X['latitude'] = X['latitude'].replace(-2e-08, np.nan)
    
    # When columns have zeros and shouldn't, they are like null values
    cols_with_zeros = ['construction_year', 'longitude', 'latitude', 'gps_height', 'population']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
        
#     # For categoricals with missing values, fill with the category 'MISSING'
#     categoricals = X.select_dtypes(exclude='number').columns
#     for col in categoricals:
#         X[col] = X[col].fillna('MISSING')
    
    return X

In [5]:
train = wrangle(train)
val = wrangle(val)
test = wrangle(test)

## Select features 

In [6]:
# The status_group column is the target
target = 'status_group'

# Get a dataframe with all train columns except the target
train_features = train.drop(columns=[target])

# Get a list of the numeric features
numeric_features = train_features.select_dtypes(include='number').columns.tolist()

# Get a series with the cardinality of the nonnumeric features
cardinality = train_features.select_dtypes(exclude='number').nunique()

# Get a list of all categorical features with cardinality <= 50
categorical_features = cardinality[cardinality <= 50].index.tolist()

# Combine the lists 
features = numeric_features + categorical_features

In [7]:
cardinality

funder                    1686
installer                 1933
wpt_name                 30667
basin                        9
subvillage               17297
region                      21
lga                        125
ward                      2084
public_meeting               2
scheme_management           11
scheme_name               2548
permit                       2
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_group                6
quantity                     5
source                      10
source_type                  7
source_class                 3
waterpoint_type              7
waterpoint_type_group        6
dtype: int64

In [8]:
one_hot_cat = ['public_meeting',
               'permit',
               'management_group',
               'payment_type',
               'quantity',
               'waterpoint_type_group']

In [9]:
# Arrange data into X features matrix and y target vector 
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]

In [10]:
X_train.shape

(47520, 33)

## Encode and impute missing values

In [11]:
# Encoder: fit_transform on train, transform on val & test
encoder = ce.OneHotEncoder(use_cat_names=True, cols=one_hot_cat)
X_train = encoder.fit_transform(X_train)
X_val = encoder.transform(X_val)
X_test = encoder.transform(X_test)

In [12]:
X_train.shape

(47520, 56)

In [13]:
# ordinal encoding 
X_train_encoded = ce.OrdinalEncoder().fit_transform(X_train)
X_val_encoded = ce.OrdinalEncoder().fit_transform(X_val)
X_test_encoded = ce.OrdinalEncoder().fit_transform(X_test)

In [14]:
# Iterative imputation
imputer = IterativeImputer(n_iter=5, sample_posterior=True)
column_names = X_train_encoded.columns.values.tolist()

X_train_filled = pd.DataFrame(imputer.fit_transform(X_train_encoded), columns=column_names)
X_val_filled = pd.DataFrame(imputer.fit_transform(X_val_encoded), columns=column_names)
X_test_filled = pd.DataFrame(imputer.fit_transform(X_test_encoded), columns=column_names)

In [15]:
X_train_filled.head(1)

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year,year_recorded,...,source,source_type,source_class,waterpoint_type,waterpoint_type_group_communal standpipe,waterpoint_type_group_other,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_cattle trough,waterpoint_type_group_dam
0,0.0,-3595.55,37.22,-6.53,0.0,5.0,1.0,413.0,2005.0,2011.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


## Fit model, make predictions, evaluate

In [16]:
# fit model on training data
model = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=100,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=1,
 colsample_bytree=0.8,
 booster='dart',
 n_jobs=-1,
 nthread=4,
 scale_pos_weight=1,
 seed=27)
model.fit(X_train_filled, y_train)

XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=4, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=1)

In [17]:
# make predictions for test data
preds = model.predict(X_val_filled)

In [18]:
# evaluate predictions
accuracy = accuracy_score(y_val, preds)
print(f'Accuracy Score: {accuracy}')

Accuracy Score: 0.7094276094276094


In [19]:
model2 = RandomForestClassifier(n_estimators=800, max_depth=25)
model2.fit(X_train_filled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
preds2 = model2.predict(X_val_filled)

In [21]:
# evaluate predictions
accuracy = accuracy_score(y_val, preds2)
print(f'Accuracy Score: {accuracy}')

Accuracy Score: 0.7319023569023569
