In [1]:
%matplotlib inline
from fancyimpute import IterativeImputer
import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
pd.set_option('display.float_format', '{:.2f}'.format)

# Read train_feature.csv, train_labels.csv, test_features.csv, sample_submission.csv
train_features = pd.read_csv('https://drive.google.com/uc?export=download&id=14ULvX0uOgftTB2s97uS8lIx1nHGQIB0P')
train_labels = pd.read_csv('https://drive.google.com/uc?export=download&id=1r441wLr7gKGHGLyPpKauvCuUOU556S2f')
test_features = pd.read_csv('https://drive.google.com/uc?export=download&id=1wvsYl9hbRbZuIuoaLWCsW_kbcxCdocHz')
sample_submission = pd.read_csv('https://drive.google.com/uc?export=download&id=1kfJewnmhowpUo381oSn3XqsQ6Eto23XV')

# Print dataframe shapes
print('train_features', train_features.shape)
print('train_labels', train_labels.shape)
print('test_features', test_features.shape)
print('sample_submission', sample_submission.shape)

Using TensorFlow backend.


train_features (59400, 40)
train_labels (59400, 2)
test_features (14358, 40)
sample_submission (14358, 2)


In [2]:
# train/test split
X_train, X_val, y_train, y_val = train_test_split(
    train_features, train_labels['status_group'],
    train_size=0.8, test_size=0.2, 
    stratify=train_labels['status_group'], 
    random_state=42)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((47520, 40), (11880, 40), (47520,), (11880,))

## Clean data and impute missing values

In [3]:
train_features.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.13,317.65,668.3,34.08,-5.71,0.47,15.3,5.63,179.91,1300.65
std,21453.13,2997.57,693.12,6.57,2.95,12.24,17.59,9.63,471.48,951.62
min,0.0,0.0,-90.0,0.0,-11.65,0.0,1.0,0.0,0.0,0.0
25%,18519.75,0.0,0.0,33.09,-8.54,0.0,5.0,2.0,0.0,0.0
50%,37061.5,0.0,369.0,34.91,-5.02,0.0,12.0,3.0,25.0,1986.0
75%,55656.5,20.0,1319.25,37.18,-3.33,0.0,17.0,5.0,215.0,2004.0
max,74247.0,350000.0,2770.0,40.35,-0.0,1776.0,99.0,80.0,30500.0,2013.0


In [4]:
train_features['latitude'].max()

-2e-08

In [5]:
train_features['longitude'].min()

0.0

In [6]:
# modified wrangle() so that it doesn't fill NaN values
def wrangle(X):
    """Wrangles train, validate, and test sets in the same way"""
    X = X.copy()
    
    # About 3% of the time, latitude has small values near zero,
    # outside Tanzania, so we'll treat these values like zero.
    X['latitude'] = X['latitude'].replace(-2e-08, 0)
    
    # When columns have zeros and shouldn't, they are like null values.
    # So we'll make them NaN values and impute them later.
    cols_with_zeros = ['construction_year', 'longitude', 'latitude']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
        
    # Convert date_recorded to datetime
    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    
    # Extract year from date_recorded
    X['year_recorded'] = X['date_recorded'].dt.year
    
    # quantity & quantity_group are duplicates, so drop one
    X = X.drop(columns=['quantity_group','date_recorded'])
    
    # for categoricals with missing values, fill with the category 'MISSING'
    categoricals = X.select_dtypes(exclude='number').columns
    for col in categoricals:
        X[col] = X[col].fillna('MISSING')
    
    return X

In [7]:
# apply the wrangle function
X_train = wrangle(X_train)
X_val = wrangle(X_val)
test_features = wrangle(test_features)

## Select features

In [8]:
# get dataframe with all train columns except id and date_recorded
X_train = X_train.drop(columns='id')

# get a list of numeric features
numeric_features = X_train.select_dtypes(include='number').columns.tolist()

# get a series with the cardinality of the nonnumeric features
cardinality = X_train.select_dtypes(exclude='number').nunique()

# get list of categorical features with cardinality < 50
categorical_features = cardinality[cardinality <= 50].index.tolist()

# combine the lists
features = numeric_features + categorical_features

In [9]:
X_train = X_train[features]
X_val = X_val[features]
test_features = test_features[features]

## One-hot encoding

In [10]:
# encoder: fit transform on train, transform on val and test
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train_encoded = encoder.fit_transform(X_train)
X_val_encoded = encoder.transform(X_val)
test_features_encoded = encoder.transform(test_features)

In [12]:
# impute missing values with IterativeImputer
encoded_columns = X_train_encoded.columns.tolist()
X_train = pd.DataFrame(IterativeImputer().fit_transform(X_train_encoded), columns=encoded_columns)
X_val = pd.DataFrame(IterativeImputer().fit_transform(X_val_encoded), columns=encoded_columns)
test_features = pd.DataFrame(IterativeImputer().fit_transform(test_features_encoded), columns=encoded_columns)

## Models

In [13]:
from sklearn.tree import DecisionTreeClassifier

In [15]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
print('Decision Tree')
print('---'*10)
print(f'Train Accuracy: {dt_model.score(X_train, y_train)}')
print(f'Validation Accuracy: {dt_model.score(X_val, y_val)}')

Decision Tree
------------------------------
Train Accuracy: 0.9959595959595959
Validation Accuracy: 0.731986531986532


In [16]:
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

In [17]:
# so far this seems like my best result
model = RandomForestClassifier()
model.fit(X_train, y_train)
print('Validation Accuracy', model.score(X_val, y_val))



Validation Accuracy 0.7963804713804714


In [18]:
# # Write submission csv file
# submission = sample_submission.copy()
# submission['status_group'] = model.predict(test_features)
# submission.to_csv('submission-11.csv', index=False)

In [None]:
# lets try hyperparameter tuning on Random Forest
# then try XG Boost

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

In [22]:
# what about gradient boosting, n_estimators=1000
# I guess I'll take a walk
gb_model = GradientBoostingClassifier(n_estimators=1000)
gb_model.fit(X_train, y_train)
print('Validation Accuracy', gb_model.score(X_val, y_val))

Validation Accuracy 0.7851851851851852


In [23]:
gb_model = GradientBoostingClassifier(n_estimators=10000)
gb_model.fit(X_train, y_train)
print('Validation Accuracy', gb_model.score(X_val, y_val))

Validation Accuracy 0.7854377104377105


In [32]:
# so far this seems like my best result
model = RandomForestClassifier(n_estimators=1000, max_depth=25)
model.fit(X_train, y_train)
print('Validation Accuracy', model.score(X_val, y_val))

Validation Accuracy 0.8071548821548822


In [33]:
# # Write submission csv file
# submission = sample_submission.copy()
# submission['status_group'] = model.predict(test_features)
# submission.to_csv('submission-30.csv', index=False)

In [47]:
# so far this seems like my best result
model = RandomForestClassifier(n_estimators=1000, criterion='entropy')
model.fit(X_train, y_train)
print('Validation Accuracy', model.score(X_val, y_val))

Validation Accuracy 0.806986531986532


In [46]:
# # Write submission csv file
# submission = sample_submission.copy()
# submission['status_group'] = model.predict(test_features)
# submission.to_csv('submission-32.csv', index=False)