In [1]:
import category_encoders as ce
from fancyimpute import IterativeImputer, KNN
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
pd.set_option('display.float_format', '{:.2f}'.format)

Using TensorFlow backend.


In [5]:
import numpy as np

In [2]:
# Merge train_features.csv & train_labels.csv
train = pd.merge(pd.read_csv('https://drive.google.com/uc?export=download&id=14ULvX0uOgftTB2s97uS8lIx1nHGQIB0P'), 
                 pd.read_csv('https://drive.google.com/uc?export=download&id=1r441wLr7gKGHGLyPpKauvCuUOU556S2f'))

# Read test_features.csv & sample_submission.csv
test = pd.read_csv('https://drive.google.com/uc?export=download&id=1wvsYl9hbRbZuIuoaLWCsW_kbcxCdocHz')
sample_submission = pd.read_csv('https://drive.google.com/uc?export=download&id=1kfJewnmhowpUo381oSn3XqsQ6Eto23XV')

# Split train into train & val
train, val = train_test_split(train, train_size=0.80, test_size=0.20, 
                              stratify=train['status_group'], random_state=42)

train.shape, val.shape, test.shape, sample_submission.shape

((47520, 41), (11880, 41), (14358, 40), (14358, 2))

## Data wrangle function

In [3]:
def wrangle(X):
    """Wrangles train, validate, and test sets in the same way"""
    X = X.copy()

    # Convert date_recorded to datetime
    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    
    # Extract components from date_recorded, then drop the original column
    X['year_recorded'] = X['date_recorded'].dt.year
    X['month_recorded'] = X['date_recorded'].dt.month
    X['day_recorded'] = X['date_recorded'].dt.day
    X = X.drop(columns='date_recorded')
    
    # Engineer feature: how many years from construction_year to date_recorded
    X['years'] = X['year_recorded'] - X['construction_year']    
    
    # Drop recorded_by (never varies) and id (always varies, random)
    X = X.drop(columns=['recorded_by', 'id'])
    
    # Drop duplicate columns
    duplicate_columns = ['quantity_group']
    X = X.drop(columns=duplicate_columns)
    
    # About 3% of the time, latitude has small values near zero,
    # outside Tanzania, so we'll treat these like null values
    X['latitude'] = X['latitude'].replace(-2e-08, np.nan)
    
    # When columns have zeros and shouldn't, they are like null values
    cols_with_zeros = ['construction_year', 'longitude', 'latitude', 'gps_height', 'population']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
        
    # For categoricals with missing values, fill with the category 'MISSING'
#     categoricals = X.select_dtypes(exclude='number').columns
#     for col in categoricals:
#         X[col] = X[col].fillna('MISSING')
    
    return X

In [6]:
train = wrangle(train)
val = wrangle(val)
test = wrangle(test)

## Select data and arrange data into features matrix and target vector

In [7]:
# The status_group column is the target
target = 'status_group'

# Get a dataframe with all train columns except the target
train_features = train.drop(columns=[target])

# Get a list of the numeric features
numeric_features = train_features.select_dtypes(include='number').columns.tolist()

# Get a series with the cardinality of the nonnumeric features
cardinality = train_features.select_dtypes(exclude='number').nunique()

# Get a list of all categorical features with cardinality <= 50
categorical_features = cardinality[cardinality <= 50].index.tolist()

# Combine the lists 
features = numeric_features + categorical_features

In [8]:
# Arrange data into X features matrix and y target vector 
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]

In [8]:
pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    IterativeImputer(),
    RandomForestClassifier(n_estimators=800, max_depth=25)
)

# Fit on train, score on val
pipeline.fit(X_train, y_train)
print(f'Validation Score: {pipeline.score(X_val, y_val)}')

Validation Score: 0.8122053872053872


In [None]:
# this is taking so long to run. did I do something wrong or is my computer just lagging still?
knn_pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    KNN(),
    RandomForestClassifier(n_estimators=800, max_depth=25)
)

# Fit on train, score on val
knn_pipeline.fit(X_train, y_train)
print(f'Validation Score: {knn_pipeline.score(X_val, y_val)}')