In [11]:
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, Imputer, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import numpy as np

# Widen pandas' display limits so large frames print in full, and show
# floats with two decimals.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(_option, _value)

In [39]:
# Training features joined to their labels (default merge: inner join on the
# columns the two files have in common).
train = pd.read_csv(
    'https://raw.githubusercontent.com/WillHK/DS-Unit-2-Classification-1/master/module1-logistic-regression/train_features.csv'
).merge(
    pd.read_csv('https://raw.githubusercontent.com/WillHK/DS-Unit-2-Classification-1/master/module1-logistic-regression/train_labels.csv')
)

# Unlabeled features to predict on
test = pd.read_csv(
    'https://raw.githubusercontent.com/WillHK/DS-Unit-2-Classification-1/master/module1-logistic-regression/test_features.csv'
)

In [48]:
# Template file that the submission is built from later in the notebook
sample_submission_url = 'https://raw.githubusercontent.com/WillHK/DS-Unit-2-Classification-1/master/module1-logistic-regression/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_url)

In [20]:
# (rows, columns) of the merged training frame and of the test frame
train.shape, test.shape

((59400, 41), (14358, 40))

In [40]:
# Hold out 20% of the labeled rows for validation, keeping the class
# proportions of status_group the same in both parts.
# NOTE: this rebinds `train`, so the cell is not safe to run twice.
train, val = train_test_split(
    train,
    train_size=0.80,
    test_size=0.20,
    stratify=train['status_group'],
    random_state=42,
)


def wrangle(X):
    """Clean and feature-engineer a waterpoint dataframe.

    The same steps are applied to the train, validation, and test sets so
    that all three come out with the same engineered columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``date_recorded``, ``construction_year``, ``recorded_by``,
        ``id``, ``latitude``, ``longitude``, ``gps_height``, ``population``
        and the duplicate columns listed in the body.

    Returns
    -------
    pandas.DataFrame
        A new dataframe; the input is not modified.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    X = X.copy()

    # Split date_recorded into numeric components, then drop the original
    recorded = pd.to_datetime(X['date_recorded'])
    X['year_recorded'] = recorded.dt.year
    X['month_recorded'] = recorded.dt.month
    X['day_recorded'] = recorded.dt.day
    X = X.drop(columns='date_recorded')

    # Drop recorded_by (never varies) and id (always varies, random)
    X = X.drop(columns=['recorded_by', 'id'])

    # Drop duplicate columns
    duplicate_columns = ['quantity_group', 'management', 'scheme_management',
                         'extraction_type', 'extraction_type_group',
                         'waterpoint_type']
    X = X.drop(columns=duplicate_columns)

    # About 3% of the time, latitude has small values near zero,
    # outside Tanzania, so we'll treat these like null values
    X['latitude'] = X['latitude'].replace(-2e-08, np.nan)

    # When columns have zeros and shouldn't, they are like null values
    cols_with_zeros = ['construction_year', 'longitude', 'latitude',
                       'gps_height', 'population']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)

    # Engineer feature: years from construction_year to date_recorded.
    # This must come AFTER the zero replacement above: a construction_year
    # of 0 means "unknown", and subtracting it directly would yield a bogus
    # age of ~2000 years instead of a missing value.
    X['years'] = X['year_recorded'] - X['construction_year']

    # For non-numeric columns with missing values, fill with 'MISSING'
    categoricals = X.select_dtypes(exclude='number').columns
    for col in categoricals:
        X[col] = X[col].fillna('MISSING')

    return X


# Run the identical cleaning steps over all three splits
train, val, test = (wrangle(frame) for frame in (train, val, test))

In [52]:
# Name of the label column
target = 'status_group'

# Training features: every train column except the label
train_features = train.drop(columns=target)

# Names of the numeric features
numeric_features = list(train_features.select_dtypes(include='number').columns)

# Number of distinct values in each non-numeric feature
cardinality = train_features.select_dtypes(exclude='number').nunique()

# Split the non-numeric features at a cardinality of 50:
# at most 50 distinct values -> categorical_features, more -> cardinal_features
categorical_features = list(cardinality.loc[cardinality <= 50].index)
cardinal_features = list(cardinality.loc[cardinality > 50].index)

# Numeric plus low-cardinality features (the high-cardinality ones are left out)
features = [*numeric_features, *categorical_features]

In [42]:
# Distinct-value counts of the non-numeric features, largest first
cardinality.sort_values(ascending=False)

wpt_name                 30661
subvillage               17232
scheme_name               2564
ward                      2082
installer                 1930
funder                    1717
lga                        124
region                      21
source                      10
basin                        9
water_quality                8
source_type                  7
payment_type                 7
payment                      7
extraction_type_class        7
waterpoint_type_group        6
quality_group                6
management_group             5
quantity                     5
public_meeting               3
source_class                 3
permit                       3
dtype: int64

In [46]:
# Feature matrices (X_*) and label vectors (y_*) for each split;
# the test split has no labels.
X_train, y_train = train_features, train[target]
X_val, y_val = val.drop(columns=target), val[target]
X_test = test

In [None]:
# Encode -> impute -> classify.
#
# The column lists are passed with `cols=`: the first positional parameter of
# the category_encoders encoders is `verbose`, so a positional list leaves
# `cols` at None and the one-hot encoder then targets every string column,
# including the ones with thousands of distinct values.
#
# random_state is fixed so the validation score is reproducible across runs.
pipeline = make_pipeline(
    ce.OneHotEncoder(cols=categorical_features),
    ce.OrdinalEncoder(cols=cardinal_features),
    SimpleImputer(strategy='median'),
    RandomForestClassifier(n_estimators=800, max_depth=40, n_jobs=-1,
                           random_state=42),
)
pipeline.fit(X_train, y_train)

# Mean accuracy on the held-out validation split
print(pipeline.score(X_val, y_val))

In [49]:
# Predict a label for every row of the test set
y_pred = pipeline.predict(X_test)

# Build the submission from the sample file, replacing its status_group
# column with the predictions, and save it as CSV.
# NOTE(review): assumes the test rows are in the same order as
# sample_submission — confirm.
submission = sample_submission.assign(status_group=y_pred)
submission.to_csv('submission-07.csv', index=False)