In [2]:
import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeClassifier
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
train = pd.merge(pd.read_csv('https://drive.google.com/uc?export=download&id=14ULvX0uOgftTB2s97uS8lIx1nHGQIB0P'), 
                 pd.read_csv('https://drive.google.com/uc?export=download&id=1r441wLr7gKGHGLyPpKauvCuUOU556S2f'))

test = pd.read_csv('https://drive.google.com/uc?export=download&id=1wvsYl9hbRbZuIuoaLWCsW_kbcxCdocHz')
sample_submission = pd.read_csv('https://drive.google.com/uc?export=download&id=1kfJewnmhowpUo381oSn3XqsQ6Eto23XV')

train, val = train_test_split(train, train_size=0.80, test_size=0.20, 
                              stratify=train['status_group'], random_state=42)

In [11]:
def clean(df):
    df = df.copy()
    
    df['date_recorded'] = pd.to_datetime(df['date_recorded'], infer_datetime_format=True)
    
    df['year_recorded'] = df['date_recorded'].dt.year
    df['month_recorded'] = df['date_recorded'].dt.month
    df['day_recorded'] = df['date_recorded'].dt.day
    df = df.drop(columns='date_recorded')
    
    df['years'] = df['year_recorded'] - df['construction_year']
    
    duplicate_columns = ['quantity_group']
    df = df.drop(columns=duplicate_columns)
    
    df['latitude'] = df['latitude'].replace(-2e-08, np.nan)
    
    cols_with_zeros = ['construction_year', 'longitude', 'latitude', 'gps_height',
                      'population']
    for col in cols_with_zeros:
        df[col] = df[col].replace(0, np.nan)
        
    categoricals = df.select_dtypes(exclude='number').columns
    for col in categoricals:
        df[col] = df[col].fillna('MISSING')
        
    return df

In [12]:
train = clean(train)
val = clean(val)
test = clean(test)

In [13]:
target = 'status_group'
train_features = train.drop(columns=[target])
numeric_features = train_features.select_dtypes(include='number').columns.tolist()
cardinality = train_features.select_dtypes(exclude='number').nunique()
categorical_features = cardinality[cardinality <= 150].index.tolist()
features = numeric_features + categorical_features

In [14]:
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]

pipeline = make_pipeline(ce.OneHotEncoder(use_cat_names=True),
                        SimpleImputer(strategy='median'),
                        DecisionTreeClassifier(max_depth=20,
                                              random_state=42))

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('onehotencoder', OneHotEncoder(cols=['basin', 'region', 'lga', 'public_meeting', 'recorded_by', 'scheme_management', 'permit', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', '...        min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'))])

In [17]:
encoder = pipeline.named_steps['onehotencoder']
tree = pipeline.named_steps['decisiontreeclassifier']
feature_names = encoder.transform(X_val).columns
importances = pd.Series(tree.feature_importances_, feature_names)

In [21]:
from sklearn.ensemble import RandomForestClassifier

pipeline = make_pipeline(ce.OneHotEncoder(use_cat_names=True),
                        SimpleImputer(strategy='median'),
                        RandomForestClassifier(n_estimators=100,
                                              n_jobs=-1))

pipeline.fit(X_train, y_train)
print('validation accuracy', pipeline.score(X_val, y_val))

validation accuracy 0.8094276094276094


In [19]:
y_pred = pipeline.predict(X_test)
submission = sample_submission.copy()
submission['status_group'] = y_pred 
submission.to_csv('submission-32.csv', index=False)