In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from scipy.stats import randint
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from xgboost import XGBClassifier, XGBRFClassifier
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.utils import shuffle

In [2]:
test_features = pd.read_csv('test_features.csv')
train_features = pd.read_csv('train_features.csv')
train_labels = pd.read_csv('train_labels.csv')
sample_submission = pd.read_csv('sample_submission.csv')


In [3]:
train_features['date_recorded'] = pd.to_datetime(train_features['date_recorded'], infer_datetime_format=True)
test_features['date_recorded'] = pd.to_datetime(test_features['date_recorded'], infer_datetime_format=True)
train_features['water_per_pop'] = train_features['amount_tsh']/train_features['population']
test_features['water_per_pop'] = test_features['amount_tsh']/test_features['population']  
train_features['water_per_pop'] = train_features['water_per_pop'].replace([np.inf, -np.inf], np.nan)
test_features['water_per_pop'] = test_features['water_per_pop'].replace([np.inf, -np.inf], np.nan)

In [4]:
def feature_engineering(X):
    X['construction_year'] = X['construction_year'].replace(np.nan, 2000)
    X['construction'] = X['construction_year'] != 0
    X['water_per_pop'] = X['water_per_pop'].replace(np.nan, 0)
    return X
train_features = feature_engineering(train_features)
test_features = feature_engineering(test_features)


In [5]:
def feature_engineering2(X):   
    X['year'] = X['date_recorded'].dt.year
    X['month'] = X['date_recorded'].dt.month
    X['week'] = X['date_recorded'].dt.week
    X['age'] = X['year'] -X['construction_year']
    X['age'].loc[X['age'] == X['year']] = 0
    X['date_recorded'] = X['date_recorded'].astype(str)
    return X

train_features = feature_engineering2(train_features)
test_features = feature_engineering2(test_features)

In [6]:
def fill_nan(X):
    X['public_meeting'] = X['public_meeting'].fillna(lambda x: random.choice(X[X['public_meeting'] != np.nan])['public_meeting'])
    X['permit'] = X['permit'].fillna(lambda x: random.choice(X[X['permit'] != np.nan])['permit'])
    X['age'] = X['age'].replace(0, round(X['age'].mean()))
    X['gps_height'] = X['gps_height'].replace(0, round(X['gps_height'].median()))
    X['source_type'] = X['source_type'].fillna(lambda x: random.choice(X[X['source_type'] != 'other'])['source_type'])
    X['scheme_management'] = X['scheme_management'].replace(np.nan,0)
    X['scheme_management'] = X['scheme_management'].replace(0,'Unknown')
    X['funder'] = X['funder'].fillna(lambda x: random.choice(X[X['funder'] != np.nan])['funder'])
    X['installer'].replace(np.nan,0, inplace= True)
    X['installer'] = X['installer'].replace('-', "Other")
    X['installer'] = X['installer'].replace(0, "Other")
    X['date_recorded'] = X['date_recorded'].astype(str)
    X['construction']= X['construction'].astype('str')
    X['population']= X['population'].astype('float64')      
    X['gps_height']= X['gps_height'].astype('float64')
    return X
train_features = fill_nan(train_features)
test_features =fill_nan(test_features)

In [7]:
def clean_col(X):
    organize= {'india mark ii': 'india',
    'india mark iii': 'india',
    'other - swn 81': 'swn',
    'swn 80': 'swn',
    'other - play pump': 'other handpump',
    'walimi': 'other handpump',
    'other - mkulima/shinyanga' : 'other handpump',
    'cemo': 'other motorpump',
    'climax': 'other motorpump'}
    X['extraction_type']= X['extraction_type'].replace(organize)

    
    return X
train_features = clean_col(train_features)
test_features = clean_col(test_features)

In [8]:
def drop_col(X):
    drop_cols = ['wpt_name',    # too many levels 
    'subvillage',  # too many levels; we have lat and long for location
    'ward',        # too many levels; we have lat and long for location
    'recorded_by', # constant
    'scheme_name', # too many levels
    'num_private', # irrelevant
    'region_code', # too many levels; we have lat and long for location
    'quantity_group', #same as quantity column
    'source_type',   #same as source but with fewer levels
    'waterpoint_type_group', #same as waterpoint
    'payment_type']          #same as payment
    X = X.drop(columns= drop_cols)
    return X
train_features = drop_col(train_features)
test_features = drop_col(test_features)
train_labels = train_labels.drop(columns='id')


In [9]:
train_features['installer'].isna().sum()

0

In [10]:
train_features.dtypes

id                         int64
amount_tsh               float64
date_recorded             object
funder                    object
gps_height               float64
installer                 object
longitude                float64
latitude                 float64
basin                     object
region                    object
district_code              int64
lga                       object
population               float64
public_meeting            object
scheme_management         object
permit                    object
construction_year          int64
extraction_type           object
extraction_type_group     object
extraction_type_class     object
management                object
management_group          object
payment                   object
water_quality             object
quality_group             object
quantity                  object
source                    object
source_class              object
waterpoint_type           object
water_per_pop            float64
constructi

In [11]:
cat_cols=train_features.select_dtypes(include=['object']).columns.values.tolist()
cat_cols

['date_recorded',
 'funder',
 'installer',
 'basin',
 'region',
 'lga',
 'public_meeting',
 'scheme_management',
 'permit',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'water_quality',
 'quality_group',
 'quantity',
 'source',
 'source_class',
 'waterpoint_type',
 'construction']

In [12]:
X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, random_state=42, test_size=.2)
X_test= test_features

In [13]:
# Encode categorical features
encoder = ce.OrdinalEncoder()

# Fit & Transform
X_train = encoder.fit_transform(X_train)

# Scale - Only the continuous. I found scaling all features made prediction worse

continuous_col = ['population','gps_height','week','month','year','age','longitude','latitude'] 

scaled = MinMaxScaler()
X_train[continuous_col] = scaled.fit_transform(X_train[continuous_col])


In [14]:
 #Fit X_val
X_val = encoder.transform(X_val)

# Partial scale x_val
X_val[continuous_col] = scaled.fit_transform(X_val[continuous_col])

In [15]:
# Define parameters for the model

    


# Define the model and input inside RSCV
model = 
# RSCV
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    scoring='accuracy',
    n_iter=9,
    n_jobs=-1,
    cv=5,
    verbose=5,
    return_train_score=True,
    random_state=42
)

# Fit to training data
search.fit(X_train, y_train)
print('Training Accuracy Score:', search.best_score_) 

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  8.5min remaining: 12.8min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  8.5min finished
Training Accuracy Score: 0.8039772727272727


In [16]:
best = search.best_estimator_
y_pred = best.predict(X_val)
print('Validation Set Accuracy Score:', accuracy_score(y_val, y_pred))

Validation Set Accuracy Score: 0.8004208754208754


In [17]:
best = search.best_estimator_
X_test = encoder.transform(X_test)

# Partial scale x_test
X_test[continuous_col] = scaled.fit_transform(X_test[continuous_col])
y_pred_test = best.predict(X_test)

In [18]:
submission = sample_submission.copy()
submission['status_group'] = y_pred_test
submission.to_csv('henry.csv', index=False)