In [89]:
%matplotlib inline
import warnings
import kaggle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.exceptions import DataConversionWarning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

In [22]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)

In [50]:
# Import data
test_features = pd.read_csv('test_features.csv')
train_features = pd.read_csv('train_features.csv')
train_labels = pd.read_csv('train_labels.csv')

# assign to train, test, and submission
X_train = train_features.drop(columns='id')
X_test = test_features.drop(columns='id')
y_train = train_labels.drop(columns='id')
submission = test_features[['id']]

X_train.shape, X_test.shape, y_train.shape

((59400, 39), (14358, 39), (59400, 1))

In [5]:
def submission_csv(y_pred, file_name):
    """
    Function to create csv file to be submitted to Kaggle.com
    
    Parameters
    ----------
    y_pred: a 1-D array of model predictions
    file_name: name for new csv file as a string
    
    Returns
    -------
    A csv file in the current directory with a column for label id and predicted labels.
    """
    y_pred = y_pred.copy()
    submission['status_group'] = y_pred
    submission.to_csv(file_name, index=False)

In [None]:
# Example submission code
# submit predictions to kaggle
#!kaggle competitions submit -c ds3-predictive-modeling-challenge -f kaggle-submission-004.csv -m "Kitchen sink model with hyper-parameter

### Data Cleaning

In [51]:
numeric = X_train.select_dtypes(include='number').columns.tolist()
categorical = X_train.select_dtypes(exclude='number').columns.tolist()

print(numeric)
print('\n', categorical)

['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'region_code', 'district_code', 'population', 'construction_year']

 ['date_recorded', 'funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region', 'lga', 'ward', 'public_meeting', 'recorded_by', 'scheme_management', 'scheme_name', 'permit', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group']


In [53]:
X_train['installer'].value_counts()

DWE                               17402
Government                         1825
RWE                                1206
Commu                              1060
DANIDA                             1050
KKKT                                898
Hesawa                              840
0                                   777
TCRS                                707
Central government                  622
CES                                 610
Community                           553
DANID                               552
District Council                    551
HESAWA                              539
LGA                                 408
World vision                        408
WEDECO                              397
TASAF                               396
District council                    392
Gover                               383
AMREF                               329
TWESA                               316
WU                                  301
Dmdd                                287


In [70]:
def wrangle(X):
    # Create copy of dataframe to avoid copy warning
    X = X.copy()
    
    # Convert date to datetime
    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    
    # Extract datetime data
    X['year_recorded'] = X['date_recorded'].dt.year
    X['month_recorded'] = X['date_recorded'].dt.month
    X['day_recorded'] = X['date_recorded'].dt.day
    X['day_of_week_recorded'] = X['date_recorded'].dt.weekday_name
    
    # Remove datetime column to avoid error when fitting data to model
    X = X.drop(columns='date_recorded')
    
    # Drop duplicate or unnecessary features
    X = X.drop(columns=['recorded_by', 'quantity_group'])
    
    # Several categorical features have values showing as '0'
    # Replace '0' with 'other'
    categoricals = X.select_dtypes(exclude='number').columns.tolist()
    X[categoricals] = X[categoricals].replace('0', 'other')
    
    # Convert geographical codes from numeric to categorical
    X['region_code'] = X['region_code'].astype('category')
    X['district_code'] = X['district_code'].astype('category')
    
    # Binning high cardinality features
    # Any feature values with fewer than 100 rows gets turned into nan
    categoricals = X.select_dtypes(exclude='number').columns.tolist()
    for feature in X[categoricals]:
        to_keep = X[feature].value_counts()[X[feature].value_counts() > 100].index.tolist()
        feature_copy = X[feature].copy()
        feature_copy[~feature_copy.isin(to_keep)] = np.nan
        X[feature] = feature_copy
        
    # Replace all nan values with 'other'
    X = X.replace(np.nan, 'other')
    
    return X

In [71]:
X_train_clean = wrangle(X_train)
X_test_clean = wrangle(X_test)

X_train.shape, X_test.shape

((59400, 39), (14358, 39))

### Feature engineering

In [78]:
def feature_eng(X):
    X = X.copy()
    
    # Engineered features from data exploration and cleaning notebook
    X['construction_year_missing'] = X['construction_year'] == 0
    X['payment_equal_to_type'] = X['payment_type'] == X['payment']
    X['water_quality_equal_to_group'] = X['water_quality'] == X['quality_group']
    X['abandoned_well'] = (X['water_quality'] == 'salty abandoned') | (X['water_quality'] == 'fluoride abandoned')
    X['amount_tsh_zero'] =  X['amount_tsh'] <= 0
    X['amount_tsh_above_3000'] = X['amount_tsh'] > 3000
    X['water_quality_good_and_seasonal'] = (X['quality_group'] == 'good') & (X['quantity'] == 'seasonal')
    X['water_quality_good_and_dry'] = (X['quality_group'] == 'good') & (X['quantity'] == 'dry')
    X['water_quality_good_and_insufficient'] = (X['quality_group'] == 'good') & (X['quantity'] == 'insufficient')
    X['water_quality_good_and_enough'] = (X['quality_group'] == 'good') & (X['quantity'] == 'enough')
    X['water_quality_good'] =  X['quality_group'] == 'good'
    
    #Since some years show as 0, I'll have to manually clean up the new features
    X['age'] = 2019 - X['construction_year']
    X['age'] = X['age'].replace(2019, 0)
    X['years_since_inspection'] = 2019 - X['year_recorded']
    X['years_since_inspection'] = X['years_since_inspection'].replace(2019, 0)
    X['age_at_inspection'] = X['year_recorded'] - X['construction_year']
    X['age_at_inspection'] = X['age_at_inspection'].replace({2011: 0, 2012: 0, 2013: 0, 2004: 0, 2002: 0, 
                                                         -1: 0, -2: 0, -3: 0, -4: 0, -5: 0, -7: 0})
    
    # Dealing with longitude and latitude
    mean_lat = X['latitude'].mean()
    mean_long = X['longitude'].mean()
    X['distance_2d'] = np.sqrt((X['longitude'] - mean_long)**2 + (X['latitude'] - mean_lat)**2)
    X['distance_3d'] = np.sqrt((X['gps_height']**2 + X['longitude'] - mean_long)**2 + (X['latitude'] - mean_lat)**2)
    
    return X

In [79]:
X_train_eng = feature_eng(X_train_clean)
X_test_eng = feature_eng(X_test_clean)

X_train_eng.shape, X_test_eng.shape

((59400, 56), (14358, 56))

### Feature scaling and encoding

In [86]:
scaler = RobustScaler()
encoder = ce.OrdinalEncoder()

X_train_encoded = encoder.fit_transform(X_train_eng)
X_test_encoded = encoder.transform(X_test_eng)
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

### Fit data to model from Day 1 notebook

In [None]:
# Set parameters
param_dist = {
    'n_estimators': randint(50, 300),
    'max_features': randint(2, 56)
}

# Create randomized search
search = RandomizedSearchCV(
    estimator=RandomForestClassifier(max_depth=None, n_jobs=-1, random_state=42),
    param_distributions=param_dist,
    n_iter=100,
    scoring='accuracy',
    n_jobs=-1,
    cv=10,
    verbose=10,
    return_train_score=True,
    random_state=42
)

# fit the model
search.fit(X_train_scaled, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 20.4min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 25.6min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 34.6min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 40.8min
