In [1]:
%matplotlib inline
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.exceptions import DataConversionWarning
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [3]:
# Import data
test_features = pd.read_csv('test_features.csv')
train_features = pd.read_csv('train_features.csv')
train_labels = pd.read_csv('train_labels.csv')

# assign to train, test, and submission
X_train = train_features.drop(columns='id')
X_test = test_features.drop(columns='id')
y_train = train_labels.drop(columns='id')
submission = test_features[['id']]

X_train.shape, X_test.shape, y_train.shape

((59400, 39), (14358, 39), (59400, 1))

In [4]:
# Split training data into train, validate. Make the validation set the same shape of the test set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=len(X_test), stratify=y_train, random_state=42)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((45042, 39), (14358, 39), (45042, 1), (14358, 1))

### Data cleaning, feature engineering, and categorical encoding

In [5]:
def wrangle(X):
    # Create copy of dataframe to avoid copy warning
    X = X.copy()
    
    # Some features have missing data showing as 0 that need to be changed to nan
    X['gps_height'] = X['gps_height'].replace(0.0, np.nan)
    X['longitude'] = X['longitude'].replace(0.0, np.nan)
    X['latitude'] = X['latitude'].replace(0.0, np.nan)
    X['construction_year'] = X['construction_year'].replace(0.0, np.nan)
    X['population'] = X['population'].replace(0.0, np.nan)
    X['amount_tsh'] = X['amount_tsh'].replace(0.0, np.nan)
    
    # gps_height: replace nan values with the mean of the smallest geographical region possible
    # Excluding subvillage due to missing values
    X['gps_height'].fillna(X.groupby(['ward'])['gps_height'].transform('mean'), inplace=True)
    X['gps_height'].fillna(X.groupby(['district_code'])['gps_height'].transform('mean'), inplace=True)
    X['gps_height'].fillna(X.groupby(['region_code'])['gps_height'].transform('mean'), inplace=True)
    X['gps_height'].fillna(X['gps_height'].mean(), inplace=True)
    
    # longitude: replace nan values with the mean of the smallest geographical region possible
    # Excluding subvillage due to missing values
    X['longitude'].fillna(X.groupby(['ward'])['longitude'].transform('mean'), inplace=True)
    X['longitude'].fillna(X.groupby(['district_code'])['longitude'].transform('mean'), inplace=True)
    X['longitude'].fillna(X.groupby(['region_code'])['longitude'].transform('mean'), inplace=True)
    X['longitude'].fillna(X['longitude'].mean(), inplace=True)
    
    # latitude: replace nan values with the mean of the smallest geographical region possible
    # Excluding subvillage due to missing values
    X['latitude'].fillna(X.groupby(['ward'])['latitude'].transform('mean'), inplace=True)
    X['latitude'].fillna(X.groupby(['district_code'])['latitude'].transform('mean'), inplace=True)
    X['latitude'].fillna(X.groupby(['region_code'])['latitude'].transform('mean'), inplace=True)
    X['latitude'].fillna(X['latitude'].mean(), inplace=True)
    
    # population: replace nan values with the mean of the smallest geographical region possible
    # Excluding subvillage due to missing values
    X['population'].fillna(X.groupby(['ward'])['population'].transform('median'), inplace=True)
    X['population'].fillna(X.groupby(['district_code'])['population'].transform('median'), inplace=True)
    X['population'].fillna(X.groupby(['region_code'])['population'].transform('median'), inplace=True)
    X['population'].fillna(X['population'].median(), inplace=True)
    
    # population: replace nan values with the mean of the smallest geographical region possible
    # Excluding subvillage due to missing values
    X['amount_tsh'].fillna(X.groupby(['ward'])['amount_tsh'].transform('median'), inplace=True)
    X['amount_tsh'].fillna(X.groupby(['district_code'])['amount_tsh'].transform('median'), inplace=True)
    X['amount_tsh'].fillna(X.groupby(['region_code'])['amount_tsh'].transform('median'), inplace=True)
    X['amount_tsh'].fillna(X['amount_tsh'].median(), inplace=True)
    
    # construction_year: replace nan values with the mean of the smallest geographical region possible
    # Excluding subvillage due to missing values
    X['construction_year'].fillna(X.groupby(['ward'])['construction_year'].transform('median'), inplace=True)
    X['construction_year'].fillna(X.groupby(['district_code'])['construction_year'].transform('median'), inplace=True)
    X['construction_year'].fillna(X.groupby(['region_code'])['construction_year'].transform('median'), inplace=True)
    X['construction_year'].fillna(X['construction_year'].median(), inplace=True)
    
    # Convert date to datetime
    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    
    # Extract datetime data
    X['year_recorded'] = X['date_recorded'].dt.year
    
    # Drop duplicate or unnecessary features
    X = X.drop(columns=['recorded_by', 'quantity_group', 'date_recorded', 'wpt_name', 'num_private', 'subvillage',
                       'region_code', 'management_group', 'extraction_type_group', 'extraction_type_class',
                       'scheme_name', 'payment', 'water_quality', 'source_type', 'source_class', 'waterpoint_type_group', 
                        'public_meeting', 'permit'])
    
    # Several categorical features have values showing as '0'
    # Replace '0' with nan
    categoricals = X.select_dtypes(exclude='number').columns.tolist()
    X[categoricals] = X[categoricals].replace('0', np.nan)
    
    # Convert to lowercase to collapse duplicates
    X['waterpoint_type'] = X['waterpoint_type'].str.lower()
    X['funder'] = X['funder'].str.lower()
    X['basin'] = X['basin'].str.lower()
    X['region'] = X['region'].str.lower()
    X['source'] = X['source'].str.lower()
    X['lga'] = X['lga'].str.lower()
    X['management'] = X['management'].str.lower()
    X['quantity'] = X['quantity'].str.lower()
    X['quality_group'] = X['quality_group'].str.lower()
    X['payment_type'] = X['payment_type'].str.lower()
    X['extraction_type'] = X['extraction_type'].str.lower()
    
    # Replace nan values with 'other'
    X["funder"].fillna("other", inplace=True)
    X["scheme_management"].fillna("other", inplace=True)
    X["installer"].fillna("other", inplace=True)
    
    X = X.replace(np.nan, 'other')
    
    return X

In [6]:
def feature_eng(X):
    X = X.copy()
    
    # Engineered features from data exploration and cleaning notebook
    X['amount_tsh_above_1000'] = X['amount_tsh'] > 1000
    X['water_quality_good_and_seasonal'] = (X['quality_group'] == 'good') & (X['quantity'] == 'seasonal')
    X['water_quality_good_and_dry'] = (X['quality_group'] == 'good') & (X['quantity'] == 'dry')
    X['water_quality_good_and_insufficient'] = (X['quality_group'] == 'good') & (X['quantity'] == 'insufficient')
    X['water_quality_good_and_enough'] = (X['quality_group'] == 'good') & (X['quantity'] == 'enough')
    X['water_quality_good'] = X['quality_group'] == 'good'
    X['age'] = 2019 - X['construction_year']
    X['years_since_inspection'] = 2019 - X['year_recorded']
    X['x_coordinate'] = np.cos(X['latitude'] * np.cos(X['longitude']))
    X['y_coordinate'] = np.cos(X['latitude'] * np.sin(X['longitude']))
    X['z_coordinate'] = np.sin(X['latitude'])
    
    return X

In [7]:
def encode(X_train, X_val, X_test):
    X_train = X_train.copy()
    X_val = X_val.copy()
    X_test = X_test.copy()
    encoder = ce.OrdinalEncoder()
    X_train = encoder.fit_transform(X_train)
    X_val = encoder.transform(X_val)
    X_test = encoder.transform(X_test)
    
    return X_train, X_val, X_test

In [8]:
# run wrangle function
X_train = wrangle(X_train)
X_val = wrangle(X_val)
X_test = wrangle(X_test)

X_train.shape, X_val.shape, X_test.shape

((45042, 22), (14358, 22), (14358, 22))

In [9]:
# run feature_eng function
X_train = feature_eng(X_train)
X_val = feature_eng(X_val)
X_test = feature_eng(X_test)

X_train.shape, X_val.shape, X_test.shape

((45042, 33), (14358, 33), (14358, 33))

In [10]:
# run encode function
X_train, X_val, X_test = encode(X_train, X_val, X_test)

X_train.shape, X_val.shape, X_test.shape

((45042, 33), (14358, 33), (14358, 33))

### Helper functions

In [11]:
def submission_csv(y_pred, file_name):
    """
    Function to create csv file to be submitted to Kaggle.com
    
    Parameters
    ----------
    y_pred: a 1-D array of model predictions
    file_name: name for new csv file as a string
    
    Returns
    -------
    A csv file in the current directory with a column for label id and predicted labels.
    """
    y_pred = y_pred.copy()
    submission['status_group'] = y_pred
    submission.to_csv(file_name, index=False)

In [12]:
# Example submission code
# submit predictions to kaggle
#!kaggle competitions submit -c ds3-predictive-modeling-challenge -f kaggle-submission-004.csv -m "Kitchen sink model with hyper-parameter

### Run models

In [16]:
# RandomForestClassifier

# Set model params
model = RandomForestClassifier(criterion='entropy', max_features=4, n_estimators=251, n_jobs=-1, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Make Predictions
y_pred = model.predict(X_val)

# Check score
score = accuracy_score(y_val, y_pred)
score

0.8090959743696894

In [17]:
# Make predictions for kaggle
y_pred = model.predict(X_test)

# Make prediction csv file
submission_csv(y_pred, 'stacking-001.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [12]:
!kaggle competitions submit -c ds3-predictive-modeling-challenge -f stacking-001.csv -m "RFC with new features"

Successfully submitted to DS3 Predictive Modeling Challenge



  0%|          | 0.00/279k [00:00<?, ?B/s]
  3%|2         | 8.00k/279k [00:00<00:14, 19.2kB/s]
 32%|###1      | 88.0k/279k [00:00<00:07, 27.2kB/s]
 40%|####      | 112k/279k [00:00<00:04, 36.3kB/s] 
 77%|#######7  | 216k/279k [00:00<00:01, 51.0kB/s]
100%|##########| 279k/279k [00:04<00:00, 66.1kB/s]


In [12]:
# BaggingClassifier with 1000 DecisionTreeClassifier

# Set model parameters
tree = BaggingClassifier(n_estimators=1000, max_features=4, n_jobs=-3, random_state=42, verbose=10)

# Fit the model
tree.fit(X_train, y_train)

# Make predictions
y_pred = tree.predict(X_val)

# Check score
score = accuracy_score(y_val, y_pred)
score

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  1.4min finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   21.7s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   21.7s finished


0.7775456191670149

In [20]:
# Make predictions for kaggle
y_pred = model.predict(X_test)

# Make prediction csv file
submission_csv(y_pred, 'stacking-002.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [14]:
# BaggingClassifier with 2,000 DecisionTreeClassifier

# Set model parameters
tree = BaggingClassifier(n_estimators=2000, max_features=4, n_jobs=-3, random_state=42, verbose=10)

# Fit the model
tree.fit(X_train, y_train)

# Make predictions for kaggle
y_pred = tree.predict(X_test)

# Make prediction csv file
submission_csv(y_pred, 'stacking-003.csv')

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  3.0min remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  3.0min finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   53.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   53.0s finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [15]:
# BaggingClassifier with 3,000 DecisionTreeClassifier

# Set model parameters
tree = BaggingClassifier(n_estimators=3000, max_features=3, n_jobs=-3, random_state=42, verbose=3)

# Fit the model
tree.fit(X_train, y_train)

# Make predictions for kaggle
y_pred = tree.predict(X_test)

# Make prediction csv file
submission_csv(y_pred, 'stacking-004.csv')

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  3.8min remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  3.8min finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  1.2min finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [12]:
# BaggingClassifier with 2,000 DecisionTreeClassifier

# Set model parameters
tree = BaggingClassifier(n_estimators=2000, max_features=6, n_jobs=-3, random_state=42, verbose=10)

# Fit the model
tree.fit(X_train, y_train)

# Make predictions for kaggle
y_pred = tree.predict(X_test)

# Make prediction csv file
submission_csv(y_pred, 'stacking-005.csv')

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  4.5min remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  4.5min finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  1.2min finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [13]:
# RandomForestClassifier 1000 n_estimators

# Set model params
model = RandomForestClassifier(criterion='entropy', max_features=5, n_estimators=1000, n_jobs=-3, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Make predictions for kaggle
y_pred = model.predict(X_test)

# Make prediction csv file
submission_csv(y_pred, 'stacking-006.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [15]:
# XGBClassifier
model = XGBClassifier(max_depth=5, n_estimators=1000, n_jobs=-3, random_state=42)
model.fit(X_train, y_train)

# Make predictions for kaggle
y_pred = model.predict(X_test)

# Make prediction csv file
submission_csv(y_pred, 'stacking-007.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [17]:
#Filenames of your submissions you want to ensemble
files = ['stacking-001.csv', 'stacking-002.csv', 'stacking-003.csv', 'stacking-004.csv', 'stacking-005.csv',
        'stacking-006.csv','stacking-007.csv']

submissions = (pd.read_csv(file)[['status_group']] for file in files)
ensemble = pd.concat(submissions, axis='columns')
majority_vote = ensemble.mode(axis='columns')[0]

sample_submission = pd.read_csv('sample_submission.csv')
submission = sample_submission.copy()
submission['status_group'] = majority_vote
submission.to_csv('my-ultimate-ensemble-submission1.csv', index=False)

In [18]:
!kaggle competitions submit -c ds3-predictive-modeling-challenge -f my-ultimate-ensemble-submission1.csv -m "2nd attempt at stacking"

Successfully submitted to DS3 Predictive Modeling Challenge



  0%|          | 0.00/275k [00:00<?, ?B/s]
  3%|2         | 8.00k/275k [00:00<00:11, 23.5kB/s]
 29%|##9       | 80.0k/275k [00:00<00:06, 33.0kB/s]
 38%|###7      | 104k/275k [00:00<00:03, 44.3kB/s] 
 46%|####6     | 128k/275k [00:00<00:02, 58.6kB/s]
100%|##########| 275k/275k [00:05<00:00, 53.8kB/s]


In [None]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

param_distribution = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(1, 10)
}

search = RandomizedSearchCV(
    estimator=XGBClassifier(n_jobs=-3, random_state=42),
    param_distributions=param_distribution,
    n_iter=20, 
    scoring='accuracy',
    n_jobs=-1,
    cv=2,
    verbose=10,
    return_train_score=True,
    random_state=42
)

search.fit(X_train, y_train)

Fitting 2 folds for each of 20 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 21.2min


In [None]:
search.best_score_

In [None]:
search.best_estimator_

In [None]:
# Create predictions
best = search.best_estimator_
y_pred = best.predict(X_test)

# Create submission csv file
submission_csv(y_pred, 'kaggle-submission-009.csv')

# Submit to kaggle
!kaggle competitions submit -c ds3-predictive-modeling-challenge -f kaggle-submission-009.csv -m "XGB final"