# Stacking

Julian Domingo - jad5348

In [107]:
# Data analysis 
import pandas as pd
import numpy as np

# Modeling stuff
from xgboost import XGBClassifier
from mlens.ensemble import SuperLearner
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# For reproducibility: seed NumPy's global random number generator once, up
# front. The same `seed` value is reused as `random_state` for the
# SuperLearner created near the end of this notebook.
seed = 42
np.random.seed(seed)

# Constants
# Number of cross-validation folds; passed as `cv` to GridSearchCV below.
n_splits = 5

In [59]:
# Keep the test ids as a one-column DataFrame; `write_preds` reads its `id`
# column when building a submission file.
test_ids = pd.read_csv("./data/raw/test.csv")[["id"]]

# Training labels as a flat NumPy array. `.values` already yields a 1-D array
# for a single column, so the (deprecated in recent pandas) `Series.ravel()`
# call is not needed.
y_train = pd.read_csv("./data/raw/train.csv")["Y"].values

#### Helper Functions

In [60]:
def write_preds(preds, preds_filename):
    """Write a submission CSV pairing the module-level test ids with `preds`.

    Args:
        preds: Predictions, one per row of the module-level `test_ids` frame.
        preds_filename: File name (without extension) of the CSV created
            under ./submissions/.
    """
    destination = "./submissions/{}.csv".format(preds_filename)
    frame = pd.DataFrame({"id": test_ids.id, "Y": preds})
    # `columns` pins the output column order to id, Y.
    frame.to_csv(destination, index=False, columns=["id", "Y"])

#### Stacking Class

Stacking works well for small to medium-sized data sets.

In [102]:
class Stacker(object):
    """Second-level (stacking) ensemble built on saved base-learner predictions.

    The base learners are not trained here: their predictions are read from
    CSV files under ./meta_features/ and used as the input features ("meta
    features") of the meta learners. The final prediction is the mean of the
    meta learners' positive-class probabilities.

    Args:
        base_learners: Names of the base learners; each name selects the files
            ./meta_features/train/train_<name>.csv and
            ./meta_features/test/test_<name>.csv.
        meta_learners: Estimators exposing `fit(X, y)` and `predict_proba(X)`.
        y_train: Training labels, one per training row.
        test_ids: Sized container with one entry per test row (only its
            length is used by this class).
    """

    def __init__(self, base_learners, meta_learners, y_train, test_ids):
        self.base_learners = base_learners
        self.meta_learners = meta_learners
        self.y_train = y_train
        self.test_ids = test_ids

        # Filled in by get_meta_features() / fit_meta_learners_and_predict().
        # Starting at None lets later steps detect that a prerequisite step
        # was skipped instead of failing with an AttributeError.
        self.meta_features_train = None
        self.meta_features_test = None
        self.meta_learner_preds = None

    def get_meta_features(self):
        """Load the train & test meta features of every base learner.

        Each base learner contributes one column, read from its train and
        test CSV files (first CSV column is treated as the index).

        Returns:
            Tuple `(train, test)` of copies of the meta feature matrices,
            shaped (len(y_train), n_base) and (len(test_ids), n_base).
        """
        n_base = len(self.base_learners)
        train = np.zeros((len(self.y_train), n_base))
        test = np.zeros((len(self.test_ids), n_base))

        for i, base in enumerate(self.base_learners):
            print("Gathering meta feature from '{}'...".format(base))
            train_frame = pd.read_csv(
                "./meta_features/train/train_{}.csv".format(base), index_col=0)
            test_frame = pd.read_csv(
                "./meta_features/test/test_{}.csv".format(base), index_col=0)
            # `.values` replaces `as_matrix()`, which newer pandas removed.
            train[:, i] = train_frame.values.ravel()
            test[:, i] = test_frame.values.ravel()

        # Publish only once every file has loaded, so a failed read cannot
        # leave half-filled matrices that look ready to use.
        self.meta_features_train = train
        self.meta_features_test = test
        return train.copy(), test.copy()

    def fit_meta_learners_and_predict(self):
        """Fit each meta learner on the meta features and predict the test set.

        Stores the positive-class probabilities in `self.meta_learner_preds`,
        one column per meta learner.

        Raises:
            ValueError: If get_meta_features() has not been called yet.
        """
        # Explicit None checks: truth-testing a NumPy array is ambiguous.
        if self.meta_features_train is None or self.meta_features_test is None:
            raise ValueError(
                "get_meta_features() should be called before "
                "fit_meta_learners_and_predict().")

        # np.zeros takes the shape as a single tuple argument.
        preds = np.zeros((len(self.test_ids), len(self.meta_learners)))

        # Iterate the learners owned by this instance, not a global list.
        for i, meta in enumerate(self.meta_learners):
            meta.fit(self.meta_features_train, self.y_train)
            preds[:, i] = meta.predict_proba(self.meta_features_test)[:, 1]

        self.meta_learner_preds = preds

    def get_final_preds(self):
        """Return the per-row mean of the meta learners' predictions.

        Raises:
            ValueError: If fit_meta_learners_and_predict() has not been run.
        """
        if self.meta_learner_preds is None:
            raise ValueError(
                "fit_meta_learners_and_predict() should be called before "
                "get_final_preds().")
        return np.mean(self.meta_learner_preds, axis=1)

## Parameter Tuning & Predictions

#### Ensemble 1: All base learners

In [103]:
# Names of the base learners whose saved predictions feed the meta learners.
# Each name selects a train/test CSV pair under ./meta_features/.
base_learners_v1 = [
    "random_forest_raw",
    "random_forest_log",
    "random_forest_poly",
    "xgboost_raw",
    "xgboost_base",
    "xgboost_poly",
    "logistic_regression_log",
    "adaboost_base",
]

# Second-level models, left at their library defaults here.
xgb_meta_v1 = XGBClassifier()
rf_meta_v1 = RandomForestClassifier()

# Use the estimators defined just above; the previous names (`xgb_meta`,
# `rf_meta`) are not defined anywhere in this notebook.
meta_learners = [xgb_meta_v1, rf_meta_v1]

In [104]:
# Build the ensemble-1 stacker and load one meta-feature column per base learner.
stacker = Stacker(
    base_learners=base_learners_v1,
    meta_learners=meta_learners,
    y_train=y_train,
    test_ids=test_ids,
)
meta_features_train_v1, meta_features_test_v1 = stacker.get_meta_features()

Gathering meta feature from 'random_forest_raw'...
Gathering meta feature from 'random_forest_log'...
Gathering meta feature from 'random_forest_poly'...
Gathering meta feature from 'xgboost_raw'...
Gathering meta feature from 'xgboost_base'...
Gathering meta feature from 'xgboost_poly'...
Gathering meta feature from 'logistic_regression_log'...
Gathering meta feature from 'adaboost_base'...


In [None]:
# The meta-feature matrix has one column per base learner, and a random forest
# cannot consider more features per split than there are columns. Cap the
# `max_features` candidates at the column count so every grid point is valid.
n_meta_features = meta_features_train_v1.shape[1]

rf_meta_v1_param_grid = {
    "max_features": list(range(min(5, n_meta_features), min(10, n_meta_features) + 1)),
    "max_depth": list(range(5, 10 + 1)),
    "n_estimators": list(range(300, 1000 + 1, 100)),
}

# Exhaustive search scored by ROC AUC with `n_splits`-fold cross-validation,
# using all available cores.
gs_rf_v1 = GridSearchCV(
    estimator=rf_meta_v1,
    param_grid=rf_meta_v1_param_grid,
    cv=n_splits,
    scoring="roc_auc",
    n_jobs=-1,
)
gs_rf_v1.fit(meta_features_train_v1, y_train)

In [None]:
# Report the winning hyper-parameters and their mean cross-validated score.
# The call form of print works on both Python 2 and Python 3.
print(gs_rf_v1.best_params_)
print(gs_rf_v1.best_score_)

In [None]:
# stacker.fit_meta_learners_and_predict()
# stacker_preds = stacker.get_final_preds()

In [None]:
# TODO: Try different combinations of base learners for stacking

#### ML-ensemble Predictions

In [None]:
# Create an mlens SuperLearner seeded with the notebook-wide `seed`.
# Nothing else is configured on it in this cell.
sl_ensemble = SuperLearner(random_state=seed)


In [65]:
from scipy.stats import uniform

# uniform(loc, scale) returns a frozen distribution object; printing it shows
# the object's repr. The call form of print works on Python 2 and Python 3.
print(uniform(1, 2))

<scipy.stats._distn_infrastructure.rv_frozen object at 0x119422850>
