The only purpose of this notebook is to generate a submission by stacking models.

In [307]:
LOCAL = False
REDUCE_FEATURE_SET = True
data_fpath = '../data/raw/' if LOCAL else '/kaggle/input/protein-localization/'
data_eng_fpath = '../data/intermediate/' if LOCAL else '../input/data-engineering/'
out_fpath = '../data/intermediate/' if LOCAL else ''
model_fpath = '../results/models/' if LOCAL else ''
selected_feats_fpath = '../data/intermediate/' if LOCAL else '../input/lightgbm-feature-selection/'

In [308]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

import data_tools

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [309]:
from datetime import datetime
import pickle

import lightgbm as lgb
import pandas as pd
import numpy as np

SEED = np.random.randint(10000)
SEED

997

In [310]:
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV,train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import RandomOverSampler, SMOTENC

## Load Training Data

In [311]:
X = pd.read_pickle(f"{data_eng_fpath}X_enc.pkl")
y = pd.read_pickle(f"{data_eng_fpath}y.pkl")
X.shape, y.shape

((862, 2041), (862,))

## Feature Selection

In [312]:
if REDUCE_FEATURE_SET:
    with open(f'{selected_feats_fpath}important_feats.pkl', 'rb') as handle:
        important_feats = pickle.load(handle)
    important_feats = X.columns
    X = X[important_feats]

## Load Competition Data
* Note that data engineering pipeline drops labels, so we'll need this

In [313]:
testdf = pd.read_csv(f"{data_fpath}test.csv", header=None)

## Upsample Data
* Because some classes literally have one training instance, first I duplicate those values so they can appear in test data (best we can do tbh)

In [314]:
ros = RandomOverSampler(sampling_strategy={i : 5 for i in range(12, 15)})
X_upsampled, y_upsampled = ros.fit_resample(X, y)

## Train-Test Split

In [315]:
X_train, X_test, y_train, y_test = train_test_split(
    X_upsampled, y_upsampled, test_size=0.2, stratify=y_upsampled)

X_train.shape, X_test.shape

((698, 2041), (175, 2041))

## Get Data Type Feature Lists

In [316]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']  # unisnged ints are for one_hot encoding
numeric_cols = X.select_dtypes(include=numerics).columns
cat_cols = list(set(X.columns) - set(numeric_cols))

# Specify Models Parameters

In [317]:
svc_params = {
    'C': 0.233,
    'kernel': 'linear',
    'probability': True,
}

In [318]:
rf_params = {
    'max_depth' : 500,
    'min_samples_leaf' : 2,
    'min_samples_split' : 5,
    'max_features' : None,
    'n_estimators' : 250,
    'oob_score' : True,
    'n_jobs' : -1,
}

In [319]:
lgbm_params1 = {
    'boosting_type': 'gbdt',
    'objective' : 'multiclass',
    'num_leaves': 400,
    'max_depth' : 8,
    'colsample_bytree' : 0.8,
    'subsample' : 0.9,
    'min_child_samples': 5,
    'learning_rate' : 0.005,
    'n_estimators': 300,
}

In [320]:
lgbm_params2 = {
    'boosting_type': 'gbdt',
    'objective' : 'multiclass',
    'num_leaves': 400,
    'max_depth' : 8,
    'colsample_bytree' : 0.8,
    'subsample' : 0.9,
    'min_child_samples': 5,
    'learning_rate' : 0.01,
    'n_estimators': 150,
}

In [321]:
logr_params = {
    'C' : 1.15,
    'penalty' : 'l1',
    'solver' : 'liblinear',
    'n_jobs' : -1,
}

In [322]:
knn_params = {
    'n_neighbors' : 7,
    'weights' : 'distance'
}

# Stacking Model

In [323]:
svc_pipeline = Pipeline(steps= [
    ('feature_processing', FeatureUnion(transformer_list = [
            ('categorical', FunctionTransformer(lambda data: data.loc[:, cat_cols])),
            ('numeric', Pipeline(steps = [
                ('select', FunctionTransformer(lambda data: data.loc[:, numeric_cols])),
                ('scale', StandardScaler())
                        ]))
        ])),
    ('svc', SVC(**svc_params)),
    ]
)

In [324]:
estimators = [
    ('rf1', RandomForestClassifier(**rf_params)),
    ('rf2', RandomForestClassifier(**rf_params)),
    ('lgbm1', lgb.LGBMClassifier(**lgbm_params1)),
    ('lgbm2', lgb.LGBMClassifier(**lgbm_params2)),
    ('logistic', LogisticRegression(**logr_params)),
    ('knn', KNeighborsClassifier(**knn_params)),
    ('svc', svc_pipeline)
]

In [None]:
%%time
# This should take a real long time!
final_estimator_params = {
    'C' : 5,
    'solver' : 'liblinear',
    'penalty' : 'l1',
    'n_jobs' : -1,
}
stacker_params = {
    'passthrough' : False,
    'verbose' : 1,
    'cv' : 5,
}
model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(**final_estimator_params),
    n_jobs=-1,
    **stacker_params
)
model.fit(X_train, y_train)

In [None]:
y_pred = pd.Series(model.predict(X_test))
print(classification_report(y_test, y_pred))

# Full Model Fit

In [None]:
%%time
full_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(**final_estimator_params),
    n_jobs=-1,
    **stacker_params
)
full_model.fit(X, y)

# Competition Data

### Create Submission

In [None]:
X_kaggle = pd.read_pickle(f"{data_eng_fpath}X_kaggle_enc.pkl")
if REDUCE_FEATURE_SET:
    X_kaggle = X_kaggle[important_feats]

In [None]:
y_kaggle = pd.Series(full_model.predict(X_kaggle))
pd.concat((y_kaggle.value_counts().sort_index(), y_kaggle.value_counts().sort_index() / len(y_kaggle)), axis=1)

In [None]:
# Compare with class distribution of training set
pd.concat((y.value_counts().sort_index(), y.value_counts().sort_index() / len(y)), axis=1)

In [None]:
# Now create the kaggle submission file
submission = pd.concat((testdf[0], y_kaggle), axis=1)
submission.columns = ['Key', 'Label']
submission = submission.sort_values('Key').reset_index(drop=True)
submission.head()

In [None]:
assert set(submission['Key']) == set(testdf[0])

In [None]:
timestamp = datetime.today().strftime('%Y_%m_%d-%H.%M')
submission_fname = f"submission-{timestamp}.csv"
submission.to_csv(submission_fname, index=False, header=True)
submission_fname

# Save model

In [None]:
pickle.dump(full_model, open(f"{model_fpath}random_forest.pkl", 'wb'))