The only purpose of this notebook is to generate a submission by stacking models.

In [1]:
LOCAL = True
REDUCE_FEATURE_SET = True  # Flag to use reduced features set from LightGBM feature importance
data_fpath = '../data/raw/' if LOCAL else '/kaggle/input/protein-localization/'
data_eng_fpath = '../data/intermediate/' if LOCAL else '../input/data-engineering/'
out_fpath = '../data/intermediate/' if LOCAL else ''
model_fpath = '../results/models/' if LOCAL else ''
selected_feats_fpath = '../data/intermediate/' if LOCAL else '../input/lightgbm-feature-selection/'

In [2]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

import data_tools

In [3]:
from datetime import datetime
import pickle

import lightgbm as lgb
import pandas as pd
import numpy as np

In [4]:
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV,train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import RandomOverSampler, SMOTENC

## Load Training Data

In [5]:
X = pd.read_pickle(f"{data_eng_fpath}X_enc.pkl")
y = pd.read_pickle(f"{data_eng_fpath}y.pkl")
X.shape, y.shape

((862, 2112), (862,))

## Feature Selection

In [6]:
if REDUCE_FEATURE_SET:
    with open(f'{selected_feats_fpath}important_feats.pkl', 'rb') as handle:
        important_feats = pickle.load(handle)
    important_feats = X.columns
    X = X[important_feats]

## Load Competition Data
* Note that data engineering pipeline drops labels, so we'll need this

In [7]:
testdf = pd.read_csv(f"{data_fpath}test.csv", header=None)

## Upsample Data
* Because some classes literally have one training instance, first I duplicate those values so they can appear in test data (best we can do tbh)

In [8]:
ros = RandomOverSampler(sampling_strategy={i : 5 for i in range(12, 15)})
X_upsampled, y_upsampled = ros.fit_resample(X, y)

## Train-Test Split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X_upsampled, y_upsampled, test_size=0.2, stratify=y_upsampled)

X_train.shape, X_test.shape

((698, 2112), (175, 2112))

## Get Data Type Feature Lists

In [10]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']  # unisnged ints are for one_hot encoding
numeric_cols = X.select_dtypes(include=numerics).columns
cat_cols = list(set(X.columns) - set(numeric_cols))

# Specify Models Parameters

In [11]:
svc_params = {
    'C': 0.233,
    'kernel': 'linear',
    'probability': True,
}

In [12]:
rf_params = {
    'max_depth' : 500,
    'min_samples_leaf' : 2,
    'min_samples_split' : 5,
    'max_features' : None,
    'n_estimators' : 250,
    'oob_score' : True,
    'n_jobs' : -1,
}

In [13]:
lgbm_params1 = {
    'boosting_type': 'gbdt',
    'objective' : 'multiclass',
    'num_leaves': 400,
    'max_depth' : 8,
    'colsample_bytree' : 0.8,
    'subsample' : 0.9,
    'min_child_samples': 5,
    'learning_rate' : 0.005,
    'n_estimators': 300,
}

In [14]:
lgbm_params2 = {
    'boosting_type': 'gbdt',
    'objective' : 'multiclass',
    'num_leaves': 400,
    'max_depth' : 8,
    'colsample_bytree' : 0.8,
    'subsample' : 0.9,
    'min_child_samples': 5,
    'learning_rate' : 0.01,
    'n_estimators': 150,
}

In [15]:
logr_params = {
    'C' : 1.15,
    'penalty' : 'l1',
    'solver' : 'liblinear',
    'n_jobs' : -1,
}

In [16]:
knn_params = {
    'n_neighbors' : 7,
    'weights' : 'distance'
}

# Stacking Model

In [17]:
svc_pipeline = Pipeline(steps= [
    ('feature_processing', FeatureUnion(transformer_list = [
            ('categorical', FunctionTransformer(lambda data: data.loc[:, cat_cols])),
            ('numeric', Pipeline(steps = [
                ('select', FunctionTransformer(lambda data: data.loc[:, numeric_cols])),
                ('scale', StandardScaler())
                        ]))
        ])),
    ('svc', SVC(**svc_params)),
    ]
)

In [18]:
estimators = [
    ('rf1', RandomForestClassifier(**rf_params)),
#     ('rf2', RandomForestClassifier(**rf_params)),
#     ('lgbm1', lgb.LGBMClassifier(**lgbm_params1)),
    ('lgbm2', lgb.LGBMClassifier(**lgbm_params2)),
    ('logistic', LogisticRegression(**logr_params)),
#     ('knn', KNeighborsClassifier(**knn_params)),
    ('svc', svc_pipeline)
]

In [19]:
%%time
# This should take a real long time!
final_estimator_params = {
    'C' : 0.5,
    'solver' : 'liblinear',
    'penalty' : 'l1',
    'n_jobs' : -1,
}
stacker_params = {
    'passthrough' : True,
    'verbose' : 1,
    'cv' : 5,
}
model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(**final_estimator_params),
    n_jobs=-1,
    **stacker_params
)
model.fit(X_train, y_train)

CPU times: user 2.41 s, sys: 356 ms, total: 2.76 s
Wall time: 20.3 s


StackingClassifier(cv=5,
                   estimators=[('rf1',
                                RandomForestClassifier(max_depth=500,
                                                       max_features=None,
                                                       min_samples_leaf=2,
                                                       min_samples_split=5,
                                                       n_estimators=250,
                                                       n_jobs=-1,
                                                       oob_score=True)),
                               ('lgbm2',
                                LGBMClassifier(colsample_bytree=0.8,
                                               learning_rate=0.01, max_depth=8,
                                               min_child_samples=5,
                                               n_estimators=150, num_leaves=400,
                                               objective='multiclass',
                   

In [20]:
y_pred = pd.Series(model.predict(X_test))
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.89      0.85        73
           1       0.58      0.79      0.67        38
           2       0.75      0.64      0.69        14
           3       0.71      0.42      0.53        12
           4       0.60      0.33      0.43         9
           5       0.86      0.67      0.75         9
           6       1.00      0.43      0.60         7
           7       1.00      0.50      0.67         4
           8       1.00      1.00      1.00         3
           9       1.00      0.50      0.67         2
          10       0.00      0.00      0.00         1
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.00         1

    accuracy                           0.74       175
   macro avg       0.81      0.65      0.70       175
weighted avg       0.76      0.74      0.73       175



# Full Model Fit

In [21]:
%%time
full_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(**final_estimator_params),
    n_jobs=-1,
    **stacker_params
)
full_model.fit(X, y)

CPU times: user 2.42 s, sys: 942 ms, total: 3.36 s
Wall time: 23.4 s


StackingClassifier(cv=5,
                   estimators=[('rf1',
                                RandomForestClassifier(max_depth=500,
                                                       max_features=None,
                                                       min_samples_leaf=2,
                                                       min_samples_split=5,
                                                       n_estimators=250,
                                                       n_jobs=-1,
                                                       oob_score=True)),
                               ('lgbm2',
                                LGBMClassifier(colsample_bytree=0.8,
                                               learning_rate=0.01, max_depth=8,
                                               min_child_samples=5,
                                               n_estimators=150, num_leaves=400,
                                               objective='multiclass',
                   

# Competition Data

### Create Submission

In [22]:
X_kaggle = pd.read_pickle(f"{data_eng_fpath}X_kaggle_enc.pkl")
if REDUCE_FEATURE_SET:
    X_kaggle = X_kaggle[important_feats]

In [23]:
y_kaggle = pd.Series(full_model.predict(X_kaggle))
pd.concat((y_kaggle.value_counts().sort_index(), y_kaggle.value_counts().sort_index() / len(y_kaggle)), axis=1)

Unnamed: 0,0,1
0,168,0.440945
1,124,0.325459
2,23,0.060367
3,23,0.060367
4,5,0.013123
5,15,0.03937
6,7,0.018373
7,8,0.020997
8,8,0.020997


In [24]:
# Compare with class distribution of training set
pd.concat((y.value_counts().sort_index(), y.value_counts().sort_index() / len(y)), axis=1)

Unnamed: 0,2960,2960.1
0,366,0.424594
1,192,0.222738
2,69,0.080046
3,58,0.067285
4,43,0.049884
5,43,0.049884
6,35,0.040603
7,18,0.020882
8,17,0.019722
9,10,0.011601


In [25]:
# Now create the kaggle submission file
submission = pd.concat((testdf[0], y_kaggle), axis=1)
submission.columns = ['Key', 'Label']
submission = submission.sort_values('Key').reset_index(drop=True)
submission.head()

Unnamed: 0,Key,Label
0,P234062,2
1,P234081,1
2,P234086,2
3,P234087,0
4,P234094,0


In [26]:
assert set(submission['Key']) == set(testdf[0])

In [27]:
timestamp = datetime.today().strftime('%Y_%m_%d-%H.%M')
submission_fname = f"stacked_submission-{timestamp}.csv"
submission.to_csv(submission_fname, index=False, header=True)
submission_fname

'stacked_submission-2021_04_03-15.54.csv'