The only purpose of this notebook is to generate a submission.

In [1]:
# Environment switch: True for a local checkout, False for the Kaggle kernel.
LOCAL = True

# Every location resolves either to a local directory or to a Kaggle input dataset.
if LOCAL:
    data_fpath = '../data/raw/'
    data_eng_fpath = '../data/intermediate/'
    out_fpath = '../data/intermediate/'
    selected_feats_fpath = '../data/intermediate/'
else:
    data_fpath = '/kaggle/input/protein-localization/'
    data_eng_fpath = '../input/data-engineering/'
    out_fpath = ''
    selected_feats_fpath = '../input/lightgbm-feature-selection/'

In [2]:
# Re-import edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

import warnings
# NOTE(review): this silences every warning, including deprecation notices
# coming from the ML libraries used below.
warnings.filterwarnings('ignore')

# Presumably a project-local helper module; it is not referenced in the cells visible here.
import data_tools

In [3]:
from datetime import datetime
import pickle

import pandas as pd
import numpy as np

# Random seed, meant to be passed as `random_state` wherever a step is stochastic.
SEED = 420

In [4]:
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV,train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import RandomOverSampler, SMOTENC

In [5]:
# Use the accuracy metric
def accuracy(y_true, y_pred):
    """Score predictions with plain accuracy.

    ``y_pred`` is reshaped to 15 rows; for every column the row index holding
    the largest value is taken as the predicted class.

    Returns the tuple ``('acc.', score, True)``.
    NOTE(review): the tuple looks like a custom eval-metric result of the form
    (name, value, higher_is_better) -- confirm against the caller.
    """
    predicted_labels = y_pred.reshape((15, -1)).argmax(axis=0)
    score = accuracy_score(y_true, predicted_labels)
    return 'acc.', score, True

def balanced_accuracy(y_true, y_pred):
    """Score predictions with balanced accuracy.

    ``y_pred`` is reshaped to 15 rows; for every column the row index holding
    the largest value is taken as the predicted class.

    Returns the tuple ``('bal. acc.', score, True)``.
    NOTE(review): the tuple looks like a custom eval-metric result of the form
    (name, value, higher_is_better) -- confirm against the caller.
    """
    per_class = y_pred.reshape((15, -1))
    predicted_labels = per_class.argmax(axis=0)
    return 'bal. acc.', balanced_accuracy_score(y_true, predicted_labels), True

def f1_weighted(y_true, y_pred):
    """Score predictions with the support-weighted F1.

    ``y_pred`` is reshaped to 15 rows; for every column the row index holding
    the largest value is taken as the predicted class.

    Returns the tuple ``('f1', score, True)``.
    NOTE(review): the tuple looks like a custom eval-metric result of the form
    (name, value, higher_is_better) -- confirm against the caller.
    """
    predicted_labels = y_pred.reshape((15, -1)).argmax(axis=0)
    score = f1_score(y_true, predicted_labels, average='weighted')
    return 'f1', score, True

def f1_macro(y_true, y_pred):
    """Score predictions with the macro-averaged F1.

    ``y_pred`` is reshaped to 15 rows; for every column the row index holding
    the largest value is taken as the predicted class.

    Returns the tuple ``('f1 macro', score, True)``. The metric name is kept
    distinct from the one returned by ``f1_weighted`` so that the two scores
    can be told apart when both are reported side by side.
    NOTE(review): the tuple looks like a custom eval-metric result of the form
    (name, value, higher_is_better) -- confirm against the caller.
    """
    preds = y_pred.reshape(15, -1)
    preds = preds.argmax(axis=0)
    return 'f1 macro', f1_score(y_true, preds, average='macro'), True

## Load Training DataFrame

In [6]:
# Training features and labels, read from the data-engineering output location.
X = pd.read_pickle(data_eng_fpath + "X.pkl")
y = pd.read_pickle(data_eng_fpath + "y.pkl")
X.shape, y.shape

((862, 476), (862,))

In [7]:
# Summarise the index range, column count, dtypes and memory footprint of the features.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 862 entries, 0 to 861
Columns: 476 entries, 1 to interaction_max3
dtypes: category(447), float64(9), int64(20)
memory usage: 585.4 KB


## Load Competition Data
* Note that the data engineering pipeline drops labels, so we'll need those from the raw test file

In [8]:
# Raw competition file without a header row; its first column is used as the submission key.
testdf = pd.read_csv(data_fpath + "test.csv", header=None)

## Feature Selection
* See feature selection notebook

In [9]:
# Column names kept by the feature-selection notebook.
# NOTE(review): pickle executes arbitrary code on load -- only read files you produced yourself.
with open(selected_feats_fpath + 'important_feats.pkl', mode='rb') as feats_file:
    important_feats = pickle.load(feats_file)

In [10]:
# Keep only the columns listed by the feature-selection step.
X = X[important_feats]

## Split Data
* Because some classes literally have one training instance, first I duplicate those values so they can appear in test data (best we can do tbh)
* Then I upsample training data again, because we need this to satisfy SMOTE conditions
* Afterwards, I split to train/test, and then use SMOTE/ADASYN on minority classes

In [11]:
# Raise classes 12-14 to two rows each so the stratified split below can
# place a copy on both sides. Seeded so the same rows are duplicated on every run.
ros = RandomOverSampler(
    sampling_strategy={i: 2 for i in range(12, 15)},
    random_state=SEED,
)
X_upsampled, y_upsampled = ros.fit_resample(X, y)

In [12]:
# Stratified 80/20 hold-out split; seeded so the split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_upsampled, y_upsampled, test_size=0.2, stratify=y_upsampled, random_state=SEED)

# If you use a validation set, you'll need to change the SMOTE cells

# X_train, X_val, y_train, y_val = train_test_split(
#     X_train, y_train, test_size=0.2, stratify=y_train, random_state=SEED)
# X_train.shape, X_val.shape, X_test.shape
X_train.shape, X_test.shape

((691, 137), (173, 137))

In [13]:
# Names of the category-typed columns, and their positions within X_train
# (SMOTENC is given the positions via `categorical_features`).
categoricals = X_train.select_dtypes(include='category').columns
categoricals_by_idx = list(map(X_train.columns.get_loc, categoricals))

In [14]:
# SMOTE cannot handle NaNs, so confirm that none are left in the training features.
# Luckily there are not too many to impute (see the data_engineering notebook):
#   - numerical columns: impute by the average of the class
#   - categorical columns: give a new class called "missing"
X_train.isna().any().any()

False

In [15]:
# Rows per class in the training split, as a {class label: count} dict.
train_class_count = y_train.value_counts().to_dict()

In [18]:
# Target row counts per class for the SMOTE step on the training split.
# The multipliers are kept modest: amplifying the minority classes too much
# would bias the model.
N_NBRS_SMOTE = 6  # SMOTE kNN condition

# Classes 1 to 3: x1.4
strat_1 = dict((cls, round(train_class_count[cls]*1.4)) for cls in range(1, 4))
# Classes 4 to 8: x1.8
strat_2 = dict((cls, round(train_class_count[cls]*1.8)) for cls in range(4, 9))
# Classes 9 to 14: x1.5, but never fewer than N_NBRS_SMOTE rows
strat_3 = dict(
    (cls, max(round(train_class_count[cls]*1.5), N_NBRS_SMOTE)) for cls in range(9, 15)
)

sampling_strategy = {}
for partial_strategy in (strat_1, strat_2, strat_3):
    sampling_strategy.update(partial_strategy)

In [20]:
# Duplicate rows of classes 10-14 until each has at least N_NBRS_SMOTE rows
# (classes that already have more keep their current size).
# Seeded so the same rows are duplicated on every run.
ros = RandomOverSampler(
    sampling_strategy={i: max(N_NBRS_SMOTE, train_class_count[i]) for i in range(10, 15)},
    random_state=SEED,
)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [21]:
# Use SMOTE
sm = SMOTENC(categorical_features=categoricals_by_idx,
             sampling_strategy= sampling_strategy,
             k_neighbors=N_NBRS_SMOTE - 1,
             n_jobs=-1
            )
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

ValueError: could not broadcast input array from shape (6,233) into shape (154,233)

In [None]:
# Class sizes after resampling the training split.
y_resampled.value_counts()

# Model: Random Forest

In [None]:
# One-hot encode (overwrites)
X_resampled = pd.get_dummies(data=X_resampled, columns=X.columns[X.dtypes == 'category'], drop_first=False)
X_test = pd.get_dummies(data=X_test, columns=X.columns[X.dtypes == 'category'], drop_first=False)
# The two frames are encoded independently, so nothing guarantees that they end
# up with the same dummy columns in the same order. Force the hold-out frame
# onto the training layout; dummies absent from it become all-zero columns.
X_test = X_test.reindex(columns=X_resampled.columns, fill_value=0)
X_resampled.shape, X_test.shape

In [None]:
# Random-forest hyper-parameters; the same dict is applied to the full-data model later on.
hyper_params = {
    'random_state': SEED,  # make the fitted forest reproducible
}
model = RandomForestClassifier(**hyper_params)

In [None]:
# Extra keyword arguments for `fit` (none at the moment).
fit_params = dict()
model.fit(X=X_resampled, y=y_resampled, **fit_params)

In [None]:
# Predict the hold-out rows and print the per-class report.
y_pred = pd.Series(data=model.predict(X_test))
print(classification_report(y_true=y_test, y_pred=y_pred))

# Full model fit

In [None]:
# Rows per class in the full training set, as a {class label: count} dict.
class_count = y.value_counts().to_dict()
class_count  # Class count of full dataset

## Resample Full Training Set

In [None]:
# Target row counts per class for the SMOTE step on the full training set.
# Classes 1 to 3: x1.4
strat_1 = dict((cls, round(class_count[cls]*1.4)) for cls in range(1, 4))
# Classes 4 to 8: x1.8
strat_2 = dict((cls, round(class_count[cls]*1.8)) for cls in range(4, 9))
# Classes 9 to 14: x2, but never fewer than N_NBRS_SMOTE rows
strat_3 = dict(
    (cls, max(round(class_count[cls]*2), N_NBRS_SMOTE)) for cls in range(9, 15)
)

sampling_strategy = {}
for partial_strategy in (strat_1, strat_2, strat_3):
    sampling_strategy.update(partial_strategy)

In [None]:
# Duplicate rows of classes 10-14 until each has at least N_NBRS_SMOTE rows, as SMOTE needs.
# The target is never below a class's current size: an over-sampler rejects a
# target smaller than the existing count, so the max() makes this cell safe to
# run whether or not any class is actually short. Seeded for reproducibility.
ros = RandomOverSampler(
    sampling_strategy={i: max(N_NBRS_SMOTE, class_count[i]) for i in range(10, 15)},
    random_state=SEED,
)
X, y = ros.fit_resample(X, y)

In [None]:
# Use SMOTE (SMOTENC variant, which is told which columns are categorical).
# Seeded so the synthetic rows are identical on every run.
sm = SMOTENC(categorical_features=categoricals_by_idx,
             sampling_strategy=sampling_strategy,
             k_neighbors=N_NBRS_SMOTE - 1,
             n_jobs=-1,
             random_state=SEED
            )
X_full_resampled, y_full_resampled = sm.fit_resample(X, y)

In [None]:
# Class sizes after resampling the full training set.
y_full_resampled.value_counts()

## Fit model on full dataset

In [None]:
# Extra keyword arguments for `fit` (none at the moment).
full_fit_params = dict()

# One-hot encode the categorical features (overwrites X_full_resampled).
X_full_resampled = pd.get_dummies(
    X_full_resampled,
    columns=X.columns[X.dtypes == 'category'],
    drop_first=False,
)

# Same hyper-parameters as the hold-out model, fitted on all resampled rows.
full_model = RandomForestClassifier().set_params(**hyper_params)
full_model.fit(X_full_resampled, y_full_resampled, **full_fit_params)

# Competition Data

### Create Submission

In [None]:
# Competition features, read from the data-engineering output location and
# one-hot encoded on the same categorical columns as the training features.
X_kaggle = pd.read_pickle(data_eng_fpath + "X_kaggle.pkl")
X_kaggle = pd.get_dummies(
    X_kaggle,
    columns=X.columns[X.dtypes == 'category'],
    drop_first=False,
)
X_kaggle.shape

In [None]:
# The model was fitted on the one-hot encoded training frame, so feed it exactly
# those columns, in the same order. Indexing with `important_feats` would look up
# the raw categorical columns, which get_dummies has already replaced.
# Dummy columns that do not occur in the competition data become all-zero.
X_kaggle_model = X_kaggle.reindex(columns=X_full_resampled.columns, fill_value=0)
y_kaggle = pd.Series(full_model.predict(X_kaggle_model))

# Predicted class counts next to their share of all predictions.
kaggle_counts = y_kaggle.value_counts().sort_index()
pd.concat((kaggle_counts, kaggle_counts / len(y_kaggle)), axis=1)

In [None]:
# Compare with class distribution of training set.
# `y` may have been overwritten by the oversampling cell, so use the counts that
# were captured in `class_count` before any resampling of the full data.
train_counts = pd.Series(class_count).sort_index()
pd.concat((train_counts, train_counts / train_counts.sum()), axis=1, keys=['count', 'fraction'])

In [None]:
# Build the Kaggle submission: key column from the raw test file next to the
# predicted label, ordered by key.
submission = pd.concat([testdf[0], y_kaggle], axis=1, keys=['Key', 'Label'])
submission = submission.sort_values(by='Key').reset_index(drop=True)
submission.head()

In [None]:
# Sanity checks before the file is written. The key comparison alone cannot
# detect test rows that ended up without a prediction, so check that as well.
assert set(submission['Key']) == set(testdf[0]), "submission keys differ from the test keys"
assert len(submission) == len(testdf), "submission and test file have different row counts"
assert submission['Label'].notna().all(), "some test rows have no predicted label"

In [None]:
# The file name carries a minute-resolution timestamp of when the cell ran.
timestamp = datetime.today().strftime('%Y_%m_%d-%H.%M')
submission_fname = "submission-" + timestamp + ".csv"
submission.to_csv(submission_fname, header=True, index=False)
submission_fname