The only purpose of this notebook is to generate a submission.

In [1]:
# Switch between a local checkout (True) and the Kaggle kernel layout (False).
LOCAL = True

# Every path keeps its trailing slash because file names are appended to it
# directly with f-strings further down.
if LOCAL:
    data_fpath = '../data/raw/'
    data_eng_fpath = '../data/intermediate/'
    out_fpath = '../data/intermediate/'
    model_fpath = '../results/models/'
    selected_feats_fpath = '../data/intermediate/'
else:
    data_fpath = '/kaggle/input/protein-localization/'
    data_eng_fpath = '../input/data-engineering/'
    out_fpath = ''
    model_fpath = ''
    selected_feats_fpath = '../input/lightgbm-feature-selection/'

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes (handy while editing local helpers).
%load_ext autoreload
%autoreload 2

import warnings
# NOTE(review): this silences *every* warning for the whole session, including
# deprecation notices from the ML libraries — consider a narrower filter.
warnings.filterwarnings('ignore')

# Project-local helper module; it is not referenced in the visible cells —
# TODO confirm it is still needed.
import data_tools

In [3]:
from datetime import datetime
import pickle

import lightgbm as lgb
import pandas as pd
import numpy as np

# Fixed seed for every stochastic step (oversampling, splitting, forests).
# It used to be drawn at random on each run and never displayed, which made
# any given run impossible to reproduce. Change the value here to try another
# seed; it stays within the original 0..9999 range.
SEED = 42

In [4]:
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV,train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import RandomOverSampler, SMOTENC

In [5]:
# Custom evaluation metrics. Each one returns a
# (metric_name, value, flag) triple; the flag is always True.
# NOTE(review): this looks like the (name, value, is_higher_better) contract of
# LightGBM custom eval functions (lightgbm is imported above), but none of
# these functions is called in the visible cells — confirm before relying on it.
# The signatures are deliberately kept at exactly (y_true, y_pred).

# Number of target classes the flat score array is split into.
N_CLASSES = 15


def _predicted_labels(y_pred, num_class=N_CLASSES):
    """Convert a flat score array into one predicted class index per sample.

    The array is viewed as ``num_class`` rows of per-sample scores and the row
    with the highest score is chosen for each column.

    Raises:
        ValueError: if ``y_pred`` cannot be reshaped into ``num_class`` rows.
    """
    return y_pred.reshape(num_class, -1).argmax(axis=0)


def accuracy(y_true, y_pred):
    """Return ``('acc.', accuracy, True)`` for the arg-max class predictions."""
    return 'acc.', accuracy_score(y_true, _predicted_labels(y_pred)), True


def balanced_accuracy(y_true, y_pred):
    """Return ``('bal. acc.', balanced accuracy, True)`` for the arg-max predictions."""
    return 'bal. acc.', balanced_accuracy_score(y_true, _predicted_labels(y_pred)), True


def f1_weighted(y_true, y_pred):
    """Return ``('f1', weighted-average F1, True)`` for the arg-max predictions."""
    return 'f1', f1_score(y_true, _predicted_labels(y_pred), average='weighted'), True


def f1_macro(y_true, y_pred):
    """Return ``('f1', macro-average F1, True)`` for the arg-max predictions.

    NOTE(review): this reports under the same name ('f1') as ``f1_weighted``;
    results keyed by metric name would collide if both were used together.
    """
    return 'f1', f1_score(y_true, _predicted_labels(y_pred), average='macro'), True

## Load Training DataFrame
* Here I instead load the one-hot encoded file

In [6]:
# Pickled, already-encoded training features and their target labels.
features_path = f"{data_eng_fpath}X_enc.pkl"
labels_path = f"{data_eng_fpath}y.pkl"

X, y = pd.read_pickle(features_path), pd.read_pickle(labels_path)

# Display both shapes as a quick sanity check.
(X.shape, y.shape)

((862, 2041), (862,))

In [7]:
# Print a summary of the feature frame (index, column count, dtypes, memory).
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 862 entries, 0 to 861
Columns: 2041 entries, 445 to strongest_localization_vacuole
dtypes: float64(64), int64(25), uint8(1952)
memory usage: 2.2 MB


## Load Competition Data
* Note that the data-engineering pipeline drops the row labels (the protein keys), so we need the raw test file to recover them for the submission.

In [8]:
# Raw competition test file, read without a header row so the columns get
# integer labels; column 0 is used further down as the submission key.
testdf = pd.read_csv(f"{data_fpath}test.csv", header=None)

## Feature Selection
* Features are not reduced yet! May need to do more feature selection later.

## Split Data
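* Because some classes have only a single training instance, I first oversample the rarest classes (to 5 rows each) so that the stratified split can place them in both the training and the held-out test partition. The held-out rows for those classes are duplicates of training rows, so their scores are optimistic — but it is the best we can do with this little data.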

In [9]:
# Oversample classes 12-14 up to 5 rows each by duplicating existing rows.
# The sampler is seeded so the same rows are duplicated on every run;
# previously no random_state was passed and each run differed.
ros = RandomOverSampler(
    sampling_strategy={i: 5 for i in range(12, 15)},
    random_state=SEED,
)
X_upsampled, y_upsampled = ros.fit_resample(X, y)

In [10]:
# Stratified 80/20 split of the oversampled data. Seeded so the hold-out
# partition (and therefore the report below) is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_upsampled,
    y_upsampled,
    test_size=0.2,
    stratify=y_upsampled,
    random_state=SEED,
)

X_train.shape, X_test.shape

((698, 2041), (175, 2041))

### Random Forest

In [11]:
# Keyword arguments for the random forest. This dict previously held 'C' and
# 'kernel', which are SVM settings that RandomForestClassifier does not accept,
# and it was never passed to the model. It now carries valid settings and is
# actually used; only the seed is set, all other hyper-parameters stay default.
rf_params = {
    'random_state': SEED,
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

RandomForestClassifier()

In [12]:
# Predict the held-out partition and print per-class precision/recall/F1.
holdout_predictions = model.predict(X_test)
y_pred = pd.Series(holdout_predictions)

holdout_report = classification_report(y_test, y_pred)
print(holdout_report)

              precision    recall  f1-score   support

           0       0.80      0.96      0.87        73
           1       0.58      0.66      0.62        38
           2       0.77      0.71      0.74        14
           3       0.44      0.33      0.38        12
           4       1.00      0.33      0.50         9
           5       1.00      0.33      0.50         9
           6       1.00      0.71      0.83         7
           7       0.67      0.50      0.57         4
           8       0.67      0.67      0.67         3
           9       1.00      0.50      0.67         2
          10       0.00      0.00      0.00         1
          12       0.50      1.00      0.67         1
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.00         1

    accuracy                           0.73       175
   macro avg       0.74      0.62      0.64       175
weighted avg       0.75      0.73      0.71       175



# Full model fit

In [13]:
# Final model: refit on the complete original training data (X, y — not the
# oversampled copy). Seeded so the saved model and the submission are
# reproducible; the earlier argument-less set_params() call was a no-op and
# has been removed.
full_model = RandomForestClassifier(random_state=SEED)

# Extra keyword arguments forwarded to fit(); currently none.
full_fit_params = {}
full_model.fit(X, y, **full_fit_params)

RandomForestClassifier()

# Competition Data

### Create Submission

In [14]:
# Encoded feature matrix for the competition rows.
# NOTE(review): assumes the rows line up with test.csv, because the predictions
# are later paired with the keys from that file by index — confirm against the
# data-engineering step.
X_kaggle = pd.read_pickle(f"{data_eng_fpath}X_kaggle_enc.pkl")

In [15]:
# Predict the competition rows with the full-data model.
kaggle_predictions = full_model.predict(X_kaggle)
y_kaggle = pd.Series(kaggle_predictions)

# Per-class count and share of the predictions, ordered by class label.
predicted_counts = y_kaggle.value_counts().sort_index()
predicted_share = predicted_counts / len(y_kaggle)
pd.concat((predicted_counts, predicted_share), axis=1)

Unnamed: 0,0,1
0,202,0.530184
1,86,0.225722
2,23,0.060367
3,27,0.070866
4,7,0.018373
5,16,0.041995
6,8,0.020997
7,4,0.010499
8,7,0.018373
9,1,0.002625


In [16]:
# Reference point: per-class count and share in the training labels, to compare
# against the distribution of the predictions above.
train_counts = y.value_counts().sort_index()
train_share = train_counts / len(y)
pd.concat((train_counts, train_share), axis=1)

Unnamed: 0,2960,2960.1
0,366,0.424594
1,192,0.222738
2,69,0.080046
3,58,0.067285
4,43,0.049884
5,43,0.049884
6,35,0.040603
7,18,0.020882
8,17,0.019722
9,10,0.011601


In [17]:
# Build the Kaggle submission table: pair each key from the raw test file
# (column 0) with its predicted label, name the columns, and order by key.
submission = (
    pd.concat((testdf[0], y_kaggle), axis=1)
    .set_axis(['Key', 'Label'], axis=1)
    .sort_values('Key')
    .reset_index(drop=True)
)
submission.head()

Unnamed: 0,Key,Label
0,P234062,2
1,P234081,1
2,P234086,5
3,P234087,0
4,P234094,2


In [18]:
# Sanity checks before writing the file.
# 1) Exactly the keys from the raw test file are present.
assert set(submission['Key']) == set(testdf[0])
# 2) One row per test row. Key-set equality alone cannot detect a length
#    mismatch between the keys and the predictions.
assert len(submission) == len(testdf), "submission row count differs from test.csv"
# 3) No missing values. A misaligned concat pads the shorter side with NaN,
#    which would otherwise go unnoticed.
assert submission.notna().all().all(), "submission contains missing keys or labels"

In [19]:
# Stamp the file name with the current local time (minute resolution) so
# successive submissions do not overwrite one another.
timestamp = datetime.now().strftime('%Y_%m_%d-%H.%M')
submission_fname = "submission-{}.csv".format(timestamp)

# Written to the current working directory, with a header row and no index column.
submission.to_csv(submission_fname, header=True, index=False)
submission_fname

'submission-2021_04_02-21.47.csv'

# Save model

In [20]:
# Persist the fitted full-data model. The context manager makes sure the file
# is flushed and closed; the handle was previously opened and never closed.
with open(f"{model_fpath}random_forest.pkl", 'wb') as model_file:
    pickle.dump(full_model, model_file)