This is the main notebook used to generate predictions and the competition submission file.

In [1]:
!ls ../data/intermediate/

data.pkl


In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

import data_tools

In [67]:
from datetime import datetime
import pickle

import lightgbm as lgb
import pandas as pd
import numpy as np

SEED = 42

## Load Training DataFrame

In [36]:
# Load the training DataFrame from the intermediate pickle.
# NOTE: unpickling can execute arbitrary code; only read pickles from a trusted source.
df = pd.read_pickle("../data/intermediate/data.pkl")

In [37]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2951,2952,2953,2954,2955,2956,2957,2958,2959,2960
0,P239476,Non-Essential,No,No,No,No,No,No,No,No,...,No,No,Yes,No,No,Yes,No,No,nucleus,0
1,P234427,Non-Essential,No,No,No,No,No,No,No,No,...,No,No,No,No,No,Yes,No,No,nucleus,0
2,P234429,Non-Essential,No,No,No,Yes,No,No,No,No,...,No,No,No,No,No,No,No,No,nucleus,0
3,P234430,Non-Essential,No,No,No,No,No,No,No,No,...,No,Yes,No,No,No,Yes,No,No,nucleus,0
4,P239467,Essential,No,No,No,No,No,No,No,No,...,No,No,No,No,No,Yes,No,No,nucleus,0


In [38]:
df[2960].value_counts()

0     366
1     192
2      69
3      58
4      43
5      43
6      35
7      18
8      17
9      10
10      4
11      3
12      2
13      1
14      1
Name: 2960, dtype: int64

It appears that the majority of features are boolean and many are categorical. Some may be continuous, but I have yet to see any that are.

Also worth noting that the column names are integers, not strings.

### Feature Selection

In [39]:
# Parse the field descriptions with the project-local `data_tools` helper.
# NOTE(review): `fields` is used below like a DataFrame whose index holds the
# column numbers of `df` -- confirm against data_tools.parse_field_descriptions.
field_descriptions_fpath = data_tools.field_descriptions_fpath
fields = data_tools.parse_field_descriptions(field_descriptions_fpath)

In [40]:
# Use this to select features
# Keeps the index label of every row in `fields` whose column-0 text does NOT
# contain the pattern, then removes label 0 (the protein-ID column in `df`).
# The pattern is a placeholder; replace it with a real one to exclude fields
# by description. The target column (2960) is still part of this set and is
# removed later by `data_pipeline` / by the caller.
selected_features = set(fields[[0]][~fields[0].str.contains("aasdfasdf")].index) - {0}
len(selected_features)

2960

### Data Pipeline

In [41]:
from imblearn.over_sampling import RandomOverSampler

In [94]:
def data_pipeline(df_, selected_features, target_col=None, resample=True, dtypes=None, seed=None):
    '''Turn a raw DataFrame into the inputs passed to the model.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Raw data. Literal "?" entries are treated as missing values.
    selected_features : iterable
        Column labels to keep as features. `target_col` is always excluded,
        even if it is listed here.
    target_col : hashable, optional
        Label column. When given, `(X, y)` is returned; otherwise only `X`.
    resample : bool, default True
        When True and `target_col` is given, randomly oversample classes
        10-14 so that each has 5 rows. Ignored when there is no target.
    dtypes : optional
        Forwarded to `DataFrame.astype` after the "?" replacement.
    seed : int, optional
        Random state for the oversampler.

    Returns
    -------
    X : pandas.DataFrame
        Feature columns, in the same order as they appear in `df_`.
    y : pandas.Series
        Only returned when `target_col` is not None.

    Raises
    ------
    KeyError
        If a selected feature is not a column of `df_`.
    '''
    # Missing values are encoded as "?"; convert them to NaN and leave any
    # imputation to the model.
    df_ = df_.replace("?", np.nan)

    # Convert to correct data types
    if dtypes is not None:
        df_ = df_.astype(dtypes)

    # Use only selected features. Indexing a DataFrame with a set is rejected by
    # pandas >= 2.0 and has no defined column order, so build an ordered list
    # that follows the frame's own column order. This keeps training and
    # prediction frames aligned column-for-column.
    wanted = set(selected_features) - {target_col}
    missing = wanted.difference(df_.columns)
    if missing:
        raise KeyError(f"selected features not found in DataFrame: {list(missing)}")
    X = df_[[col for col in df_.columns if col in wanted]]

    if target_col is None:
        return X

    y = df_[target_col]

    # Upsample Minority Classes
    # Typically done on the training split only, but some classes are too small
    # to survive a stratified split otherwise.
    if resample:
        ros = RandomOverSampler(sampling_strategy={i: 5 for i in range(10, 15)}, random_state=seed)
        X, y = ros.fit_resample(X, y)

    return X, y

In [262]:
# Split the dataset into features X and labels y. Column 2960 is the target;
# the protein-ID column (0) is dropped because it is not in selected_features.
# resample=True oversamples the rarest classes here, i.e. before any
# train/validation/test split is made.
X, y = data_pipeline(df, selected_features, target_col=2960, resample=True, seed=SEED)

In [54]:
y.value_counts()

0     366
1     192
2      69
3      58
4      43
5      43
6      35
7      18
8      17
9      10
10      5
11      5
12      5
13      5
14      5
Name: 2960, dtype: int64

### LightGBM
* Will need to convert all data types to int, float, or bool (or categorical).

In [55]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

#### Fit LightGBM Model

In [188]:
# Stage 1: hold out 20% of the rows as the test split, stratified by class.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

# Stage 2: carve 40% of the remainder off as the validation split used for
# early stopping; what is left is the training split.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.4,
    stratify=y_trainval,
    random_state=SEED,
)

In [189]:
X_train.shape, X_val.shape, X_test.shape

((420, 2959), (280, 2959), (176, 2959))

In [235]:
# Use the accuracy metric
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
def _predicted_labels(y_true, y_pred):
    '''Convert the raw scores handed to a LightGBM eval function into class indices.

    Supports both layouts a multiclass custom metric can receive:
    * 1-D, class-major (all rows for class 0, then class 1, ...), as passed by
      LightGBM 3.x;
    * 2-D of shape (n_samples, n_classes), as passed by LightGBM >= 4.0.

    The number of classes is inferred from the data instead of being
    hard-coded. Returned values are argmax positions, so labels are assumed to
    be encoded as 0..n_classes-1.
    '''
    scores = np.asarray(y_pred)
    if scores.ndim == 2:
        return scores.argmax(axis=1)
    # One row per class, one column per sample.
    return scores.reshape(-1, len(y_true)).argmax(axis=0)


def accuracy(y_true, y_pred):
    '''LightGBM eval metric: plain accuracy. Returns (name, value, is_higher_better).'''
    preds = _predicted_labels(y_true, y_pred)
    return 'acc.', accuracy_score(y_true, preds), True


def balanced_accuracy(y_true, y_pred):
    '''LightGBM eval metric: balanced accuracy. Returns (name, value, is_higher_better).'''
    preds = _predicted_labels(y_true, y_pred)
    return 'bal. acc.', balanced_accuracy_score(y_true, preds), True


def f1_macro(y_true, y_pred):
    '''LightGBM eval metric: macro-averaged F1. Returns (name, value, is_higher_better).'''
    preds = _predicted_labels(y_true, y_pred)
    return 'f1', f1_score(y_true, preds, average='macro'), True

In [383]:
# LightGBM settings for the multiclass model. The commented-out entries are
# options that are currently disabled.
hyper_params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'random_state' : SEED,
    'reg_alpha': 0.5,
#     'reg_lambda' : 0.5,
    'max_depth' : 8,
    'num_leaves': 127,
#     'feature_fraction' : 0.8,
#     'min_child_samples': 40,
    'learning_rate' : 0.01,
    # Upper bound only: fitting is done with early stopping, so far fewer
    # trees are normally built.
    'n_estimators': 5000,
}
model = lgb.LGBMClassifier(**hyper_params)

In [389]:
# Keyword arguments for LGBMClassifier.fit.
# NOTE(review): `early_stopping_rounds` and `verbose` as fit() keywords depend on
# the installed LightGBM version (newer releases expect callbacks) -- confirm
# before upgrading.
fit_params={
    # Stop once the validation scores have not improved for 50 rounds.
    "early_stopping_rounds": 50, 
    # Custom macro-F1 metric, reported alongside the objective's multi_logloss.
    "eval_metric" : f1_macro, #'logloss',
    # Scores are reported for both the training and the validation split.
    "eval_set" : [(X_train, y_train), (X_val, y_val)],
    'eval_names': ['train', 'val'],
    # Print evaluation results every 500 iterations.
    'verbose': 500,
    'feature_name': 'auto', # that's actually the default
    'categorical_feature': 'auto' # that's actually the default
}
model.fit(X_train, y_train, **fit_params)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[292]	train's multi_logloss: 0.149161	train's f1: 1	val's multi_logloss: 0.265819	val's f1: 0.784054


LGBMClassifier(learning_rate=0.01, max_depth=8, n_estimators=5000,
               num_leaves=127, objective='multiclass', random_state=42,
               reg_alpha=0.5)

In [390]:
# Score the validated model on the held-out test split.
# Caveat: the split was made after oversampling, so the rarest classes in
# X_test may be duplicates of rows that are also in the training split.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99        73
           1       1.00      1.00      1.00        38
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00        12
           4       1.00      1.00      1.00         9
           5       1.00      1.00      1.00         9
           6       0.88      1.00      0.93         7
           7       1.00      0.50      0.67         4
           8       1.00      0.67      0.80         3
           9       1.00      1.00      1.00         2
          10       0.00      0.00      0.00         1
          11       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.00         1

    accuracy                           0.98       176
   macro avg       0.92      0.88      0.89       176
weighted avg       0.98   

In [391]:
balanced_accuracy_score(y_test, y_pred)

0.8777777777777778

# Full model fit

In [360]:
%%time
X_full, y_full = data_pipeline(df, selected_features, target_col=2960, resample=False)
full_model = lgb.LGBMClassifier(**hyper_params)
fit_params['verbose'] = 200
full_model.fit(X_full, y_full, **fit_params)

Training until validation scores don't improve for 10 rounds
[200]	train's multi_logloss: 0.0596509	val's multi_logloss: 0.0649741
[400]	train's multi_logloss: 0.00721598	val's multi_logloss: 0.00768096
[600]	train's multi_logloss: 0.00171657	val's multi_logloss: 0.00172802
[800]	train's multi_logloss: 0.000869687	val's multi_logloss: 0.000826277
[1000]	train's multi_logloss: 0.000594973	val's multi_logloss: 0.000547102
[1200]	train's multi_logloss: 0.000474379	val's multi_logloss: 0.000430782
[1400]	train's multi_logloss: 0.000401035	val's multi_logloss: 0.000361264
[1600]	train's multi_logloss: 0.000355136	val's multi_logloss: 0.000316594
Early stopping, best iteration is:
[1700]	train's multi_logloss: 0.000330908	val's multi_logloss: 0.000293441
CPU times: user 2min 35s, sys: 3.64 s, total: 2min 38s
Wall time: 19.4 s


LGBMClassifier(learning_rate=0.01, n_estimators=5000, objective='multiclass',
               random_state=42)

# Competition Data

### Load Competition Data

In [108]:
# Load the column -> dtype specification that is applied to the competition data.
# NOTE(review): presumably written by an earlier preprocessing notebook -- confirm.
# pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../data/intermediate/data_types_dict.pkl', 'rb') as handle:
    dtypes = pickle.load(handle)

In [109]:
testdf = pd.read_csv("../data/raw/test.csv", header=None)

In [110]:
testdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Columns: 2960 entries, 0 to 2959
dtypes: float64(592), int64(632), object(1736)
memory usage: 8.6+ MB


In [111]:
# Remove target variable from datatype specification (the test file has no
# column 2960). The default makes this cell safe to re-run: without it a second
# execution raises KeyError because the key is already gone.
dtypes.pop(2960, None)

'category'

In [95]:
# Keep the protein IDs (column 0) aside for the submission file, then run the
# competition rows through the same pipeline as the training data. The target
# column is excluded explicitly and no resampling is done; `dtypes` casts the
# CSV columns after the "?" -> NaN replacement.
X_kaggle_ids = testdf[0]
X_kaggle = data_pipeline(testdf, selected_features - {2960}, resample=False, dtypes=dtypes)

In [96]:
# X_kaggle_ids is taken straight from testdf[0], so the set comparison alone can
# never fail. Also check that the pipeline kept exactly one feature row per ID.
assert set(testdf[0]) == set(X_kaggle_ids)
assert len(X_kaggle) == len(X_kaggle_ids)

### Create Submission

Okay, now let's pass the testdf through the pipeline we used earlier.

In [392]:
# Predict a class for every competition row, then show the share of each
# predicted class (sorted by class id) for comparison with the training set.
y_kaggle = pd.Series(full_model.predict(X_kaggle))
y_kaggle.value_counts().sort_index() / len(y_kaggle)

0     0.601050
1     0.044619
2     0.076115
3     0.036745
4     0.118110
5     0.026247
6     0.007874
7     0.007874
8     0.026247
11    0.039370
14    0.015748
dtype: float64

In [333]:
# Compare with class distribution of training set
df[2960].value_counts().sort_index() / len(df[2960])

0     0.424594
1     0.222738
2     0.080046
3     0.067285
4     0.049884
5     0.049884
6     0.040603
7     0.020882
8     0.019722
9     0.011601
10    0.004640
11    0.003480
12    0.002320
13    0.001160
14    0.001160
Name: 2960, dtype: float64

In [362]:
# Build the Kaggle submission table: one row per protein ID with its predicted
# label, ordered by ID. The two Series are joined on their index.
submission = (
    pd.concat([X_kaggle_ids, y_kaggle], axis=1, keys=['Key', 'Label'])
    .sort_values('Key')
    .reset_index(drop=True)
)
submission.head()

Unnamed: 0,Key,Label
0,P234062,2
1,P234081,0
2,P234086,2
3,P234087,0
4,P234094,0


In [363]:
assert set(submission['Key']) == set(testdf[0])

In [364]:
# Write the submission to a timestamped CSV in the current working directory,
# so repeated runs do not overwrite each other (minute resolution).
timestamp = datetime.today().strftime('%Y_%m_%d-%H.%M')
submission_fname = f"submission-{timestamp}.csv"
# Header row is Key,Label; the DataFrame index is not written.
submission.to_csv(submission_fname, index=False, header=True)
submission_fname

'submission-2021_03_30-02.06.csv'

## Save Model