In [193]:
# List the intermediate data directory to confirm the pickle loaded below is present.
!ls ../data/intermediate/

data.pkl


In [194]:
%load_ext autoreload
%autoreload 2
import data_tools

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [195]:
import lightgbm as lgb
import pandas as pd
import numpy as np

In [273]:
# Load the intermediate dataset. Unpickling can execute arbitrary code, so only
# read pickle files that were produced by this project's own pipeline.
df = pd.read_pickle("../data/intermediate/data.pkl")

In [296]:
# Summarise the frame: row count, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 862 entries, 0 to 861
Columns: 2961 entries, 0 to 2960
dtypes: category(2961)
memory usage: 2.7 MB


It appears that the majority of the features are boolean and many are categorical. Some may be continuous, but I have yet to see any that are.

Also worth noting that the column names are integers, not strings.

In [274]:
# The location of the field-description file is defined by the project's data_tools module.
field_descriptions_fpath = data_tools.field_descriptions_fpath
# NOTE(review): `fields` is later indexed with fields[0].str, so it is presumably a
# DataFrame whose column 0 holds one description string per field — confirm in data_tools.
fields = data_tools.parse_field_descriptions(field_descriptions_fpath)

### Training Set Output (use this for model)

In [301]:
# Keep every field whose description does not mention "interaction" (the protein
# interaction features), then discard column 0 (the protein ID) and column 2960 (the label).
selected_features = (
    set(fields.loc[~fields[0].str.contains("interaction")].index)
    - {0, 2960}
)

In [360]:
def data_pipeline(df_, selected_features):
    '''Select the model's feature columns and mark missing values as NaN.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame whose column labels include every entry of ``selected_features``.
    selected_features : set, list or other pandas column indexer
        Columns to keep. A set (or frozenset) is converted to a sorted list so the
        output column order is deterministic; anything else is passed to pandas
        unchanged and keeps the order it was given in.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the selected columns, with every "?" replaced by
        NaN. ``df_`` itself is not modified.

    Raises
    ------
    KeyError
        If a selected column is not present in ``df_``.
    '''
    # pandas deprecated set indexers in 1.5 and rejects them from 2.0 on, and a set
    # has no guaranteed order, so turn it into a sorted list before selecting.
    if isinstance(selected_features, (set, frozenset)):
        columns = sorted(selected_features)
    else:
        columns = selected_features
    # "?" is the marker used for missing values in the data files.
    return df_[columns].replace("?", np.nan)

In [361]:
# Features come out of the pipeline (which leaves out the protein-ID column);
# the labels are the last column of the training frame.
X = data_pipeline(df, selected_features)
y = df[2960]
X

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,2950,2951,2952,2953,2954,2955,2956,2957,2958,2959
0,Non-Essential,No,No,No,No,No,No,No,No,No,...,No,No,No,Yes,No,No,Yes,No,No,nucleus
1,Non-Essential,No,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,Yes,No,No,nucleus
2,Non-Essential,No,No,No,Yes,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,nucleus
3,Non-Essential,No,No,No,No,No,No,No,No,No,...,No,No,Yes,No,No,No,Yes,No,No,nucleus
4,Essential,No,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,Yes,No,No,nucleus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,Non-Essential,No,No,No,No,No,No,No,No,No,...,Yes,No,No,No,No,No,No,No,No,cytoskeleton
858,Non-Essential,No,No,No,No,No,No,No,No,No,...,Yes,No,No,No,Yes,No,No,No,No,transport vesicles
859,Non-Essential,No,No,No,No,No,No,No,No,No,...,No,No,No,No,No,Yes,No,No,No,cytoplasm
860,Essential,No,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,cytoskeleton


### Categorical Variables
* one-hot encode categoricals
* maybe find a package that can handle them, since sklearn and XGBoost cannot

### LightGBM
* Will need to convert all datatypes to int, float or bool (OR categorical)

In [362]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [363]:
# LightGBM needs int, float, bool or category columns, so check the dtypes here.
X.info()  # look at how many are type "object"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 862 entries, 0 to 861
Columns: 2959 entries, 1 to 2959
dtypes: category(2959)
memory usage: 2.7 MB


#### Fit LightGBM Model

In [364]:
# Before splitting the dataset we isolate the classes that have only one instance:
# a stratified train/test split needs at least two members of every class.
# The classes are derived from the label counts instead of being hard-coded.
class_counts = y.value_counts()
singleton_classes = class_counts[class_counts < 2].index
rare_class_idxs = y[y.isin(singleton_classes)].index
# rare_class_idxs holds index *labels*, so select with .loc (label-based) rather
# than .iloc (position-based); the two only coincide for a default RangeIndex.
X_rare = X.loc[rare_class_idxs]
y_rare = y.loc[rare_class_idxs]

In [365]:
# Stratified 80/20 split of everything except the singleton classes. A fixed
# random_state makes the split, and every metric computed from it, reproducible.
X_common = X.drop(index=rare_class_idxs)
y_common = y.drop(index=rare_class_idxs)
X_train, X_test, y_train, y_test = train_test_split(
    X_common,
    y_common,
    test_size=0.2,
    stratify=y_common,
    random_state=42,
)
# The singleton-class rows cannot be stratified, so they are added to the training set only.
X_train = pd.concat((X_train, X_rare), axis=0)
y_train = pd.concat((y_train, y_rare), axis=0)

In [366]:
# Sanity-check the sizes of the two partitions.
X_train.shape, X_test.shape

((690, 2959), (172, 2959))

In [367]:
# LightGBM classifier constructed with no arguments, i.e. the library's default hyper-parameters.
model = lgb.LGBMClassifier()

In [368]:
# Keyword arguments forwarded to LGBMClassifier.fit.
# NOTE(review): the early-stopping validation set is (X_test, y_test), the same rows
# that are scored in the classification report below, so those metrics are optimistic.
fit_params = dict(
    early_stopping_rounds=10,
    eval_metric='logloss',
    eval_set=[(X_test, y_test)],
    eval_names=['valid'],
    verbose=100,
    feature_name='auto',  # that's actually the default
    categorical_feature='auto',  # that's actually the default
)
model.fit(X_train, y_train, **fit_params)



Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[35]	valid's multi_logloss: 0.0845902


LGBMClassifier()

In [369]:
# Predict class labels for the held-out rows.
y_pred = model.predict(X_test)

In [370]:
# Per-class precision/recall/F1 on the held-out rows. The labels are strings, which is
# why the report lists them in lexicographic order (0, 1, 10, 11, 2, ...).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        73
           1       1.00      1.00      1.00        38
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00        12
           4       1.00      1.00      1.00         9
           5       1.00      1.00      1.00         9
           6       1.00      1.00      1.00         7
           7       0.75      1.00      0.86         3
           8       1.00      1.00      1.00         3
           9       1.00      1.00      1.00         2

    accuracy                           0.99       172
   macro avg       0.81      0.83      0.82       172
weighted avg       0.98      0.99      0.98       172



  _warn_prf(average, modifier, msg_start, len(result))


In [371]:
# Dump the fitted estimator's attributes: hyper-parameters, the underlying Booster and
# the per-round validation loss stored in _evals_result.
vars(model)

{'boosting_type': 'gbdt',
 'objective': None,
 'num_leaves': 31,
 'max_depth': -1,
 'learning_rate': 0.1,
 'n_estimators': 100,
 'subsample_for_bin': 200000,
 'min_split_gain': 0.0,
 'min_child_weight': 0.001,
 'min_child_samples': 20,
 'subsample': 1.0,
 'subsample_freq': 0,
 'colsample_bytree': 1.0,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'random_state': None,
 'n_jobs': -1,
 'silent': True,
 'importance_type': 'split',
 '_Booster': <lightgbm.basic.Booster at 0x7f86d0e13220>,
 '_evals_result': {'valid': OrderedDict([('multi_logloss',
                [1.3652715694783304,
                 1.075512537254385,
                 0.8960786218111664,
                 0.760646090419525,
                 0.6520926115537129,
                 0.5628683413914771,
                 0.48265557600601977,
                 0.42049170874355285,
                 0.3674935697607418,
                 0.32320080248817235,
                 0.2850677371172422,
                 0.2536307050061466,
             

LGTM? Let's fit on the whole dataset now and see what we get on the Kaggle Test data.

# Full model fit

In [372]:
%%time
full_fit_params={
    "early_stopping_rounds":10, 
    "eval_metric" : 'logloss', 
    "eval_set" : [(X_test, y_test)],
    'eval_names': ['valid'],
    'verbose': 100,
    'feature_name': 'auto', # that's actually the default
    'categorical_feature': 'auto', # that's actually the default
}
full_model = lgb.LGBMClassifier()
full_model.fit(X, y, **full_fit_params)

Training until validation scores don't improve for 10 rounds
[100]	valid's multi_logloss: 0.000275287
Did not meet early stopping. Best iteration is:
[100]	valid's multi_logloss: 0.000275287
CPU times: user 20.8 s, sys: 500 ms, total: 21.3 s
Wall time: 9.73 s


LGBMClassifier()

# Competition Data

In [373]:
# Competition test set: the file has no header row and every column is read as a category.
testdf = pd.read_csv("../data/raw/test.csv", header=None, dtype='category')

In [374]:
# Summarise the test frame: row count, column count, dtypes and memory usage.
testdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Columns: 2960 entries, 0 to 2959
dtypes: category(2960)
memory usage: 1.2 MB


Okay, now let's pass the testdf through the pipeline we used earlier.

In [375]:
# Column 0 holds the protein IDs; they become the 'Key' column of the submission.
X_kaggle_ids = testdf[0]
# Apply the same feature selection and "?" -> NaN handling that the training data received.
X_kaggle = data_pipeline(testdf, selected_features)

In [377]:
# X_kaggle_ids was taken straight from testdf[0], so this first check always holds;
# the two checks after it are the invariants that can actually fail.
assert set(testdf[0]) == set(X_kaggle_ids)
assert len(X_kaggle) == len(X_kaggle_ids), "feature rows and IDs must line up one-to-one"
assert list(X_kaggle.columns) == list(X.columns), "test features must match the training columns"

In [385]:
# Give the predictions the index of the rows they were computed from, so that joining
# them to the IDs later aligns on that index rather than on a coincidental RangeIndex match.
y_kaggle = pd.Series(full_model.predict(X_kaggle), index=X_kaggle.index)

In [388]:
# Distribution of the predicted labels across the test set.
y_kaggle.value_counts()

0     242
2     110
1      17
3       8
6       2
5       1
10      1
dtype: int64

In [379]:
# Build the Kaggle submission: one row per protein ID paired with its predicted
# label, ordered by ID and re-indexed from zero.
submission = (
    pd.concat((X_kaggle_ids, y_kaggle), axis=1, keys=['Key', 'Label'])
    .sort_values('Key')
    .reset_index(drop=True)
)
submission.head()

Unnamed: 0,Key,Label
0,P234081,0
1,P234086,2
2,P234087,0
3,P234112,2
4,P234120,2


In [382]:
# Every test-set ID must appear in the submission, and no others.
assert set(submission['Key']) == set(testdf[0])

In [383]:
# Write the submission with a header row and without the DataFrame index.
# The path is relative, so the file lands in the notebook's working directory.
submission.to_csv("submission.csv", index=False, header=True)