This is a simple, direct LightGBM baseline.\
Here I only load the data, train the LightGBM models, and write out the predictions for submission.\
The purpose of this notebook is to check whether training one LightGBM model per MoA target is feasible, given that we have only 9 hours of run time.

In [2]:
import time
import pandas as pd
import numpy as np
import re

import lightgbm as lgb

In [3]:
# Load the data.
# `path` is the folder the input CSV files are read from; `path_holdout` is the
# folder the prediction file is written to later in the notebook.
# NOTE(review): both are absolute, machine-specific paths - adjust before running elsewhere.
path = 'C:\\Users\\maxwi\\Python\\Kaggle\\Mechanism of action\\'
path_holdout = 'C:\\Users\\maxwi\\Python\\Kaggle\\Mechanism of action\\results 1 - lightgbm test\\'

# Read through `path` (previously declared but unused) so that loading no longer
# depends on the kernel's current working directory.
train_features = pd.read_csv(path + 'train_features.csv')
train_targets_scored = pd.read_csv(path + 'train_targets_scored.csv')
test_features = pd.read_csv(path + 'test_features.csv')

In [5]:
# Some adjustments so we can run LightGBM: drop the row identifier and turn the
# two text columns (cp_type, cp_dose) into integers.
train_x = train_features.drop(columns=['sig_id'])
test_x = test_features.drop(columns=['sig_id'])
train_y = train_targets_scored.drop(columns=['sig_id'])

# cp_type -> 0 when the value is 'trt_cp', 1 for anything else.
# A vectorised comparison gives the same values as the former row-wise
# apply(..., axis=1) without building one Series per row.
train_x['cp_type'] = (train_features['cp_type'] != 'trt_cp').astype(np.int64)
test_x['cp_type'] = (test_features['cp_type'] != 'trt_cp').astype(np.int64)

# cp_dose -> the digit 1 or 2 found in the label, stored as int8.
# expand=False makes str.extract return a Series (one capture group) rather
# than a one-column DataFrame.
train_x['cp_dose'] = train_features['cp_dose'].str.extract(r"([1-2])", expand=False).astype(np.int8)
test_x['cp_dose'] = test_features['cp_dose'].str.extract(r"([1-2])", expand=False).astype(np.int8)

In [6]:
#Modeling
# Train one binary LightGBM model per column of train_y, predict test_x with it,
# and collect the predictions in `predicted_MoA_test` (sig_id first, then one
# column per target, in train_y column order). A timing report is printed at
# regular intervals so the total run time can be estimated.

REPORT_EVERY = 20  # print a timing report after every 20 trained targets

# The parameters are the same for every target, so they are defined once
# instead of being rebuilt on each loop iteration.
lgb_params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "bagging_freq": 20,
    "bagging_fraction": 0.3,
    "feature_fraction": 0.6,
    "learning_rate": 0.01,
    "lambda_l2": 0.1,
    'verbosity': 1,
    'num_iterations': 2000,
    #'early_stopping_round': 200,
    'num_leaves': 400,
    "min_data_in_leaf": 200,
    'seed': 1
}

features = train_x.columns.tolist()
n_targets = len(train_y.columns)

# Predictions are gathered in a dict and turned into a DataFrame once at the
# end; inserting a couple of hundred columns one at a time fragments the frame.
predictions = {}
start = time.time()

for progress, col in enumerate(train_y.columns, start=1):

    lgb_train = lgb.Dataset(data = train_x, label = train_y[col], feature_name = features)
    # Hand over a copy so every target is trained with exactly the same
    # settings even if the library were to modify the dict it receives.
    model = lgb.train(train_set = lgb_train, params = dict(lgb_params))

    #Prediction
    predictions[col] = model.predict(test_x)

    #Running time
    if progress % REPORT_EVERY == 0:

        progress_per = progress / n_targets
        print(round(progress_per, 4))

        elapsed = int(round(time.time() - start, 0))
        # Linear extrapolation: assumes the remaining targets take as long on
        # average as the ones already trained.
        total_run_time = int(round(elapsed / progress_per, 0))
        time_to_finish = total_run_time - elapsed
        print('Elapsed: {:02d}:{:02d}:{:02d}'.format(elapsed // 3600, (elapsed % 3600 // 60), elapsed % 60))
        print('Total run time: {:02d}:{:02d}:{:02d}'.format(total_run_time // 3600, (total_run_time % 3600 // 60), total_run_time % 60))
        print('Time to finish: {:02d}:{:02d}:{:02d}'.format(time_to_finish // 3600, (time_to_finish % 3600 // 60), time_to_finish % 60))
        print()

# Assemble the final table in one step; the explicit index keeps the prediction
# rows aligned with test_features.
predicted_MoA_test = pd.concat(
    [test_features[['sig_id']], pd.DataFrame(predictions, index = test_features.index)],
    axis = 1
)



0.0971
Elapsed: 00:05:46
Total run time: 00:59:23
Time to finish: 00:53:37

0.1942
Elapsed: 00:09:55
Total run time: 00:51:04
Time to finish: 00:41:09

0.2913
Elapsed: 00:15:18
Total run time: 00:52:31
Time to finish: 00:37:13

0.3883
Elapsed: 00:21:12
Total run time: 00:54:36
Time to finish: 00:33:24

0.4854
Elapsed: 00:26:55
Total run time: 00:55:27
Time to finish: 00:28:32

0.5825
Elapsed: 00:32:42
Total run time: 00:56:08
Time to finish: 00:23:26

0.6796
Elapsed: 00:37:17
Total run time: 00:54:52
Time to finish: 00:17:35

0.7767
Elapsed: 00:42:49
Total run time: 00:55:08
Time to finish: 00:12:19

0.8738
Elapsed: 00:47:59
Total run time: 00:54:55
Time to finish: 00:06:56

0.9709
Elapsed: 00:52:50
Total run time: 00:54:25
Time to finish: 00:01:35



In [7]:
# Submit: write the prediction table to the holdout folder as CSV, leaving out
# the DataFrame's row index.
output_file = path_holdout + "predicted_MoA_test_1.csv"
predicted_MoA_test.to_csv(output_file, index=False)

In [8]:
# Display the assembled prediction table (sig_id plus one column per target)
# as a quick visual check of the values that were written out.
predicted_MoA_test

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,4.633303e-07,5.176644e-07,4.526468e-07,0.000817,0.000929,0.000047,0.000003,3.294883e-04,7.467366e-07,...,0.000252,1.078825e-06,0.000006,0.000004,0.000005,0.000209,0.000027,3.594104e-06,2.324635e-06,6.570868e-06
1,id_001897cda,3.931169e-07,1.669937e-06,9.477418e-08,0.000033,0.000028,0.000013,0.000001,4.272805e-06,1.007431e-05,...,0.000252,5.214632e-07,0.000003,0.000005,0.000169,0.000262,0.000072,3.952129e-07,3.032162e-06,4.250379e-06
2,id_002429b5b,2.403483e-07,3.407788e-07,4.602379e-07,0.000023,0.001226,0.000005,0.000007,1.603052e-05,3.822358e-07,...,0.000252,2.899441e-07,0.000002,0.000005,0.000010,0.000224,0.000006,3.059014e-06,3.818069e-06,1.698994e-06
3,id_00276f245,4.719366e-07,4.381612e-07,3.508681e-07,0.000158,0.000797,0.000011,0.000002,1.607143e-05,3.602734e-07,...,0.000252,5.294573e-07,0.000002,0.000077,0.000058,0.000209,0.000011,6.201279e-06,3.180223e-07,1.119708e-06
4,id_0027f1083,1.753346e-06,5.766360e-07,1.237868e-06,0.000103,0.002108,0.000003,0.000015,1.716774e-05,7.029271e-07,...,0.000252,1.361636e-07,0.000010,0.000003,0.000004,0.000357,0.000014,1.241627e-06,2.034920e-07,1.665756e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,id_ff7004b87,4.420928e-07,3.341861e-06,1.369218e-07,0.000006,0.000063,0.000012,0.000003,9.419025e-07,2.129417e-06,...,0.000252,8.542916e-07,0.000006,0.010758,0.000762,0.000209,0.001533,7.801287e-07,1.600499e-06,6.501145e-07
3978,id_ff925dd0d,1.451268e-06,3.862461e-06,3.584485e-07,0.000021,0.001641,0.000027,0.000026,3.043159e-05,1.108610e-06,...,0.000252,1.444368e-06,0.000004,0.000005,0.000005,0.000210,0.000029,4.577003e-07,4.491344e-07,5.453368e-07
3979,id_ffb710450,4.581571e-07,9.475839e-07,1.795247e-07,0.000069,0.003186,0.000020,0.000081,8.861029e-06,2.678463e-07,...,0.000252,4.317092e-07,0.000010,0.000010,0.000002,0.000209,0.000004,3.431395e-06,1.259530e-06,4.063035e-06
3980,id_ffbb869f2,9.804414e-07,1.254736e-06,5.357085e-07,0.000107,0.005242,0.000003,0.000012,2.099556e-05,9.483171e-07,...,0.000252,2.318614e-06,0.000007,0.000007,0.000004,0.000209,0.000013,2.084600e-06,7.839304e-07,1.259511e-06
