In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from xgboost import XGBClassifier 
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import log_loss, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import time
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import wait, ALL_COMPLETED, as_completed
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

data_dir = './DATA/lish-moa/'

***
# Table of Contents   
1. Load the dataset  
2. Preprocessing   
3. Models : Binary Relevance (Gradient Boost)  
***

## Load the dataset

In [2]:
X = pd.read_csv(data_dir+'train_features.csv', index_col='sig_id')
y = pd.read_csv(data_dir+'train_targets_scored.csv', index_col='sig_id')

## Preprocessing

In [3]:
# One-hot encoding for cp_type and cp_dose
X['cp_type'].replace({'trt_cp':1., 'ctl_vehicle':0.}, inplace=True)
X['cp_dose'].replace({'D1':1., 'D2':0.}, inplace=True)

# split into training set and test set
SEED = 123
np.random.seed(SEED)

ids = X.index.values.copy()
np.random.shuffle(ids)

train_perc, test_perc = 0.85, 0.2
train_id = ids[:round(len(ids)*train_perc)]
test_id = ids[round(len(ids)*train_perc):]

X_train = X.loc[train_id]
X_test = X.loc[test_id]

y_train = y.loc[train_id]
y_test = y.loc[test_id]

# normalize the data
scaler = StandardScaler()
X_train_norm = pd.DataFrame(scaler.fit_transform(X_train))
X_train_norm.columns = X_train.columns
X_train_norm.index = X_train.index

X_test_norm = pd.DataFrame(scaler.transform(X_test))
X_test_norm.columns = X_test.columns
X_test_norm.index = X_test.index

pca = PCA(n_components=700)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_norm))
X_train_pca.index = X_train.index

X_test_pca = pd.DataFrame(pca.transform(X_test_norm))
X_test_pca.index = X_test.index

## Models : Binary Relevance (Gradient Boost)

In [None]:
weight_ratio = 0.7
n_estimators = 50
max_depth = 3
early_stopping_rounds = 2

current = 0
start = 0
model_dic_gb = {}

def calculate_model(i, y):
    global model_dic_gb
    global current
    global start
    
    start += 1
    start_time = time.process_time()
    print("Start " + str(start) + " ")

    num_train = X_train.shape[0]
    X = X_train.values[:round(num_train*0.85)].copy() # train
    X1 = X_train.values[round(num_train*0.85):].copy() # validation
    y1 = y[round(num_train*0.85):] # y validatation
    y = y[:round(num_train*0.85)] # y train
    X2 = X_test.values
    # class_weight for each target column
    scale_pos_weight = min(round(len(y)/sum(y)-1)*weight_ratio, 8000)
    gbc = XGBClassifier(n_estimators = n_estimators,
                        max_depth = max_depth,
                        scale_pos_weight = scale_pos_weight,
                        n_jobs = -1,
                        random_state=100)
    eval_set = [(X1, y1)]
    gbc.fit(X, y, eval_set=eval_set, eval_metric='logloss', early_stopping_rounds = early_stopping_rounds, verbose=False);
    
    model_dic_gb[i] = gbc
    current += 1
    print("--- Completed %s, %s, %.4f mins ---" % (current, i, (time.process_time() - start_time) / 60))
    return i

with ThreadPoolExecutor(max_workers=4) as t:
    futures = []
    for i, y in y_train.iteritems():
        futures.append(t.submit(calculate_model, i, y))

    for future in as_completed(futures):
        print(future.result())
    
print(model_dic_gb.keys())

Start 1 
Start 2 
Start 3 
Start 4 
--- Completed 1, 5-alpha_reductase_inhibitor, 5.7792 mins ---
Start 5 5-alpha_reductase_inhibitor

--- Completed 2, acat_inhibitor, 6.9880 mins ---
Start 6 acat_inhibitor

--- Completed 3, 11-beta-hsd1_inhibitor, 9.0095 mins ---
Start 7 11-beta-hsd1_inhibitor

--- Completed 4, acetylcholine_receptor_agonist, 10.2732 mins ---
Start 8 acetylcholine_receptor_agonist

--- Completed 5, acetylcholine_receptor_antagonist, 10.3052 mins ---
Start 9 acetylcholine_receptor_antagonist

--- Completed 6, acetylcholinesterase_inhibitor, 10.3412 mins ---
Start 10 acetylcholinesterase_inhibitor

--- Completed 7, adenosine_receptor_agonist, 9.5530 mins ---
Start 11 adenosine_receptor_agonist

--- Completed 8, adenosine_receptor_antagonist, 10.2425 mins ---
Start 12 adenosine_receptor_antagonist

--- Completed 9, adenylyl_cyclase_activator, 5.0440 mins ---
Start 13 adenylyl_cyclase_activator

--- Completed 10, aldehyde_dehydrogenase_inhibitor, 3.7099 mins ---
Start 14 

In [None]:
y_train_predict = pd.DataFrame([], columns=y_train.columns)
y_test_predict = pd.DataFrame([], columns=y_train.columns)
X = X_train.values
X2 = X_test.values

for i, gbc in tqdm(model_dic_gb.items()):
    y_train_predict[i] = gbc.predict(X)
    y_test_predict[i] = gbc.predict(X2)

In [9]:
# binary relavance
print('weight_ratio:',weight_ratio, ' n_estimators:',n_estimators, ' max_depth:', max_depth, ' early_stopping_rounds:', early_stopping_rounds)
print('Precision: ')
print(precision_score(y_train, y_train_predict))
print('Recall: ')
print(recall_score(y_train, y_train_predict))
print('F1: ')
print(f1_score(y_train, y_train_predict))
print()
print('Precision: ')
print(precision_score(y_test, y_test_predict))
print('Recall: ')
print(recall_score(y_test, y_test_predict))
print('F1: ')
print(f1_score(y_test, y_test_predict))
print()
print(y_train.sum().sum())
print(y_train_predict.sum().sum())
print()
print(y_test.sum().sum())
print(y_test_predict.sum().sum())

weight_ratio: 0.73  n_estimators: 75  max_depth: 4  early_stopping_rounds: 2
Precision: 
0.9923206751054853
Recall: 
1.0
F1: 
0.9961455377186667

Precision: 
0.7435684647302905
Recall: 
0.2636069432185937
F1: 
0.3892267593397046

11759
11850

3399
1205


### Grid search with cross-validatation

In [None]:
folds = 5

current = 0
start = 0
best_model = {}
best_params = {}
X = X_train_pca.values
def search_model(i, col_name, y):
    global best_model
    global best_params
    global current
    global start
    
    start += 1
    start_time = time.process_time()
    print("Start " + str(start) + " ")
    
#     pca = PCA()
    gbc = XGBClassifier(learning_rate=0.008, n_jobs=-1, random_state = 90)
#     pipe = Pipeline([('pca', pca), ('xgb', gbc)])

    ratio = round(len(y)/sum(y)-1)
#     tuned_params = {
#         'pca__n_components':[500, 600, 700],
#         'xgb__scale_pos_weight':[round(weight_ratio*ratio) for weight_ratio in [0.64, 0.66, 0.68, 0.7]],
#         'xgb__max_depth': [3,4,5],
#         'xgb__gamma':[0.5, 1, 2, 4],
#         'xgb__colsample_bytree':[0.6, 0.8, 1],
#         'xgb__min_child_weight': [1, 5]
#     }
    tuned_params = {
        'scale_pos_weight':[round(weight_ratio*ratio) for weight_ratio in [0.64, 0.66, 0.68, 0.7]],
        'max_depth': [3,4,5],
        'gamma':[0.5, 1, 2, 4],
        'colsample_bytree':[0.6, 0.8, 1],
        'min_child_weight': [1, 5]
    }

    skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 200)
    search = RandomizedSearchCV(estimator = gbc, param_distributions=tuned_params,
                               cv = skf.split(X, y), scoring='neg_log_loss', n_iter=60,
                               random_state=100)
    search.fit(X, y)
    best_model[col_name] = search.best_estimator_
    best_params[col_name] = search.best_params_
    
    joblib.dump(best_model[col_name], f'./TrainedModels/GradBoost/gbc_{i}.joblib')
    current += 1
    print("--- Completed %s, %s, %.4f mins ---" % (current, col_name, (time.process_time() - start_time) / 60))
    
with ThreadPoolExecutor(max_workers=4) as t:
    futures = []
    for i, (col_name, y) in enumerate(y_train.iteritems()):
        futures.append(t.submit(search_model, i, col_name, y))

    for future in as_completed(futures):
        print(future.result())

Start 1 Start 2 

Start 3 
Start 4 


In [None]:
# inference - training
y_train_pred = np.zeros(y_train.shape).astype('float')
y_train_pred_proba = np.zeros(y_train.shape).astype('float')
for i, col_name in tqdm(enumerate(y_train.columns)):
    gbc = best_model.get(col_name, None)
    if gbc!=None:
        y_train_pred[:,i] = gbc.predict(X_train)
        y_train_pred_proba[:,i] = gbc.predict_proba(X_train)[:,1]
# overall log_loss
print(log_loss(y_train.values.reshape(-1, 1), y_train_pred_proba.reshape(-1,1)))
# overall precision
print(precision_score(y_train.values.reshape(-1, 1), y_train_pred.reshape(-1,1)))
# overall precision
print(recall_score(y_train.values.reshape(-1, 1), y_train_pred.reshape(-1,1)))

In [None]:
# inference - test
y_pred = np.zeros(y_test.shape).astype('float')
y_pred_proba = np.zeros(y_test.shape).astype('float')
for i, col_name in tqdm(enumerate(y_test.columns)):
    rfc = best_rfc.get(col_name, None)
    if rfc!=None:
        y_pred[:,i] = rfc.predict(X_test)
        y_pred_proba[:,i] = rfc.predict_proba(X_test)[:,1]
# overall log_loss
print(log_loss(y_test.values.reshape(-1, 1), y_pred_proba.reshape(-1,1)))
# overall precision
print(precision_score(y_test.values.reshape(-1, 1), y_pred.reshape(-1,1)))
# overall precision
print(recall_score(y_test.values.reshape(-1, 1), y_pred.reshape(-1,1)))

In [None]:
# recall, f1, for each column
recall = np.zeros(y_train.shape[1])
f1 = np.zeros(y_train.shape[1])
for i in range(y_predict.shape[1]):
    recall[i] = recall_score(y_test.values[:,i], y_pred[:,i])
    f1[i] = f1_score(y_test.values[:,i], y_pred[:,i])
    
# best weight_ratio
weight_ratio = np.ones(y_train.shape[1])*0.72
for i, (col_name, y) in enumerate(y_train.iteritems()):
    ratio[i] = round(len(y)/sum(y)-1)
    param = best_params.get(col_name, None)
    if param != None:
        weight_ratio[i] = round(param['class_weight'][1]/ratio[i], 2)

In [None]:
plt.figure(figsize=(15, 6))
plt.plot(weight_ratio)
plt.plot(recall)

In [None]:
weight_ratio[recall==0]

In [None]:
ratio[recall==0]

### Submission

In [None]:
df_submit = pd.read_csv(data_dir+'test_features.csv', index_col='sig_id')
df_submit['cp_type'].replace({'trt_cp':1., 'ctl_vehicle':0.}, inplace=True)
df_submit['cp_dose'].replace({'D1':1., 'D2':0.}, inplace=True)
df_submit.head()
# inference
X_submit = df_submit.values
y_submit_proba = pd.DataFrame([], columns=y_train.columns)
for i, gbc in tqdm(model_dic_gb.items()):
    y_predict_proba = gbc.predict_proba(X_submit)[:,1].reshape(-1,1)
    y_submit_proba[i] = y_predict_proba
    
y_submit_proba = pd.DataFrame(y_submit_proba)
y_submit_proba.index = df_submit.index
y_submit_proba.head()
y_submit_proba.to_csv('submission.csv', , float_format='%.2f')