In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import log_loss, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

import time
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import wait, ALL_COMPLETED, as_completed
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

data_dir = './DATA/lish-moa/'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


***
# Table of Contents   
1. Load the dataset  
2. Preprocessing   
3. Models : Binary Relevance (Logistic Regression)  
***

In [7]:
X = pd.read_csv(data_dir+'train_features.csv', index_col='sig_id')
y = pd.read_csv(data_dir+'train_targets_scored.csv', index_col='sig_id')

In [8]:
# One-hot encoding for cp_type and cp_dose
X['cp_type'].replace({'trt_cp':1., 'ctl_vehicle':0.}, inplace=True)
X['cp_dose'].replace({'D1':1., 'D2':0.}, inplace=True)

# split into training set and test set
SEED = 123
np.random.seed(SEED)

ids = X.index.values.copy()
np.random.shuffle(ids)

train_perc, test_perc = 0.85, 0.2
train_id = ids[:round(len(ids)*train_perc)]
test_id = ids[round(len(ids)*train_perc):]

X_train = X.loc[train_id]
X_test = X.loc[test_id]

y_train = y.loc[train_id]
y_test = y.loc[test_id]

# normalize the data
scaler = StandardScaler()
X_train_norm = pd.DataFrame(scaler.fit_transform(X_train))
X_train_norm.columns = X_train.columns
X_train_norm.index = X_train.index

X_test_norm = pd.DataFrame(scaler.transform(X_test))
X_test_norm.columns = X_test.columns
X_test_norm.index = X_test.index



In [22]:
# 
pca = PCA(n_components=700)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train_norm))
X_train_pca.index = X_train.index

X_test_pca = pd.DataFrame(pca.transform(X_test_norm))
X_test_pca.index = X_test.index

## Model

In [23]:
weight_ratio = 0.075
penalty = 'l1'
C = 1

current = 0
start = 0
model_dic_lr = {}

def calculate_model(col_name, y):
    global model_dic_lr
    global current
    global start
    
    start += 1
    start_time = time.process_time()
#     print("Start " + str(start) + " ")
    
    X = X_train_pca.values# train
    # class_weight for each target column
    class_weight = {0:1, 1:min(round(len(y)/sum(y)-1)*weight_ratio, 8000)}
    
    
    lr = LogisticRegression(penalty = penalty,
                            C = C,
                            class_weight = class_weight,
                            n_jobs = -1,
                            solver = 'saga',
                            random_state = 100)
    
    lr.fit(X, y);
    model_dic_lr[col_name] = lr
    current += 1
#     print("--- Completed %s, %s, %.4f mins ---" % (current, col_name, (time.process_time() - start_time) / 60))
    return col_name

# with ThreadPoolExecutor(max_workers=4) as t:
#     futures = []
#     for col_name, y in y_train.iteritems():
#         futures.append(t.submit(calculate_model, col_name, y))

#     for future in as_completed(futures):
#         print(future.result())
    
for col_name, y in tqdm(y_train.iteritems()):
        calculate_model(col_name, y);

206it [4:10:29, 72.96s/it]


In [26]:
y_train_pred = pd.DataFrame([], columns=y_train.columns)
y_test_pred = pd.DataFrame([], columns=y_train.columns)
X = X_train_pca.values
X2 = X_test_pca.values
for col_name, lr in tqdm(model_dic_lr.items()):
    y_train_pred[col_name] = lr.predict(X)
    y_test_pred[col_name] = lr.predict(X2)

100%|██████████| 206/206 [01:07<00:00,  3.05it/s]


In [27]:
# binary relavance
print('weight_ratio:',weight_ratio, ' penalty:',penalty, ' C:', C, ' pca:', n_components)
print('Precision: ')
print(precision_score(y_train.values.reshape(-1, 1), y_train_pred.values.reshape(-1, 1)))
print('Recall: ')
print(recall_score(y_train.values.reshape(-1, 1), y_train_pred.values.reshape(-1, 1)))
print('F1: ')
print(f1_score(y_train.values.reshape(-1, 1), y_train_pred.values.reshape(-1, 1)))
print()
print('Precision: ')
print(precision_score(y_test.values.reshape(-1, 1), y_test_pred.values.reshape(-1, 1)))
print('Recall: ')
print(recall_score(y_test.values.reshape(-1, 1), y_test_pred.values.reshape(-1, 1)))
print('F1: ')
print(f1_score(y_test.values.reshape(-1, 1), y_test_pred.values.reshape(-1, 1)))
print()
print(y_train.sum().sum())
print(y_train_pred.sum().sum())
print()
print(y_test.sum().sum())
print(y_test_pred.sum().sum())

weight_ratio: 0.075  penalty: l1  C: 1  pca: 400
Precision: 
0.6452129362792187
Recall: 
0.703856364398491
F1: 
0.6732600487821178

Precision: 
0.29971181556195964
Recall: 
0.28774703557312253
F1: 
0.2936075821738253

14314
15615

2530
2429


### Cross validation

In [None]:
folds = 5

current = 0
start = 0
best_model = {}
best_params = {}
X = X_train_norm.values
def search_model(i, col_name, y):
    global best_model
    global best_params
    global current
    global start
    
    start += 1
    start_time = time.process_time()
    print("Start " + str(start) + " ")
    
#     pca = PCA()
    lr = LogisticRegression(n_jobs = -1,
                            solver = 'saga',
                            random_state = 100)
#     pipe = Pipeline(steps=[('pca',pca), ('logistic',lr)])

    ratio = round(len(y)/sum(y)-1)
    tuned_params = {
        'class_weight':[{0:1, 1:ratio*weight_ratio} for weight_ratio in [0.05, 0.075, 0.1]]
        'penalty':['l1', 'l2'],
        'C':[0.1, 0.2, 0.5, 1, 2]
    }


    skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 200)
    search = RandomizedSearchCV(estimator = lr, param_distributions=tuned_params,
                               cv = skf.split(X, y), scoring='neg_log_loss', n_iter = 20,
                               random_state=100)
    search.fit(X, y)
    best_model[col_name] = search.best_estimator_
    best_params[col_name] = search.best_params_
    
    joblib.dump(best_model[col_name], f'./TrainedModels/LogRegression/lr_{i}.joblib')
    current += 1
    print("--- Completed %s, %s, %.4f mins ---" % (current, col_name, (time.process_time() - start_time) / 60))
    
with ThreadPoolExecutor(max_workers=4) as t:
    futures = []
    for i, (col_name, y) in enumerate(y_train.iteritems()):
        futures.append(t.submit(search_model, i, col_name, y))

    for future in as_completed(futures):
        print(future.result())

In [None]:
# inference - training
y_train_pred = np.zeros(y_train.shape).astype('float')
y_train_pred_proba = np.zeros(y_train.shape).astype('float')
for i, col_name in tqdm(enumerate(y_train.columns)):
    lr = best_model.get(col_name, None)
    if gbc!=None:
        y_train_pred[:,i] = lr.predict(X_train_norm)
        y_train_pred_proba[:,i] = lr.predict_proba(X_trai_norm)[:,1]
# overall log_loss
print(log_loss(y_train.values.reshape(-1, 1), y_train_pred_proba.reshape(-1,1)))
# overall precision
print(precision_score(y_train.values.reshape(-1, 1), y_train_pred.reshape(-1,1)))
# overall precision
print(recall_score(y_train.values.reshape(-1, 1), y_train_pred.reshape(-1,1)))

In [None]:
lr# inference - test
y_pred = np.zeros(y_test.shape).astype('float')
y_pred_proba = np.zeros(y_test.shape).astype('float')
for i, col_name in tqdm(enumerate(y_test.columns)):
    lr = best_model.get(col_name, None)
    if rfc!=None:
        y_pred[:,i] = lr.predict(X_test_norm)
        y_pred_proba[:,i] = lr.predict_proba(X_test_norm)[:,1]
# overall log_loss
print(log_loss(y_test.values.reshape(-1, 1), y_pred_proba.reshape(-1,1)))
# overall precision
print(precision_score(y_test.values.reshape(-1, 1), y_pred.reshape(-1,1)))
# overall precision
print(recall_score(y_test.values.reshape(-1, 1), y_pred.reshape(-1,1)))