# Predicting Biological Response
* Boehringer Ingelheim의 Kaggle project <br> https://www.kaggle.com/c/bioresponse
* 특정 성분(molecule)의 생물반응(Biological Response)를 예측하는 프로젝트
* logloss가 낮은 모델을 만드는 것이 목표

In [210]:
import pandas as pd # Analysis
import numpy as np # Analysis
from matplotlib import pyplot as plt # Visualize
from xgboost import XGBClassifier # Classification model
from lightgbm import LGBMClassifier # Classification model
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier # Classification model
from sklearn.svm import SVC # Classification model
from sklearn.decomposition import PCA # Preprocessing
from sklearn.preprocessing import StandardScaler # Preprocessing
from sklearn.feature_selection import SelectFromModel # Preprocessing
from sklearn.metrics import log_loss # Validation
from sklearn.model_selection import GridSearchCV # Validation
from sklearn.cross_validation import train_test_split, KFold, StratifiedKFold # Validation
%matplotlib inline

# Data Import

In [211]:
df = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [212]:
df.tail()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
3746,1,0.0333,0.506409,0.1,0.0,0.209887,0.633426,0.297659,0.376124,0.727093,...,0,0,0,0,0,0,0,0,0,0
3747,1,0.133333,0.651023,0.15,0.0,0.151154,0.766505,0.170876,0.404546,0.787935,...,0,0,1,0,1,0,1,0,0,0
3748,0,0.2,0.520564,0.0,0.0,0.179949,0.768785,0.177341,0.471179,0.872241,...,0,0,0,0,0,0,0,0,0,0
3749,1,0.1,0.765646,0.0,0.0,0.536954,0.634936,0.342713,0.447162,0.672689,...,0,0,0,0,0,0,0,0,0,0
3750,0,0.133333,0.533952,0.0,0.0,0.347966,0.757971,0.230667,0.272652,0.854116,...,0,0,0,0,0,0,0,0,0,0


In [213]:
test.tail()

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
2496,0.0667,0.658812,0.1,0.0,0.305799,0.614877,0.1809,0.219328,0.617916,0.324679,...,0,0,0,0,0,0,0,0,0,0
2497,0.0333,0.451048,0.0,0.0,0.230019,0.8496,0.114983,0.159589,0.916702,0.0432,...,0,0,0,0,0,0,0,0,0,0
2498,0.0,0.537887,0.15,0.0,0.144312,0.667734,0.283773,0.591918,0.760417,0.275136,...,0,0,0,0,0,0,0,0,0,0
2499,0.0333,0.538504,0.1,0.0,0.191739,0.577244,0.305091,0.554121,0.676559,0.38572,...,0,0,0,0,0,0,0,0,0,0
2500,0.166667,0.648932,0.05,0.0,0.225382,0.619299,0.329329,0.522098,0.704095,0.339546,...,0,0,0,0,0,0,0,0,0,0


* 각 Data Set의 행은 개별 성분을 의미하고, 열은 성분의 특성을 의미한다.
* 열을 구성하는 특성에는 분자의 크기와 모양, 구성요소 등이 포함되나, 이름이 D1~D1776으로 대체되었고 그 값도 정규화 되어있다.
* 변수값은 연속형과 이산형이 혼합되어있다.
* Train Set에는 생물반응 여부가 Activity 열에 표기되어 있다. (없음: 0, 있음: 1)
* Train Set은 3751개의 행(index 0~3750)으로, Test Set은 2501개의 행(index 0~2500)으로 구성되어 있다.

In [214]:
df['Activity'].value_counts()

1    2034
0    1717
Name: Activity, dtype: int64

독립변수는 df_X로, 종속변수는 df_y로 분리

In [215]:
df_X = df.iloc[:, 1:].copy()

In [216]:
df_X.head()

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,0.243144,...,0,0,0,0,0,0,0,0,0,0
1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,0.10648,...,1,1,1,1,0,1,0,0,1,0
2,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,0.352308,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,0.208989,...,0,0,0,0,0,0,0,0,0,0
4,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,0.125177,...,0,0,0,0,0,0,0,0,0,0


In [217]:
df_y = df.iloc[:, 0].copy()

* row는 3751개로 많지 않으나, column이 1776개로 많다.
* 변수가 너무 많은 것으로 판단되어 PCA를 적용한 데이터셋을 생성 한다.

In [218]:
# Hold out part of the labelled data for validation. random_state is fixed so
# the split is reproducible. Note that test_X / test_y are this hold-out
# split, not the unlabelled Kaggle file loaded into `test`.
train_X, test_X, train_y, test_y = train_test_split(df_X, df_y, random_state=1)

# PCA

In [219]:
pca = PCA()
scaler = StandardScaler()

In [220]:
pca.fit(df_X)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [221]:
opt_n = pca.explained_variance_ratio_

In [222]:
# Report how many leading components are needed for the cumulative
# explained-variance ratio to reach the target (90% by default).
def optimal_n(start, ratio, x, threshold=0.90):
    """Return a summary of where the cumulative ratio first reaches `threshold`.

    Args:
        start: Index of the first entry of `x` to add.
        ratio: Variance ratio already accumulated before `start`
            (pass 0 to begin from scratch).
        x: Indexable, sized sequence of per-component explained-variance
            ratios, e.g. ``pca.explained_variance_ratio_``.
        threshold: Cumulative ratio to reach. Defaults to 0.90.

    Returns:
        A string ``'<pct>%: <count>, <cumulative ratio>'`` where ``count`` is
        the number of components consumed (index of the last one plus one).

    Raises:
        IndexError: If `x` is exhausted before `threshold` is reached.
    """
    total = ratio
    # Iterate instead of recursing: one stack frame regardless of how many
    # components are needed, so wide feature sets cannot hit the recursion
    # limit. Values are added in index order, one at a time.
    for count in range(start + 1, len(x) + 1):
        total += x[count - 1]
        if total >= threshold:
            return '{:g}%: {}, {}'.format(threshold * 100, count, total)
    raise IndexError(
        'cumulative ratio {} never reached threshold {}'.format(total, threshold)
    )

In [223]:
optimal_n(0, 0, opt_n)

'90%: 242, 0.9001024831518873'

* PCA fit결과, 242개 차원으로 90%의 설명력을 가질 수 있는 것으로 판단된다.
* 상기 결과는 전체 데이터 셋을 대상으로 하였으므로, 200개로 PCA를 수행.

In [224]:
pca = PCA(n_components=200)
pca.fit(train_X)

PCA(copy=True, iterated_power='auto', n_components=200, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [225]:
# Project each split onto the fitted principal components, then standardize
# the projected features. The scaler is fitted on the training projection
# only and merely applied to the hold-out projection.
pca_train = scaler.fit_transform(pca.transform(train_X))
pca_test = scaler.transform(pca.transform(test_X))

In [226]:
pca_train.shape

(2813, 200)

In [227]:
train_X.shape

(2813, 1776)

# Model selection
* 모델마다 train set, PCA set을 교차하여 검증

In [228]:
# Cross Validation
def model_cv(train, test, train_y, test_y, model, name):
    """Fit `model` on the training split and print a hold-out report.

    Args:
        train: Training features.
        test: Hold-out features.
        train_y: Training labels.
        test_y: Hold-out labels.
        model: Unfitted search object; it must expose ``best_params_`` after
            ``fit`` as well as ``predict_proba`` and ``score``.
        name: Label printed in front of the best parameters.

    Returns:
        None. The selected parameters, both ``score`` values and the hold-out
        log loss are printed, followed by a blank line.
    """
    model.fit(train, train_y)
    print(name, ': ', model.best_params_)

    # Class probabilities are what log_loss is computed from below.
    holdout_proba = model.predict_proba(test)

    train_score = model.score(train, train_y)
    print('train score: {}'.format(train_score))

    holdout_score = model.score(test, test_y)
    print('test score: {}'.format(holdout_score))

    holdout_loss = log_loss(test_y, holdout_proba)
    print('log loss: {}'.format(holdout_loss))
    print()

In [229]:
# Models 
def forest(train, test, train_y, test_y):
    """Grid-search a random forest and print its report through model_cv.

    The search covers tree count and split criterion with 3-fold CV. No
    `scoring` is passed to GridSearchCV, so candidates are ranked by the
    estimator's default scorer.

    Args:
        train: Training features.
        test: Hold-out features.
        train_y: Training labels.
        test_y: Hold-out labels.

    Returns:
        Whatever model_cv returns.
    """
    grid = {
        'n_estimators': [100, 300, 500, 1000],
        'criterion': ['gini', 'entropy'],
    }
    search = GridSearchCV(
        estimator=RandomForestClassifier(n_jobs=-1),
        param_grid=[grid],
        cv=3,
    )
    return model_cv(train, test, train_y, test_y, search, 'Random forest')
    
def grbc(train, test, train_y, test_y):
    """Grid-search sklearn gradient boosting and print its report through model_cv.

    The search covers the number of boosting stages and the learning rate
    with 3-fold CV. No `scoring` is passed to GridSearchCV, so candidates are
    ranked by the estimator's default scorer.

    Args:
        train: Training features.
        test: Hold-out features.
        train_y: Training labels.
        test_y: Hold-out labels.

    Returns:
        Whatever model_cv returns.
    """
    grid = {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.1, 0.05, 0.01],
    }
    search = GridSearchCV(
        estimator=GradientBoostingClassifier(),
        param_grid=[grid],
        cv=3,
    )
    return model_cv(train, test, train_y, test_y, search, 'Gradient boosting')
    
def xgbc(train, test, train_y, test_y):
    """Grid-search an XGBoost classifier and print its report through model_cv.

    The search covers the number of boosting rounds and the learning rate
    with 3-fold CV. No `scoring` is passed to GridSearchCV, so candidates are
    ranked by the estimator's default scorer.

    Args:
        train: Training features.
        test: Hold-out features.
        train_y: Training labels.
        test_y: Hold-out labels.

    Returns:
        Whatever model_cv returns.
    """
    grid = {
        'n_estimators': [100, 500, 1000],
        'learning_rate': [0.1, 0.05, 0.01],
    }
    search = GridSearchCV(
        estimator=XGBClassifier(),
        param_grid=[grid],
        cv=3,
    )
    return model_cv(train, test, train_y, test_y, search, 'XGBoost')

def lgbm(train, test, train_y, test_y):
    """Grid-search a LightGBM classifier and print its report through model_cv.

    The search covers the number of boosting rounds and the learning rate
    with 3-fold CV. No `scoring` is passed to GridSearchCV, so candidates are
    ranked by the estimator's default scorer.

    Args:
        train: Training features.
        test: Hold-out features.
        train_y: Training labels.
        test_y: Hold-out labels.

    Returns:
        Whatever model_cv returns.
    """
    grid = {
        'n_estimators': [100, 500, 1000],
        'learning_rate': [0.1, 0.05, 0.01],
    }
    search = GridSearchCV(
        estimator=LGBMClassifier(),
        param_grid=[grid],
        cv=3,
    )
    return model_cv(train, test, train_y, test_y, search, 'LightGBM')

In [230]:
# Random forest

print('Train set')
forest(train_X, test_X , train_y, test_y)
print('PCA set')
forest(pca_train, pca_test, train_y, test_y)

Train set
Random forest :  {'criterion': 'gini', 'n_estimators': 500}
train score: 1.0
test score: 0.8038379530916845
log loss: 0.4538010010289801

PCA set
Random forest :  {'criterion': 'gini', 'n_estimators': 300}
train score: 1.0
test score: 0.7633262260127932
log loss: 0.5243777752947284



In [231]:
# LightGBM

print('Train set')
lgbm(train_X, test_X , train_y, test_y)
print('PCA set')
lgbm(pca_train, pca_test, train_y, test_y)

Train set
LightGBM :  {'learning_rate': 0.05, 'n_estimators': 100}
train score: 0.9619623178101671
test score: 0.7846481876332623
log loss: 0.4426109369014938

PCA set
LightGBM :  {'learning_rate': 0.01, 'n_estimators': 1000}
train score: 0.9992890152861713
test score: 0.753731343283582
log loss: 0.4849034170739442



In [232]:
# XGBoost

print('Train set')
xgbc(train_X, test_X , train_y, test_y)
print('PCA set')
xgbc(pca_train, pca_test, train_y, test_y)

Train set
XGBoost :  {'learning_rate': 0.05, 'n_estimators': 1000}
train score: 0.9804479203697121
test score: 0.7921108742004265
log loss: 0.454485802923633

PCA set
XGBoost :  {'learning_rate': 0.05, 'n_estimators': 500}
train score: 0.9619623178101671
test score: 0.7484008528784648
log loss: 0.500959698477192



In [233]:
# Gradient Boosting

print('Train set')
grbc(train_X, test_X , train_y, test_y)
print('PCA set')
grbc(pca_train, pca_test, train_y, test_y)

Train set
Gradient boosting :  {'learning_rate': 0.1, 'n_estimators': 500}
train score: 0.9836473515819409
test score: 0.7963752665245203
log loss: 0.44931991833743673

PCA set
Gradient boosting :  {'learning_rate': 0.1, 'n_estimators': 300}
train score: 0.9840028439388553
test score: 0.7420042643923241
log loss: 0.500236958650612



# PCA is not an answer
* PCA를 적용한 set의 결과가 더 좋지 않았음.
* 변수들의 다중공선성이 낮은 것으로 판단된다.
* feature importance를 기준으로 중요한 변수만 활용해보는 방안.

In [234]:
# Fit a random forest on the training split to inspect feature importances.
# random_state is fixed so the importances are reproducible.
rf = RandomForestClassifier(criterion='entropy', n_estimators=300, random_state=0)
rf.fit(train_X, train_y)
# Bare expression: shown as the cell output.
rf.feature_importances_

array([  2.26847869e-03,   7.11628201e-03,   2.51183441e-03, ...,
         3.18774254e-05,   9.24437181e-06,   3.23754397e-05])

In [235]:
# Wrap the already-fitted forest as a feature selector (prefit=True, so it is
# not refitted). No threshold is passed, so SelectFromModel's default applies.
model = SelectFromModel(rf, prefit=True)
# Mask of the columns the selector keeps.
feature_idx = model.get_support()
# Capture the kept column names while train_X still has all of its columns.
feature_name = train_X.columns[feature_idx]
# NOTE(review): train_X / test_X are overwritten with the reduced feature set,
# so the full-width splits are no longer available after this cell.
train_X = model.transform(train_X)
test_X = model.transform(test_X)

In [236]:
train_X = pd.DataFrame(train_X, columns=feature_name)
test_X = pd.DataFrame(test_X, columns=feature_name)

* SelectFromModel을 이용, 375개의 변수를 기준으로 새로운 Dataframe 생성함.

In [237]:
forest(train_X, test_X , train_y, test_y)
lgbm(train_X, test_X , train_y, test_y)
xgbc(train_X, test_X , train_y, test_y)
grbc(train_X, test_X , train_y, test_y)

Random forest :  {'criterion': 'entropy', 'n_estimators': 500}
train score: 1.0
test score: 0.8017057569296375
log loss: 0.45014419393731336

LightGBM :  {'learning_rate': 0.01, 'n_estimators': 500}
train score: 0.9580519018841095
test score: 0.7889125799573561
log loss: 0.4449989530545316

XGBoost :  {'learning_rate': 0.05, 'n_estimators': 1000}
train score: 0.9808034127266264
test score: 0.7835820895522388
log loss: 0.4664072721561383

Gradient boosting :  {'learning_rate': 0.05, 'n_estimators': 500}
train score: 0.9353003910415926
test score: 0.7921108742004265
log loss: 0.454689994012779



* 375개의 변수만으로도 전체 변수를 사용했을 때와 유사한 성능을 발휘하는 것을 확인할 수 있었음.
* log loss가 가장 낮은 LightGBM을 최종 모델로 선정, 제출하고자 함

In [238]:
# Final estimators, configured with fixed hyperparameters instead of a grid search.
# NOTE(review): these assignments rebind forest / lgbm / xgbc / grbc, which
# earlier in this file name the grid-search helper functions. After this cell
# runs, those helpers can no longer be called without redefining them.
# NOTE(review): the forest uses n_estimators=300, whereas the grid-search
# output recorded in this notebook for the reduced feature set reports 500.
# Confirm which value is intended.
forest = RandomForestClassifier(criterion='entropy', n_estimators=300)
lgbm = LGBMClassifier(learning_rate=0.01, n_estimators=500)
xgbc = XGBClassifier(learning_rate=0.05, n_estimators=1000)
grbc = GradientBoostingClassifier(learning_rate=0.05, n_estimators=500)

In [239]:
forest.fit(train_X, train_y)
lgbm.fit(train_X, train_y)
xgbc.fit(train_X, train_y)
grbc.fit(train_X, train_y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=500, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

# Submit

In [240]:
test = pd.read_csv('data/test.csv')

In [241]:
test.tail()

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
2496,0.0667,0.658812,0.1,0.0,0.305799,0.614877,0.1809,0.219328,0.617916,0.324679,...,0,0,0,0,0,0,0,0,0,0
2497,0.0333,0.451048,0.0,0.0,0.230019,0.8496,0.114983,0.159589,0.916702,0.0432,...,0,0,0,0,0,0,0,0,0,0
2498,0.0,0.537887,0.15,0.0,0.144312,0.667734,0.283773,0.591918,0.760417,0.275136,...,0,0,0,0,0,0,0,0,0,0
2499,0.0333,0.538504,0.1,0.0,0.191739,0.577244,0.305091,0.554121,0.676559,0.38572,...,0,0,0,0,0,0,0,0,0,0
2500,0.166667,0.648932,0.05,0.0,0.225382,0.619299,0.329329,0.522098,0.704095,0.339546,...,0,0,0,0,0,0,0,0,0,0


In [242]:
tf = test.copy()

In [243]:
tf = model.transform(tf)

In [244]:
pred = lgbm.predict_proba(tf)[:,1]

In [245]:
# Build the submission table column-wise: 1-based molecule ids next to the
# predicted probabilities, in prediction order.
submit = pd.DataFrame({
    "MoleculeId": list(range(1, len(pred) + 1)),
    "PredictedProbability": pred,
})

In [246]:
submit.head()

Unnamed: 0,MoleculeId,PredictedProbability
0,1,0.712912
1,2,0.960323
2,3,0.519419
3,4,0.950301
4,5,0.0962


In [247]:
submit.to_csv('data/submit_lgbm0810.csv', index=False)

* LightGBM 제출 결과, Public Score 0.44096으로 168위에 해당하는 점수를 얻었음