# Predicting Biological Response
* Beohringer Ingelheim 의 Kaggle project <br> https://www.kaggle.com/c/bioresponse
* 특정 성분(molecule)의 생물반응(Biological Response)를 예측하는 프로젝트
* logloss가 낮은 모델을 만드는 것이 목표

In [100]:
import pandas as pd # Analysis
import numpy as np # Analysis
from matplotlib import pyplot as plt # Visualize
%matplotlib inline

# Data Import

In [101]:
df = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [102]:
df.tail()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
3746,1,0.0333,0.506409,0.1,0.0,0.209887,0.633426,0.297659,0.376124,0.727093,...,0,0,0,0,0,0,0,0,0,0
3747,1,0.133333,0.651023,0.15,0.0,0.151154,0.766505,0.170876,0.404546,0.787935,...,0,0,1,0,1,0,1,0,0,0
3748,0,0.2,0.520564,0.0,0.0,0.179949,0.768785,0.177341,0.471179,0.872241,...,0,0,0,0,0,0,0,0,0,0
3749,1,0.1,0.765646,0.0,0.0,0.536954,0.634936,0.342713,0.447162,0.672689,...,0,0,0,0,0,0,0,0,0,0
3750,0,0.133333,0.533952,0.0,0.0,0.347966,0.757971,0.230667,0.272652,0.854116,...,0,0,0,0,0,0,0,0,0,0


In [103]:
test.tail()

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
2496,0.0667,0.658812,0.1,0.0,0.305799,0.614877,0.1809,0.219328,0.617916,0.324679,...,0,0,0,0,0,0,0,0,0,0
2497,0.0333,0.451048,0.0,0.0,0.230019,0.8496,0.114983,0.159589,0.916702,0.0432,...,0,0,0,0,0,0,0,0,0,0
2498,0.0,0.537887,0.15,0.0,0.144312,0.667734,0.283773,0.591918,0.760417,0.275136,...,0,0,0,0,0,0,0,0,0,0
2499,0.0333,0.538504,0.1,0.0,0.191739,0.577244,0.305091,0.554121,0.676559,0.38572,...,0,0,0,0,0,0,0,0,0,0
2500,0.166667,0.648932,0.05,0.0,0.225382,0.619299,0.329329,0.522098,0.704095,0.339546,...,0,0,0,0,0,0,0,0,0,0


* 각 Data Set의 행은 개별 성분을 의미하고, 열은 성분의 특성을 의미한다.
* 열을 구성하는 특성에는 분자의 크기와 모양, 구성요소 등이 포함되나, 이름이 D1~D1776으로 대체되었고 그 값도 정규화 되어있다.
* 변수값은 연속형과 이산형이 혼합되어있다.
* Train Set에는 생물반응 여부가 Activity 열에 표기되어 있다. (없음: 0, 있음: 1)
* Train Set은 3750개의 행으로, Test Set은 2500개의 행으로 구성되어 있다.

In [104]:
df['Activity'].value_counts()

1    2034
0    1717
Name: Activity, dtype: int64

독립변수는 df_X로, 종속변수는 df_y로 분리

In [105]:
df_X = df.iloc[:, 1:]

In [106]:
df_X.head()

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,0.243144,...,0,0,0,0,0,0,0,0,0,0
1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,0.10648,...,1,1,1,1,0,1,0,0,1,0
2,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,0.352308,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,0.208989,...,0,0,0,0,0,0,0,0,0,0
4,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,0.125177,...,0,0,0,0,0,0,0,0,0,0


In [107]:
df_y = df.iloc[:, :1]
df_y.head()

Unnamed: 0,Activity
0,1
1,1
2,1
3,1
4,0


# PCA

In [108]:
from sklearn.decomposition import PCA
pca = PCA()

In [109]:
pca.fit(df_X)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [110]:
opt_n = pca.explained_variance_ratio_

In [111]:
def optimal_n(start, ratio, x):
    n = start
    def sum_var(n, ratio, x, comp):
        ratio += comp
        if ratio >= 0.99:
            return n, ratio
        else:
            return optimal_n(n, ratio, x)
    comp = x[n]
    n += 1
    return sum_var(n, ratio, x, comp)

In [112]:
optimal_n(0, 0, opt_n)

(583, 0.99005698994111968)

In [113]:
opt_pca = PCA(n_components=583)

In [114]:
opt_pca.fit(df_X)

PCA(copy=True, iterated_power='auto', n_components=583, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [115]:
df_X = opt_pca.transform(df_X)

In [117]:
df_X.shape

(3751, 583)

In [116]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(df_X, df_y)

# Random forest

In [118]:
from sklearn.ensemble import RandomForestClassifier

In [119]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [120]:
param = [{'n_estimators':[100, 300, 500, 1000],
         'criterion': ['gini', 'entropy']
         }]

In [121]:
model = GridSearchCV(RandomForestClassifier(n_jobs=-1), param)
model.fit(train_X, train_y.values.ravel())

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [100, 300, 500, 1000], 'criterion': ['gini', 'entropy']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [122]:
model.best_params_

{'criterion': 'entropy', 'n_estimators': 500}

In [123]:
for mean, std, params in zip(model.cv_results_['mean_test_score'], model.cv_results_['std_test_score'], model.cv_results_['params']):
    print("{mean} (+/- {double_std}) for {param}".format(mean=mean, double_std=2*std, param=params))

0.7237824386775684 (+/- 0.041960069255322044) for {'criterion': 'gini', 'n_estimators': 100}
0.7358691788126556 (+/- 0.025964139853554526) for {'criterion': 'gini', 'n_estimators': 300}
0.7383576253110559 (+/- 0.03600358478684624) for {'criterion': 'gini', 'n_estimators': 500}
0.745111980092428 (+/- 0.01915662524816848) for {'criterion': 'gini', 'n_estimators': 1000}
0.7355136864557412 (+/- 0.01986666527258593) for {'criterion': 'entropy', 'n_estimators': 100}
0.7387131176679701 (+/- 0.02260038296578013) for {'criterion': 'entropy', 'n_estimators': 300}
0.7465339495200853 (+/- 0.0018961964108231448) for {'criterion': 'entropy', 'n_estimators': 500}
0.740846071809456 (+/- 0.01370853256787007) for {'criterion': 'entropy', 'n_estimators': 1000}


In [124]:
predicted = model.predict(test_X)

In [125]:
mean_squared_error(test_y, predicted)

0.25906183368869934

# XGBoost

In [126]:
from xgboost import XGBClassifier

In [127]:
param = [{'n_estimators': [100, 500, 1000],
         'learning_rate': [0.1, 0.05, 0.01],}]

In [128]:
model = GridSearchCV(XGBClassifier(), param)
model.fit(train_X, train_y.values.ravel())

GridSearchCV(cv=None, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [100, 500, 1000], 'learning_rate': [0.1, 0.05, 0.01]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [129]:
model.best_params_

{'learning_rate': 0.01, 'n_estimators': 1000}

In [130]:
for mean, std, params in zip(model.cv_results_['mean_test_score'], model.cv_results_['std_test_score'], model.cv_results_['params']):
    print("{mean} (+/- {double_std}) for {param}".format(mean=mean, double_std=2*std, param=params))

0.7419125488801991 (+/- 0.016541398304126754) for {'learning_rate': 0.1, 'n_estimators': 100}
0.746178457163171 (+/- 0.016406392030499133) for {'learning_rate': 0.1, 'n_estimators': 500}
0.7454674724493423 (+/- 0.014972902517215055) for {'learning_rate': 0.1, 'n_estimators': 1000}
0.7326697476004266 (+/- 0.026217945615108387) for {'learning_rate': 0.05, 'n_estimators': 100}
0.745111980092428 (+/- 0.00879463660490349) for {'learning_rate': 0.05, 'n_estimators': 500}
0.7454674724493423 (+/- 0.0018957720247784673) for {'learning_rate': 0.05, 'n_estimators': 1000}
0.7092072520440811 (+/- 0.03840869250702781) for {'learning_rate': 0.01, 'n_estimators': 100}
0.7355136864557412 (+/- 0.019618152625312368) for {'learning_rate': 0.01, 'n_estimators': 500}
0.7479559189477426 (+/- 0.008444514249690619) for {'learning_rate': 0.01, 'n_estimators': 1000}


In [131]:
predicted = model.predict(test_X)

In [132]:
mean_squared_error(test_y, predicted)

0.27611940298507465

# Gradient Boosting

In [133]:
from sklearn.ensemble import GradientBoostingClassifier

In [134]:
param = [{'n_estimators': [100, 500, 1000],
         'learning_rate': [0.1, 0.05, 0.01]}]

In [135]:
model = GridSearchCV(GradientBoostingClassifier(), param)
model.fit(train_X, train_y.values.ravel())

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [100, 500, 1000], 'learning_rate': [0.1, 0.05, 0.01]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [136]:
model.best_params_

{'learning_rate': 0.1, 'n_estimators': 1000}

In [137]:
for mean, std, params in zip(model.cv_results_['mean_test_score'], model.cv_results_['std_test_score'], model.cv_results_['params']):
    print("{mean} (+/- {double_std}) for {param}".format(mean=mean, double_std=2*std, param=params))

0.7369356558833985 (+/- 0.015464899335027614) for {'learning_rate': 0.1, 'n_estimators': 100}
0.7454674724493423 (+/- 0.011105860703642417) for {'learning_rate': 0.1, 'n_estimators': 500}
0.7522218272307145 (+/- 0.013939284340494512) for {'learning_rate': 0.1, 'n_estimators': 1000}
0.7362246711695698 (+/- 0.018882624283499693) for {'learning_rate': 0.05, 'n_estimators': 100}
0.7486669036615713 (+/- 0.012026081856682814) for {'learning_rate': 0.05, 'n_estimators': 500}
0.7426235335940278 (+/- 0.018427392629558812) for {'learning_rate': 0.05, 'n_estimators': 1000}
0.7052968361180235 (+/- 0.03268566598251971) for {'learning_rate': 0.01, 'n_estimators': 100}
0.730892285815855 (+/- 0.01595268882441531) for {'learning_rate': 0.01, 'n_estimators': 500}
0.7447564877355137 (+/- 0.004360054144459657) for {'learning_rate': 0.01, 'n_estimators': 1000}


In [138]:
predicted = model.predict(test_X)

In [139]:
mean_squared_error(test_y, predicted)

0.2462686567164179

# SVM

In [286]:
from sklearn.svm import SVC

In [287]:
param = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
         'C': [1, 10, 100, 1000]},
        {'kernel':['linear'], 'C': [1, 10, 100, 1000]}]

In [None]:
model = GridSearchCV(SVC(), param)
model.fit(train_X, train_y.values.ravel())

In [None]:
model.best_params_

In [None]:
for mean, std, params in zip(model.cv_results_['mean_test_score'], model.cv_results_['std_test_score'], model.cv_results_['params']):
    print("{mean} (+/- {double_std}) for {param}".format(mean=mean, double_std=2*std, param=params))

In [None]:
predicted = model.predict(test_X)

In [None]:
mean_squared_error(test_y, predicted)

In [204]:
df = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
df_X = df.iloc[:, 1:]

# Validation test set

In [140]:
forest = RandomForestClassifier(criterion='entropy', n_estimators=500, n_jobs=-1)

In [141]:
forest.fit(train_X, train_y)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [142]:
print('train: {}'.format(forest.score(train_X, train_y)))
print('test: {}'.format(forest.score(test_X, test_y)))

train: 1.0
test: 0.7484008528784648


In [143]:
xgbc = XGBClassifier(learning_rate=0.01, n_estimators=1000)

In [144]:
xgbc.fit(train_X, train_y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [145]:
print('train: {}'.format(xgbc.score(train_X, train_y)))
print('test: {}'.format(xgbc.score(test_X, test_y)))

train: 0.919303234980448
test: 0.7238805970149254


# Submit

In [31]:
test = pd.read_csv('data/test.csv')

In [32]:
opt_pca.fit(test)

PCA(copy=True, iterated_power='auto', n_components=242, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [33]:
test = opt_pca.transform(test)

In [34]:
pred1 = forest.predict_proba(test)

In [35]:
submit = pd.DataFrame(pred1, columns=xgbc.classes_)

In [36]:
submit.index += 1

In [37]:
submit.tail()

Unnamed: 0,0,1
2497,0.6,0.4
2498,0.604,0.396
2499,0.438,0.562
2500,0.45,0.55
2501,0.47,0.53


In [39]:
submit.to_csv('data/submit_forest.csv')