In [13]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from tqdm import tqdm

In [2]:
def process_df(df):
    """Return a copy of ``df`` with ``cp_time`` and ``cp_dose`` numerically encoded.

    ``cp_time`` is mapped 24 -> -1, 48 -> 0, 72 -> 1 and ``cp_dose`` is mapped
    'D1' -> 0, 'D2' -> 1.  Values that are not keys of these mappings become
    NaN, which is what ``Series.map`` does with a dict argument.

    The input frame is not modified, so calling this again on the same raw
    frame gives the same result instead of mapping already-encoded values to
    NaN.  The columns are replaced wholesale rather than written through
    ``df.loc[:, col] = ...``: whole-column replacement lets each encoded
    column take the dtype of the mapped values, whereas on recent pandas
    versions a ``.loc`` write can keep the original column dtype (e.g.
    ``object`` for ``cp_dose``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``cp_time`` and ``cp_dose`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order; no column is
        dropped.
    """
    cp_time_codes = {24: -1, 48: 0, 72: 1}
    cp_dose_codes = {'D1': 0, 'D2': 1}
    return df.assign(
        cp_time=df['cp_time'].map(cp_time_codes),
        cp_dose=df['cp_dose'].map(cp_dose_codes),
    )

In [3]:
# Feature tables: read from ../data and passed through process_df.
train_features = process_df(pd.read_csv('../data/train_features.csv'))
test_features = process_df(pd.read_csv('../data/test_features.csv'))

# Target tables (scored and non-scored): loaded as-is, no preprocessing.
train_targets = pd.read_csv('../data/train_targets_scored.csv')
train_targets_ns = pd.read_csv('../data/train_targets_nonscored.csv')

In [4]:
# All columns of the scored-targets table after the first one are treated as targets.
TARGETS = train_targets.columns[1:]

# Feature columns are picked by substring match on the column name.
G_FEATURES = list(filter(lambda name: 'g-' in name, train_features.columns))
C_FEATURES = list(filter(lambda name: 'c-' in name, train_features.columns))

# Combined list: every 'g-' column first, then every 'c-' column.
FEATURES = [*G_FEATURES, *C_FEATURES]

In [5]:
# Seed passed as random_state to the LogisticRegression models in this notebook.
SEED = 42

In [6]:
# 5-fold stratified splitter shared by the cross-validation cells.
# No shuffle/random_state is passed, so the library defaults apply and SEED is not used here.
skf = StratifiedKFold(n_splits=5)

## Feature Selection

In [16]:
def _cv_log_loss(x, y, folds):
    """Mean validation log-loss of a logistic regression over the given folds.

    Parameters
    ----------
    x : numpy.ndarray
        Feature matrix, indexed by row with the fold index arrays.
    y : numpy.ndarray
        Target vector aligned with ``x``.
    folds : iterable of (train_idx, valid_idx)
        Index pairs as produced by ``skf.split``.

    Returns
    -------
    float
        Average of the per-fold log-loss values.
    """
    fold_scores = []
    for train_idx, valid_idx in folds:
        model = LogisticRegression(max_iter=10000, tol=0.1, C=0.5, verbose=0, random_state=SEED)
        model.fit(x[train_idx], y[train_idx])
        valid_pred = model.predict_proba(x[valid_idx])
        # Pass the label set explicitly: without it log_loss infers the labels
        # from the validation targets and fails when a fold holds one class only.
        # NOTE(review): fit() is expected to still fail if a *training* fold
        # contains a single class -- confirm how such targets should be handled.
        fold_scores.append(log_loss(y[valid_idx], valid_pred, labels=model.classes_))
    return np.mean(fold_scores)


# all_data[i][j] is the cross-validated log-loss for TARGETS[i] when the model
# sees only cp_time, cp_dose and FEATURES[j] (lower is better).
all_data = []
for target in TARGETS:
    t0 = time.time()
    # The target vector and the folds do not depend on the feature, so build
    # them once per target instead of once per feature.  Only y determines
    # the stratified split; the zeros array is a placeholder for X.
    y = train_targets[target].values
    folds = list(skf.split(np.zeros(len(y)), y))
    feat_importance = []
    for feature in tqdm(FEATURES):
        x = train_features[['cp_time', 'cp_dose', feature]].values
        feat_importance.append(_cv_log_loss(x, y, folds))
    all_data.append(feat_importance)
    t1 = time.time()
    print('-'*50)
    print(target, np.min(feat_importance))
    # Elapsed seconds for this target (end minus start, so it is positive).
    print(t1 - t0)
    print('-'*50)

 78%|███████▊  | 680/872 [03:14<00:55,  3.49it/s]


KeyboardInterrupt: 

In [29]:
# Cross-validated log-loss for a single target using cp_time, cp_dose and a
# hand-picked list of feature columns.
target = '5-alpha_reductase_inhibitor'
features = ['g-407', 'g-278', 'g-235']
x = train_features[['cp_time', 'cp_dose'] + features].values
y = train_targets[target].values

# Per-fold losses are kept in their own list so `score` is only ever the mean.
fold_scores = []
for train_idx, valid_idx in skf.split(x, y):
    train_x, train_y = x[train_idx], y[train_idx]
    valid_x, valid_y = x[valid_idx], y[valid_idx]
    model = LogisticRegression(max_iter=10000, tol=0.1, C=0.5, verbose=0, random_state=SEED)
    model.fit(train_x, train_y)
    valid_pred = model.predict_proba(valid_x)
    # Explicit labels keep log_loss working when a validation fold happens to
    # contain a single class (e.g. if `target` is switched to a rarer one).
    fold_scores.append(log_loss(valid_y, valid_pred, labels=model.classes_))
score = np.mean(fold_scores)

print(score)

0.005330493200559128
