In [1]:
import gc
import os
from os.path import exists
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the feature tables and the scored training targets.
train = pd.read_csv('../input/lish-moa/train_features.csv')
test = pd.read_csv('../input/lish-moa/test_features.csv')
y_train = pd.read_csv('../input/lish-moa/train_targets_scored.csv')

# Stack train and test rows, then attach the targets by sig_id. The left join
# leaves NaN targets on every row whose sig_id is absent from y_train.
df = pd.concat([train, test], axis=0)
df = pd.merge(df, y_train, on=['sig_id'], how='left')

# Use -1 as the "no target available" sentinel, but only in the target
# columns: a frame-wide fillna would also overwrite missing feature values
# with -1, which is indistinguishable from a real measurement.
target_cols = y_train.columns.drop('sig_id')
df = df.fillna(dict.fromkeys(target_cols, -1))

# Plot colours; pos_neg_plot uses index 0 (magenta) for positive,
# index 1 (cyan) for negative and index 3 (gray) for unknown rows.
cs_c = ["#c0c", "#0cc", "#cc0", "#777"]

# The separate frames are no longer needed once merged into df.
del train, test
gc.collect()

0

In [3]:
# Column-name groups used to address the merged frame.
i_feats = ['sig_id']
x_feats = ['cp_type', 'cp_time', 'cp_dose']
# Only g-0 .. g-769 are listed here. The original note on this line says 772
# (presumably the full number of g-* columns; confirm before extending).
g_feats = ['g-%d' % idx for idx in range(770)]
c_feats = ['c-%d' % idx for idx in range(100)]
# Target names: every y_train column except the first one (presumably sig_id).
labels = list(y_train.columns[1:])

# シフトを探そう！

## ラベルごとに、各C特徴の分布（KDE）を比較

In [4]:
def pos_neg_plot(pos, neg, unk, colname, colset, feats, suffix="", batch_size=10):
    """Save KDE comparison grids for positive / negative / unknown rows.

    ``feats`` is processed in batches of ``batch_size`` features. For every
    batch one figure is drawn with one row per feature and one column per
    value in ``colset``. Each panel overlays the kernel density estimate of
    that feature for the rows of ``pos``, ``neg`` and ``unk`` whose ``colname``
    equals the panel's value. The figure is written to
    ``image/pos-neg_feats_by_{colname}_{suffix}/{first feature of batch}.pdf``
    and closed afterwards.

    Parameters
    ----------
    pos, neg, unk : pandas.DataFrame
        Frames containing ``colname`` and every column named in ``feats``.
    colname : str
        Column whose values select the rows shown in each grid column.
    colset : sequence
        Values of ``colname`` to compare, one grid column per value.
    feats : sequence of str
        Feature columns to plot. The length does not have to be a multiple
        of ``batch_size``; the last figure simply has fewer rows.
    suffix : str, optional
        Appended to the output directory name.
    batch_size : int, optional
        Number of features (grid rows) per saved figure. Defaults to 10.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    feats = list(feats)
    colset = list(colset)
    # Plain slicing instead of reshape((-1, 10)), which raises whenever the
    # number of features is not an exact multiple of the batch size.
    batches = [feats[k:k + batch_size] for k in range(0, len(feats), batch_size)]
    if not batches or not colset:
        return

    groups = (
        (pos, "positive", cs_c[0]),
        (neg, "negative", cs_c[1]),
        (unk, "unknown", cs_c[3]),
    )
    # Row masks depend only on (colset value, group), not on the feature, so
    # compute them once instead of re-filtering the frames for every panel.
    masks = [[frame[colname] == t for frame, _, _ in groups] for t in colset]

    dir_ = f"image/pos-neg_feats_by_{colname}_{suffix}"
    os.makedirs(dir_, exist_ok=True)

    num_s = len(colset)
    for batch_idx, feats_batch in enumerate(batches):
        num_c = len(feats_batch)
        # squeeze=False keeps axes two-dimensional even for a single row or
        # a single column, so axes[j][i] is always valid.
        fig, axes = plt.subplots(num_c, num_s, figsize=(5, 10),
                                 sharex=True, sharey=True, squeeze=False)
        fig.suptitle("pos(magenta) vs neg(cyan) vs unknown(gray)")
        for j, ci in enumerate(feats_batch):
            print(f'batch {batch_idx:>3} / {len(batches)}, plotting {j:>3} / {num_c:>3}', end='\r')
            for i in range(num_s):
                ax = axes[j][i]
                for (frame, name, color), mask in zip(groups, masks[i]):
                    # Select the single column directly; frame[mask][ci] would
                    # first copy every column of the filtered frame.
                    values = frame.loc[mask, ci]
                    # A KDE cannot be estimated from an empty or constant
                    # sample; leave that curve out instead of raising.
                    if values.nunique() < 2:
                        continue
                    values.plot(kind='kde', ax=ax, label=name, color=color)
                ax.grid(True)
            axes[j][0].set_ylabel(ci)
        for i, t in enumerate(colset):
            axes[0][i].set_title(f'{colname}:{t}')
            axes[0][i].set_xlim(-10, 10)

        fig.savefig(f"{dir_}/{feats_batch[0]}.pdf")
        # Close explicitly so figures do not accumulate across batches.
        plt.close(fig)

In [5]:
# Split the merged frame on the first target column:
# 1 -> pos, 0 -> neg, -1 (the fill value used for missing targets) -> unk.
label = labels[0]
pos, neg, unk = (df.loc[df[label].eq(flag)].copy() for flag in (1, 0, -1))

## cp_timeごとのc特徴

In [6]:
# One grid column per cp_time value, one row per c-* feature.
cp_times = [24, 48, 72]
pos_neg_plot(pos, neg, unk, colname='cp_time', colset=cp_times,
             feats=c_feats, suffix=label)

batch   9 / 10, plotting   9 /  10

## cp_doseごとのc特徴

In [7]:
# One grid column per cp_dose value, one row per c-* feature.
cp_dose = ['D1', 'D2']
pos_neg_plot(pos, neg, unk, colname='cp_dose', colset=cp_dose,
             feats=c_feats, suffix=label)

batch   9 / 10, plotting   9 /  10

## cp_typeごとのc特徴

In [8]:
# pos contains no ctl_vehicle rows (presumably why this comparison is disabled).
# cp_type = ["trt_cp", "ctl_vehicle"]
# pos_neg_plot(pos, neg, unk, 'cp_type', cp_type, c_feats, suffix=label)

## trt_cpのみでcp_timeごとのc特徴、cp_doseごとのc特徴

In [9]:
# NOTE(review): disabled cell. `train_test_plot` is not defined in the visible
# part of this notebook, and `train` / `test` are deleted right after loading,
# so this call cannot be re-enabled as-is.
# train_test_plot(train[train['cp_type']=='trt_cp'], 
#                 test[test['cp_type']=='trt_cp'], 
#                 'cp_time', cp_times, c_feats, suffix="trt_cp")

In [10]:
# NOTE(review): disabled cell. `train_test_plot` and `cp_doses` are not defined
# in the visible part of this notebook (the dose list above is named
# `cp_dose`), and `train` / `test` are deleted right after loading, so this
# call cannot be re-enabled as-is.
# train_test_plot(train[train['cp_type']=='trt_cp'], 
#                 test[test['cp_type']=='trt_cp'], 
#                 'cp_dose', cp_doses, c_feats)