In [None]:
import gc
import os
from os.path import exists
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read both feature tables and tag every row with the table it came from.
train = pd.read_csv('../input/lish-moa/train_features.csv').assign(part='train')
test = pd.read_csv('../input/lish-moa/test_features.csv').assign(part='test')

# Colour palette for the plots; train_test_plot draws train with entry 0
# and test with entry 2.
cs_c = ["#c0c", "#0cc", "#cc0", "#000"]

# Last expression of the cell: runs a collection pass and shows its return value.
gc.collect()

0

In [3]:
# Column-name groups used to select features from the train/test tables.
i_feats = ['sig_id']
x_feats = ['cp_type', 'cp_time', 'cp_dose']  # columns the plots below are grouped by
# NOTE(review): the original inline note said "772", presumably the real
# number of g-* columns; 770 keeps the list a multiple of 10 -- TODO confirm.
g_feats = ['g-%d' % k for k in range(770)]
c_feats = ['c-%d' % k for k in range(100)]

## cp_timeごとに、各C特徴で訓練/テストの分布を比較

In [13]:
def _plot_kde(series, ax, label, color):
    """Draw a KDE of ``series`` on ``ax``; leave the panel empty if impossible."""
    # A density estimate needs at least two distinct non-null values,
    # otherwise the underlying estimator raises.
    if series.nunique() < 2:
        return
    series.plot(kind='kde', ax=ax, label=label, color=color)


def train_test_plot(tr, te, colname, colset, feats, suffix="", batch_size=10):
    """Save train-vs-test KDE comparison grids as PDF files.

    ``feats`` is processed in batches of ``batch_size``. Each batch becomes
    one figure with one row per feature and one column per value of
    ``colset``; every panel overlays the density of the train rows and the
    test rows whose ``colname`` equals that value. Figures are written to
    ``image/tr-te_feats_by_{colname}_{suffix}/{first feature of batch}.pdf``.

    Args:
        tr: Train table (indexed with boolean masks and column labels,
            i.e. used like a pandas DataFrame).
        te: Test table with the same columns.
        colname: Column whose value defines the groups.
        colset: Values of ``colname`` to compare, one subplot column each.
        feats: Feature column names to plot; any length is accepted.
        suffix: Text appended to the output directory name.
        batch_size: Number of features (subplot rows) per figure.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    feats = list(feats)
    colset = list(colset)
    # Plain slicing instead of reshape((-1, 10)): the last batch may be short.
    batches = [feats[k:k + batch_size] for k in range(0, len(feats), batch_size)]
    if not batches:
        return

    # The row subsets depend only on the group value, so build them once
    # instead of re-filtering both tables for every feature.
    tr_groups = [tr[tr[colname] == t] for t in colset]
    te_groups = [te[te[colname] == t] for t in colset]

    dir_ = f"image/tr-te_feats_by_{colname}_{suffix}"
    os.makedirs(dir_, exist_ok=True)

    num_s = len(colset)
    for batch_idx, feats_batch in enumerate(batches):
        num_c = len(feats_batch)
        # squeeze=False keeps `axes` two-dimensional even for one row/column.
        fig, axes = plt.subplots(num_c, num_s, figsize=(5,10),
                                 sharex=True, sharey=True, squeeze=False)
        fig.suptitle("train(magenta) vs test(yellow)")
        for j, ci in enumerate(feats_batch):
            print(f'batch {batch_idx:>3} / {len(batches)}, plotting {j:>3} / {num_c:>3}', end='\r')
            for i in range(num_s):
                ax = axes[j][i]
                _plot_kde(tr_groups[i][ci], ax, "train", cs_c[0])
                _plot_kde(te_groups[i][ci], ax, "test", cs_c[2])
                ax.grid(True)
            axes[j][0].set_ylabel(ci)
        for i, t in enumerate(colset):
            # Name the actual grouping column; it is not always a time.
            axes[0][i].set_title(f'{colname}: {t}')
            axes[0][i].set_xlim(-10,10)

        # Save and close this specific figure rather than the implicit current one.
        fig.savefig(f"{dir_}/{feats_batch[0]}.pdf")
        plt.close(fig)

## cp_timeごとのc特徴

In [15]:
# Compare train/test densities of every C feature for each cp_time value.
cp_times = [24, 48, 72]
train_test_plot(
    tr=train,
    te=test,
    colname='cp_time',
    colset=cp_times,
    feats=c_feats,
)

batch   9 / 10, plotting   9 /  10

## cp_doseごとのc特徴

In [10]:
# Compare train/test densities of every C feature for each cp_dose value.
cp_dose = ['D1', 'D2']
train_test_plot(
    tr=train,
    te=test,
    colname='cp_dose',
    colset=cp_dose,
    feats=c_feats,
)

batch   9 / 10, plotting   9 /  10

## cp_typeごとのc特徴

In [11]:
# Compare train/test densities of every C feature for each cp_type value.
cp_type = ["trt_cp", "ctl_vehicle"]
train_test_plot(
    tr=train,
    te=test,
    colname='cp_type',
    colset=cp_type,
    feats=c_feats,
)

batch   9 / 10, plotting   9 /  10

## trt_cpのみでcp_timeごとのc特徴、cp_doseごとのc特徴

In [14]:
# Per-cp_time comparison restricted to rows whose cp_type is 'trt_cp'.
train_test_plot(
    tr=train.loc[train['cp_type'] == 'trt_cp'],
    te=test.loc[test['cp_type'] == 'trt_cp'],
    colname='cp_time',
    colset=cp_times,
    feats=c_feats,
    suffix="trt_cp",
)

batch   9 / 10, plotting   9 /  10

In [19]:
# Per-cp_dose comparison restricted to rows whose cp_type is 'trt_cp'.
train_test_plot(
    tr=train.loc[train['cp_type'] == 'trt_cp'],
    te=test.loc[test['cp_type'] == 'trt_cp'],
    colname='cp_dose',
    colset=cp_dose,
    feats=c_feats,
    suffix="trt_cp",
)

batch   9 / 10, plotting   9 /  10

---

## PCAによる特徴の再現性検証

In [47]:
from sklearn.decomposition import PCA

# Find the smallest number of principal components of the C features whose
# cumulative explained-variance ratio reaches COVERAGE_TARGET.
MIN_COMPONENTS, MAX_COMPONENTS = 3, 29
COVERAGE_TARGET = .9

# A single fit with the exact solver yields the ratio of every leading
# component, so the coverage for each candidate count is a cumulative sum.
# svd_solver='full' is requested explicitly so the numbers are reproducible
# (no unseeded randomized solver).
cum_coverage = np.cumsum(
    PCA(n_components=MAX_COMPONENTS, svd_solver='full')
    .fit(train[c_feats])
    .explained_variance_ratio_
)

for i in range(MIN_COMPONENTS, MAX_COMPONENTS + 1):
    coverage = cum_coverage[i - 1]
    # Four decimals: with three significant digits a value just below the
    # target is displayed as "0.9" although the loop keeps going.
    print(f"{i:>3}:{coverage:.4f}")
    if coverage >= COVERAGE_TARGET:
        break
else:
    print(f"coverage target {COVERAGE_TARGET} not reached with {MAX_COMPONENTS} components")

# Leave `pca` / `g_pca` describing the last component count examined, as the
# original loop did. NOTE(review): `g_pca` holds the projection of the C
# features despite its name; the name is kept in case later cells read it.
pca = PCA(n_components=i, svd_solver='full')
g_pca = pca.fit_transform(train[c_feats])

  3:0.868
  4:0.874
  5:0.878
  6:0.882
  7:0.885
  8:0.889
  9:0.892
 10:0.895
 11:0.897
 12:0.9
 13:0.902
