In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from sklearn import model_selection
from sklearn.decomposition import PCA
#from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.feature_selection import VarianceThreshold
from joblib import dump, load

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Inference settings.
BATCH_SIZE = 128    # rows per DataLoader batch
NFOLDS = 5          # number of per-fold checkpoints averaged for each seed
hidden_size = 1400  # hidden-layer width

# Directory holding the competition CSV files. The trailing separator is part
# of the value because the path is also used with plain string concatenation.
data_dir = 'D:\\Dataset\\MOA\\'

In [2]:
import random
import numpy as np
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU plus every CUDA device), records the seed in
    ``PYTHONHASHSEED`` and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only
    # affects child processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run; turn it
    # off explicitly so determinism does not depend on the global default.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

In [3]:
def read_data(train_file, test_file, target_file, path=data_dir):
    """Read the train-feature, test-feature and target CSV files from ``path``.

    The shapes of the loaded frames are printed (the test shape is printed
    twice, once for each position it occupies in the returned tuple).

    Args:
        train_file: File name of the training features CSV.
        test_file: File name of the test features CSV.
        target_file: File name of the training targets CSV.
        path: Directory the three file names are joined onto.

    Returns:
        ``(df_train, df_test, df_target, df_test)``. The second and fourth
        items are the *same* DataFrame object, not independent copies.
    """
    df_train, df_test, df_target = (
        pd.read_csv(os.path.join(path, file_name))
        for file_name in (train_file, test_file, target_file)
    )

    loaded = (df_train, df_test, df_target, df_test)
    print(*(frame.shape for frame in loaded))

    return loaded

In [4]:
# Load the competition tables. `test_final` is bound to the same DataFrame
# object as `test_features`, because read_data returns the test frame twice.
(
    train_features,
    test_features,
    train_targets_scored,
    test_final,
) = read_data(
    'train_features.csv',
    'test_features.csv',
    'train_targets_scored.csv',
)

(23814, 876) (3982, 876) (23814, 207) (3982, 876)


In [5]:
# Feature-column groups, selected by column-name prefix.
GENES = [name for name in train_features.columns if name.startswith('g-')]
CELLS = [name for name in train_features.columns if name.startswith('c-')]

In [6]:
# GENES: run the gene columns through the transformer loaded from disk and
# append its output to the test frame as `pca_G-*` columns.
# NOTE(review): the file is named "std_scaler" but the output columns are
# labelled as PCA components; confirm which transformer was actually saved.
n_comp = 100  # must equal the number of columns transform() returns

cells_genes = load('std_scaler_genes.bin')

gene_component_names = [f'pca_G-{i}' for i in range(n_comp)]
test2 = pd.DataFrame(
    cells_genes.transform(test_features[GENES]),
    columns=gene_component_names,
)

test_features = pd.concat([test_features, test2], axis=1)

In [7]:
# CELLS: run the cell columns through the transformer loaded from disk and
# append its output to the test frame as `pca_C-*` columns.
# NOTE(review): the file is named "std_scaler" but the output columns are
# labelled as PCA components; confirm which transformer was actually saved.
n_comp = 15  # must equal the number of columns transform() returns

cells_pca = load('std_scaler_cells.bin')

cell_component_names = [f'pca_C-{i}' for i in range(n_comp)]
test2 = pd.DataFrame(
    cells_pca.transform(test_features[CELLS]),
    columns=cell_component_names,
)

test_features = pd.concat([test_features, test2], axis=1)

In [8]:

# Apply the saved selector (presumably a fitted VarianceThreshold; confirm)
# to every column after the first four.
# Assumes the first four columns are sig_id, cp_type, cp_time and cp_dose.
variance = load('Variance_transform.bin')
data_transformed = variance.transform(test_features.iloc[:, 4:])

# Rebuild the frame: the four metadata columns first, followed by the selected
# features, which receive default integer column labels.
metadata_columns = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']
metadata_values = test_features[metadata_columns].values.reshape(-1, 4)
test_features = pd.DataFrame(metadata_values, columns=metadata_columns)

test_features = pd.concat(
    [test_features, pd.DataFrame(data_transformed)],
    axis=1,
)

In [9]:
def process_data(data):
    """Return a copy of ``data`` with ``cp_time`` and ``cp_dose`` one-hot encoded.

    Args:
        data: DataFrame containing ``cp_time`` and ``cp_dose`` columns.

    Returns:
        A new DataFrame in which those two columns are replaced by indicator
        columns named ``<column>_<value>``; other columns are kept as they are.
    """
    return pd.get_dummies(data, columns=['cp_time', 'cp_dose'])

In [10]:
# Keep only the rows whose cp_type is not 'ctl_vehicle', renumber them from
# zero, and drop the cp_type column.
is_not_control = test_features['cp_type'] != 'ctl_vehicle'
test = test_features[is_not_control].reset_index(drop=True)
test = test.drop(columns='cp_type')

In [11]:
# Column lists persisted with joblib: the targets to predict and the model's
# input features, plus their lengths for building the network.
target_cols = load('target_cols.bin')
feature_cols = load('feature_cols.bin')
num_features, num_targets = len(feature_cols), len(target_cols)

In [12]:
# Display how many feature columns are fed to the model.
len(feature_cols)

982

In [13]:
class TestDataset:
    """Map-style dataset over a 2-D feature array, for use with a DataLoader.

    Each item is a dict with a single key ``'x'`` holding one row of
    ``features`` as a ``torch.float`` tensor.
    """

    def __init__(self, features):
        # Indexed as features[idx, :], so it must support 2-D indexing.
        self.features = features

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, idx):
        row = self.features[idx, :]
        return {'x': torch.tensor(row, dtype=torch.float)}

In [14]:
def inference_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and return sigmoid probabilities.

    Args:
        model: Module producing logits; it is switched to eval mode.
        dataloader: Iterable of batches, each a dict with an ``'x'`` tensor.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        A NumPy array with the per-batch probabilities concatenated along the
        first axis, in iteration order.
    """
    model.eval()
    batch_probabilities = []

    for batch in dataloader:
        features = batch['x'].to(device)

        # No gradients are needed for inference.
        with torch.no_grad():
            logits = model(features)

        probabilities = logits.sigmoid().detach().cpu().numpy()
        batch_probabilities.append(probabilities)

    return np.concatenate(batch_probabilities)

In [15]:
class GaussianNoise(nn.Module):
    """Add Gaussian noise scaled by the input's own values, in training only.

    In eval mode, or when ``sigma`` is 0, the input is returned unchanged.

    Args:
        sigma: Multiplier applied to the input to obtain the noise scale.
        is_relative_detach: When true the scale is computed from a detached
            copy of the input, so no gradient flows through the scale.
    """

    def __init__(self, sigma=0.5, is_relative_detach=True):
        super().__init__()
        self.sigma = sigma
        self.is_relative_detach = is_relative_detach
        # Scalar buffer expanded to the input's shape when sampling; being a
        # registered buffer it also appears in the module's state_dict.
        self.register_buffer('noise', torch.tensor(0))

    def forward(self, x):
        # Pass-through outside training or when noise is disabled.
        if not self.training or self.sigma == 0:
            return x

        reference = x.detach() if self.is_relative_detach else x
        scale = self.sigma * reference
        sampled_noise = self.noise.expand(*x.size()).float().normal_() * scale
        return x + sampled_noise

In [16]:
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
    """Three-layer fully connected network that outputs one logit per target.

    Each stage applies batch norm, then dropout, then a linear layer. ReLU
    follows the first two linear layers, and Gaussian noise (active in
    training mode only) is applied after the first ReLU.

    Args:
        num_features: Width of the input.
        num_targets: Number of output logits.
        hidden_size: Width of both hidden layers.
        dropout: Dropout probability shared by all three dropout layers.
    """

    def __init__(self, num_features, num_targets, hidden_size, dropout):
        super().__init__()

        # Stage 1: input -> hidden
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(dropout)
        self.dense1 = nn.Linear(num_features, hidden_size)

        # Stage 2: hidden -> hidden
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(dropout)
        self.dense2 = nn.Linear(hidden_size, hidden_size)

        # Stage 3: hidden -> logits
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(dropout)
        self.dense3 = nn.Linear(hidden_size, num_targets)

        # Attribute name (including its spelling) is part of the state_dict keys.
        self.gausian = GaussianNoise()

    def forward(self, x):
        hidden = self.dropout1(self.batch_norm1(x))
        hidden = F.relu(self.dense1(hidden))
        hidden = self.gausian(hidden)

        hidden = self.dropout2(self.batch_norm2(hidden))
        hidden = F.relu(self.dense2(hidden))

        hidden = self.dropout3(self.batch_norm3(hidden))
        return self.dense3(hidden)

In [17]:
def run_inference(fold, seed):
    """Predict target probabilities for the test set with one saved checkpoint.

    Args:
        fold: Fold index used to select the checkpoint file.
        seed: Seed used to select the checkpoint file and to reseed the RNGs.

    Returns:
        NumPy array of sigmoid probabilities, one row per row of ``test`` and
        one column per model output.
    """
    seed_everything(seed)

    test_ = process_data(test)
    # Cast explicitly: a frame that mixes float features with bool dummy
    # columns would otherwise produce an object-dtype array, which
    # torch.tensor cannot convert. The dataset casts to float32 anyway.
    x_test = test_[feature_cols].to_numpy(dtype=np.float32)
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,  # shared config constant rather than a repeated literal
        dropout=0.10721077648722396,
    )

    # map_location lets checkpoints saved from a GPU load on a CPU-only host.
    state_dict = torch.load(
        f"C:\\Users\\acer\\5foldbest_model_{fold}_{seed}.pth",
        map_location=DEVICE,
    )
    model.load_state_dict(state_dict)
    model.to(DEVICE)

    return inference_fn(model, testloader, DEVICE)

In [18]:
def run_k_fold(NFOLDS, seed):
    """Average the predictions of the ``NFOLDS`` fold checkpoints for one seed.

    Args:
        NFOLDS: Number of folds; folds ``0 .. NFOLDS - 1`` are evaluated.
        seed: Seed forwarded to ``run_inference`` for every fold.

    Returns:
        Array of shape ``(len(test), len(target_cols))`` holding the mean of
        the per-fold predictions.
    """
    fold_mean = np.zeros((len(test), len(target_cols)))

    for fold_index in range(NFOLDS):
        # Each fold contributes an equal share of the mean.
        fold_mean += run_inference(fold_index, seed) / NFOLDS

    return fold_mean

In [19]:
# Average the fold-averaged predictions over every seed, then store the
# result in the target columns of `test`.
NFOLDS = 5
SEED = list(range(6))
predictions = np.zeros((len(test), len(target_cols)))

for seed in SEED:
    predictions_ = run_k_fold(NFOLDS, seed)
    # Each seed contributes an equal share of the final mean.
    predictions += predictions_ / len(SEED)

# Create every target column first, then fill them all with one assignment.
for i in target_cols:
    test[i] = 0.
test[target_cols] = predictions

In [25]:
# Scratch array with one row per test row and one column per target; it is
# built only so that its shape can be displayed.
s = np.zeros((len(test), len(target_cols)))
s.shape

(3624, 206)

In [27]:
# Load three prediction arrays previously saved as .npy files.
saved_prediction_files = (
    'model_01840.npy',
    'MoA_Model_01844.npy',
    'MoA_Model_01858.npy',
)
predictions1 = np.load(saved_prediction_files[0])
predictions2 = np.load(saved_prediction_files[1])
predictions3 = np.load(saved_prediction_files[2])

In [28]:
# Display the shape of the third loaded prediction array.
predictions3.shape

(3624, 206)

In [24]:
# Equal-weight blend of the three loaded prediction arrays. This replaces
# whatever values the target columns of `test` held before.
blend_members = (predictions1, predictions2, predictions3)
test[target_cols] = (blend_members[0] + blend_members[1] + blend_members[2]) / len(blend_members)

In [25]:
# Build the submission file. Start from the ids in the sample submission and
# left-join the predictions on sig_id; ids with no row in `test` (the
# 'ctl_vehicle' rows filtered out earlier) get 0 for every target via fillna.
sample_submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
sub = (
    sample_submission
    .drop(columns=target_cols)
    .merge(test[['sig_id'] + target_cols], on='sig_id', how='left')
    .fillna(0)
)
sub.to_csv('submission_04_11_2020_v2.csv', index=False)

In [72]:
# Read the written submission file back and display it.
pd.read_csv('submission_04_11_2020_v2.csv')

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001067,0.001489,0.003245,0.014741,0.019685,0.005112,0.002076,0.007545,0.000410,...,0.001131,0.002354,0.005651,0.001514,0.000914,0.000893,0.000935,0.002105,0.012881,0.001739
1,id_001897cda,0.000642,0.001008,0.002189,0.002892,0.001563,0.002115,0.004489,0.010103,0.005707,...,0.001091,0.001199,0.005310,0.000446,0.008168,0.000693,0.006653,0.001270,0.001200,0.002642
2,id_002429b5b,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,id_00276f245,0.001167,0.001284,0.002189,0.011902,0.016398,0.005296,0.003828,0.004723,0.000474,...,0.000890,0.001514,0.003017,0.014083,0.006298,0.000926,0.002061,0.002405,0.000620,0.002613
4,id_0027f1083,0.001768,0.001930,0.002055,0.017793,0.018539,0.004153,0.005446,0.002343,0.000703,...,0.001164,0.000817,0.003894,0.001481,0.001626,0.001054,0.001665,0.002277,0.000343,0.002238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,id_ff7004b87,0.000834,0.001181,0.001048,0.001909,0.006710,0.002374,0.000743,0.002414,0.000310,...,0.000530,0.004438,0.001619,0.171051,0.006616,0.000860,0.006085,0.001164,0.000538,0.000788
3978,id_ff925dd0d,0.002922,0.002213,0.001309,0.010698,0.022298,0.007703,0.006419,0.005270,0.001092,...,0.000811,0.001140,0.003834,0.002317,0.002932,0.000901,0.001969,0.001946,0.000564,0.002124
3979,id_ffb710450,0.001488,0.001273,0.001212,0.012353,0.033735,0.006725,0.002481,0.004792,0.000278,...,0.000678,0.000745,0.002816,0.002256,0.001568,0.000672,0.001146,0.001618,0.000601,0.001291
3980,id_ffbb869f2,0.002003,0.001525,0.001547,0.024451,0.028090,0.006004,0.006469,0.003622,0.000668,...,0.000922,0.000614,0.002713,0.001172,0.002303,0.000744,0.001500,0.002301,0.000509,0.003338


In [23]:
# Display the target columns currently stored in `test`.
test[target_cols]

Unnamed: 0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,0.000601,0.000580,0.002132,0.014111,0.019372,0.004123,0.001981,0.004428,0.000180,0.012393,...,0.000829,0.001340,0.003232,0.001830,0.000871,0.000555,0.001077,0.001612,0.009336,0.001377
1,0.000316,0.000408,0.001128,0.000984,0.001354,0.001008,0.002878,0.013349,0.005101,0.003829,...,0.000565,0.000343,0.004205,0.000170,0.008188,0.000308,0.010038,0.000603,0.000206,0.001848
2,0.001089,0.000915,0.001431,0.011650,0.017050,0.005956,0.003185,0.004950,0.000278,0.010074,...,0.000621,0.000886,0.002419,0.010345,0.005757,0.000644,0.001493,0.002048,0.000887,0.002970
3,0.001757,0.001432,0.002156,0.021457,0.023154,0.004425,0.007014,0.002649,0.000438,0.013337,...,0.000919,0.000616,0.004727,0.001542,0.001235,0.000846,0.001258,0.002104,0.000423,0.002304
4,0.000832,0.000563,0.002095,0.017093,0.020522,0.003890,0.005721,0.001667,0.000407,0.016768,...,0.000814,0.000619,0.004106,0.001329,0.001049,0.000487,0.000345,0.002103,0.000192,0.001450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3619,0.000582,0.000647,0.000516,0.002353,0.006627,0.001886,0.000435,0.001814,0.000201,0.003199,...,0.000284,0.001867,0.001109,0.173384,0.006523,0.000580,0.004224,0.000901,0.000295,0.000629
3620,0.002929,0.002572,0.000990,0.011168,0.023285,0.007199,0.005775,0.004192,0.000486,0.013829,...,0.000614,0.000678,0.004173,0.002656,0.001945,0.001026,0.002470,0.001675,0.000373,0.001558
3621,0.001383,0.000937,0.000854,0.013465,0.035332,0.005973,0.003065,0.003157,0.000159,0.011860,...,0.000385,0.000493,0.002019,0.001526,0.001354,0.000481,0.000658,0.001150,0.000337,0.001228
3622,0.001825,0.001273,0.001146,0.021896,0.032116,0.005774,0.004900,0.004232,0.000292,0.015667,...,0.000541,0.000439,0.003301,0.001051,0.002008,0.000628,0.002060,0.001620,0.000622,0.002700
