In [1]:
import os
import gc
import math
import random
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.loss import _WeightedLoss
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import pytorch_lightning as pl
from pytorch_lightning import seed_everything
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

# import optuna
# from optuna.integration import PyTorchLightningPruningCallback

import tensorboard

%matplotlib inline
%load_ext tensorboard

In [2]:
SEED              = 42 # 69420
TRAIN_COLUMNS     = ["sig_id", "cp_type", "cp_time", "cp_dose", "g-0", "g-1", "g-2", "g-3", "g-4", "g-5", "g-6", "g-7", "g-8", "g-9", "g-10", "g-11", "g-12", "g-13", "g-14", "g-15", "g-16", "g-17", "g-18", "g-19", "g-20", "g-21", "g-22", "g-23", "g-24", "g-25", "g-26", "g-27", "g-28", "g-29", "g-30", "g-31", "g-32", "g-33", "g-34", "g-35", "g-36", "g-37", "g-38", "g-39", "g-40", "g-41", "g-42", "g-43", "g-44", "g-45", "g-46", "g-47", "g-48", "g-49", "g-50", "g-51", "g-52", "g-53", "g-54", "g-55", "g-56", "g-57", "g-58", "g-59", "g-60", "g-61", "g-62", "g-63", "g-64", "g-65", "g-66", "g-67", "g-68", "g-69", "g-70", "g-71", "g-72", "g-73", "g-74", "g-75", "g-76", "g-77", "g-78", "g-79", "g-80", "g-81", "g-82", "g-83", "g-84", "g-85", "g-86", "g-87", "g-88", "g-89", "g-90", "g-91", "g-92", "g-93", "g-94", "g-95", "g-96", "g-97", "g-98", "g-99", "g-100", "g-101", "g-102", "g-103", "g-104", "g-105", "g-106", "g-107", "g-108", "g-109", "g-110", "g-111", "g-112", "g-113", "g-114", "g-115", "g-116", "g-117", "g-118", "g-119", "g-120", "g-121", "g-122", "g-123", "g-124", "g-125", "g-126", "g-127", "g-128", "g-129", "g-130", "g-131", "g-132", "g-133", "g-134", "g-135", "g-136", "g-137", "g-138", "g-139", "g-140", "g-141", "g-142", "g-143", "g-144", "g-145", "g-146", "g-147", "g-148", "g-149", "g-150", "g-151", "g-152", "g-153", "g-154", "g-155", "g-156", "g-157", "g-158", "g-159", "g-160", "g-161", "g-162", "g-163", "g-164", "g-165", "g-166", "g-167", "g-168", "g-169", "g-170", "g-171", "g-172", "g-173", "g-174", "g-175", "g-176", "g-177", "g-178", "g-179", "g-180", "g-181", "g-182", "g-183", "g-184", "g-185", "g-186", "g-187", "g-188", "g-189", "g-190", "g-191", "g-192", "g-193", "g-194", "g-195", "g-196", "g-197", "g-198", "g-199", "g-200", "g-201", "g-202", "g-203", "g-204", "g-205", "g-206", "g-207", "g-208", "g-209", "g-210", "g-211", "g-212", "g-213", "g-214", "g-215", "g-216", "g-217", "g-218", "g-219", "g-220", "g-221", "g-222", "g-223", "g-224", "g-225", "g-226", "g-227", "g-228", "g-229", "g-230", "g-231", "g-232", "g-233", "g-234", "g-235", "g-236", "g-237", "g-238", "g-239", "g-240", "g-241", "g-242", "g-243", "g-244", "g-245", "g-246", "g-247", "g-248", "g-249", "g-250", "g-251", "g-252", "g-253", "g-254", "g-255", "g-256", "g-257", "g-258", "g-259", "g-260", "g-261", "g-262", "g-263", "g-264", "g-265", "g-266", "g-267", "g-268", "g-269", "g-270", "g-271", "g-272", "g-273", "g-274", "g-275", "g-276", "g-277", "g-278", "g-279", "g-280", "g-281", "g-282", "g-283", "g-284", "g-285", "g-286", "g-287", "g-288", "g-289", "g-290", "g-291", "g-292", "g-293", "g-294", "g-295", "g-296", "g-297", "g-298", "g-299", "g-300", "g-301", "g-302", "g-303", "g-304", "g-305", "g-306", "g-307", "g-308", "g-309", "g-310", "g-311", "g-312", "g-313", "g-314", "g-315", "g-316", "g-317", "g-318", "g-319", "g-320", "g-321", "g-322", "g-323", "g-324", "g-325", "g-326", "g-327", "g-328", "g-329", "g-330", "g-331", "g-332", "g-333", "g-334", "g-335", "g-336", "g-337", "g-338", "g-339", "g-340", "g-341", "g-342", "g-343", "g-344", "g-345", "g-346", "g-347", "g-348", "g-349", "g-350", "g-351", "g-352", "g-353", "g-354", "g-355", "g-356", "g-357", "g-358", "g-359", "g-360", "g-361", "g-362", "g-363", "g-364", "g-365", "g-366", "g-367", "g-368", "g-369", "g-370", "g-371", "g-372", "g-373", "g-374", "g-375", "g-376", "g-377", "g-378", "g-379", "g-380", "g-381", "g-382", "g-383", "g-384", "g-385", "g-386", "g-387", "g-388", "g-389", "g-390", "g-391", "g-392", "g-393", "g-394", "g-395", "g-396", "g-397", "g-398", "g-399", "g-400", "g-401", "g-402", "g-403", "g-404", "g-405", "g-406", "g-407", "g-408", "g-409", "g-410", "g-411", "g-412", "g-413", "g-414", "g-415", "g-416", "g-417", "g-418", "g-419", "g-420", "g-421", "g-422", "g-423", "g-424", "g-425", "g-426", "g-427", "g-428", "g-429", "g-430", "g-431", "g-432", "g-433", "g-434", "g-435", "g-436", "g-437", "g-438", "g-439", "g-440", "g-441", "g-442", "g-443", "g-444", "g-445", "g-446", "g-447", "g-448", "g-449", "g-450", "g-451", "g-452", "g-453", "g-454", "g-455", "g-456", "g-457", "g-458", "g-459", "g-460", "g-461", "g-462", "g-463", "g-464", "g-465", "g-466", "g-467", "g-468", "g-469", "g-470", "g-471", "g-472", "g-473", "g-474", "g-475", "g-476", "g-477", "g-478", "g-479", "g-480", "g-481", "g-482", "g-483", "g-484", "g-485", "g-486", "g-487", "g-488", "g-489", "g-490", "g-491", "g-492", "g-493", "g-494", "g-495", "g-496", "g-497", "g-498", "g-499", "g-500", "g-501", "g-502", "g-503", "g-504", "g-505", "g-506", "g-507", "g-508", "g-509", "g-510", "g-511", "g-512", "g-513", "g-514", "g-515", "g-516", "g-517", "g-518", "g-519", "g-520", "g-521", "g-522", "g-523", "g-524", "g-525", "g-526", "g-527", "g-528", "g-529", "g-530", "g-531", "g-532", "g-533", "g-534", "g-535", "g-536", "g-537", "g-538", "g-539", "g-540", "g-541", "g-542", "g-543", "g-544", "g-545", "g-546", "g-547", "g-548", "g-549", "g-550", "g-551", "g-552", "g-553", "g-554", "g-555", "g-556", "g-557", "g-558", "g-559", "g-560", "g-561", "g-562", "g-563", "g-564", "g-565", "g-566", "g-567", "g-568", "g-569", "g-570", "g-571", "g-572", "g-573", "g-574", "g-575", "g-576", "g-577", "g-578", "g-579", "g-580", "g-581", "g-582", "g-583", "g-584", "g-585", "g-586", "g-587", "g-588", "g-589", "g-590", "g-591", "g-592", "g-593", "g-594", "g-595", "g-596", "g-597", "g-598", "g-599", "g-600", "g-601", "g-602", "g-603", "g-604", "g-605", "g-606", "g-607", "g-608", "g-609", "g-610", "g-611", "g-612", "g-613", "g-614", "g-615", "g-616", "g-617", "g-618", "g-619", "g-620", "g-621", "g-622", "g-623", "g-624", "g-625", "g-626", "g-627", "g-628", "g-629", "g-630", "g-631", "g-632", "g-633", "g-634", "g-635", "g-636", "g-637", "g-638", "g-639", "g-640", "g-641", "g-642", "g-643", "g-644", "g-645", "g-646", "g-647", "g-648", "g-649", "g-650", "g-651", "g-652", "g-653", "g-654", "g-655", "g-656", "g-657", "g-658", "g-659", "g-660", "g-661", "g-662", "g-663", "g-664", "g-665", "g-666", "g-667", "g-668", "g-669", "g-670", "g-671", "g-672", "g-673", "g-674", "g-675", "g-676", "g-677", "g-678", "g-679", "g-680", "g-681", "g-682", "g-683", "g-684", "g-685", "g-686", "g-687", "g-688", "g-689", "g-690", "g-691", "g-692", "g-693", "g-694", "g-695", "g-696", "g-697", "g-698", "g-699", "g-700", "g-701", "g-702", "g-703", "g-704", "g-705", "g-706", "g-707", "g-708", "g-709", "g-710", "g-711", "g-712", "g-713", "g-714", "g-715", "g-716", "g-717", "g-718", "g-719", "g-720", "g-721", "g-722", "g-723", "g-724", "g-725", "g-726", "g-727", "g-728", "g-729", "g-730", "g-731", "g-732", "g-733", "g-734", "g-735", "g-736", "g-737", "g-738", "g-739", "g-740", "g-741", "g-742", "g-743", "g-744", "g-745", "g-746", "g-747", "g-748", "g-749", "g-750", "g-751", "g-752", "g-753", "g-754", "g-755", "g-756", "g-757", "g-758", "g-759", "g-760", "g-761", "g-762", "g-763", "g-764", "g-765", "g-766", "g-767", "g-768", "g-769", "g-770", "g-771", "c-0", "c-1", "c-2", "c-3", "c-4", "c-5", "c-6", "c-7", "c-8", "c-9", "c-10", "c-11", "c-12", "c-13", "c-14", "c-15", "c-16", "c-17", "c-18", "c-19", "c-20", "c-21", "c-22", "c-23", "c-24", "c-25", "c-26", "c-27", "c-28", "c-29", "c-30", "c-31", "c-32", "c-33", "c-34", "c-35", "c-36", "c-37", "c-38", "c-39", "c-40", "c-41", "c-42", "c-43", "c-44", "c-45", "c-46", "c-47", "c-48", "c-49", "c-50", "c-51", "c-52", "c-53", "c-54", "c-55", "c-56", "c-57", "c-58", "c-59", "c-60", "c-61", "c-62", "c-63", "c-64", "c-65", "c-66", "c-67", "c-68", "c-69", "c-70", "c-71", "c-72", "c-73", "c-74", "c-75", "c-76", "c-77", "c-78", "c-79", "c-80", "c-81", "c-82", "c-83", "c-84", "c-85", "c-86", "c-87", "c-88", "c-89", "c-90", "c-91", "c-92", "c-93", "c-94", "c-95", "c-96", "c-97", "c-98", "c-99"]
TARGET_COLUMNS    = ['5-alpha_reductase_inhibitor', '11-beta-hsd1_inhibitor', 'acat_inhibitor', 'acetylcholine_receptor_agonist', 'acetylcholine_receptor_antagonist', 'acetylcholinesterase_inhibitor', 'adenosine_receptor_agonist', 'adenosine_receptor_antagonist', 'adenylyl_cyclase_activator', 'adrenergic_receptor_agonist', 'adrenergic_receptor_antagonist', 'akt_inhibitor', 'aldehyde_dehydrogenase_inhibitor', 'alk_inhibitor', 'ampk_activator', 'analgesic', 'androgen_receptor_agonist', 'androgen_receptor_antagonist', 'anesthetic_-_local', 'angiogenesis_inhibitor', 'angiotensin_receptor_antagonist', 'anti-inflammatory', 'antiarrhythmic', 'antibiotic', 'anticonvulsant', 'antifungal', 'antihistamine', 'antimalarial', 'antioxidant', 'antiprotozoal', 'antiviral', 'apoptosis_stimulant', 'aromatase_inhibitor', 'atm_kinase_inhibitor', 'atp-sensitive_potassium_channel_antagonist', 'atp_synthase_inhibitor', 'atpase_inhibitor', 'atr_kinase_inhibitor', 'aurora_kinase_inhibitor', 'autotaxin_inhibitor', 'bacterial_30s_ribosomal_subunit_inhibitor', 'bacterial_50s_ribosomal_subunit_inhibitor', 'bacterial_antifolate', 'bacterial_cell_wall_synthesis_inhibitor', 'bacterial_dna_gyrase_inhibitor', 'bacterial_dna_inhibitor', 'bacterial_membrane_integrity_inhibitor', 'bcl_inhibitor', 'bcr-abl_inhibitor', 'benzodiazepine_receptor_agonist', 'beta_amyloid_inhibitor', 'bromodomain_inhibitor', 'btk_inhibitor', 'calcineurin_inhibitor', 'calcium_channel_blocker', 'cannabinoid_receptor_agonist', 'cannabinoid_receptor_antagonist', 'carbonic_anhydrase_inhibitor', 'casein_kinase_inhibitor', 'caspase_activator', 'catechol_o_methyltransferase_inhibitor', 'cc_chemokine_receptor_antagonist', 'cck_receptor_antagonist', 'cdk_inhibitor', 'chelating_agent', 'chk_inhibitor', 'chloride_channel_blocker', 'cholesterol_inhibitor', 'cholinergic_receptor_antagonist', 'coagulation_factor_inhibitor', 'corticosteroid_agonist', 'cyclooxygenase_inhibitor', 'cytochrome_p450_inhibitor', 'dihydrofolate_reductase_inhibitor', 'dipeptidyl_peptidase_inhibitor', 'diuretic', 'dna_alkylating_agent', 'dna_inhibitor', 'dopamine_receptor_agonist', 'dopamine_receptor_antagonist', 'egfr_inhibitor', 'elastase_inhibitor', 'erbb2_inhibitor', 'estrogen_receptor_agonist', 'estrogen_receptor_antagonist', 'faah_inhibitor', 'farnesyltransferase_inhibitor', 'fatty_acid_receptor_agonist', 'fgfr_inhibitor', 'flt3_inhibitor', 'focal_adhesion_kinase_inhibitor', 'free_radical_scavenger', 'fungal_squalene_epoxidase_inhibitor', 'gaba_receptor_agonist', 'gaba_receptor_antagonist', 'gamma_secretase_inhibitor', 'glucocorticoid_receptor_agonist', 'glutamate_inhibitor', 'glutamate_receptor_agonist', 'glutamate_receptor_antagonist', 'gonadotropin_receptor_agonist', 'gsk_inhibitor', 'hcv_inhibitor', 'hdac_inhibitor', 'histamine_receptor_agonist', 'histamine_receptor_antagonist', 'histone_lysine_demethylase_inhibitor', 'histone_lysine_methyltransferase_inhibitor', 'hiv_inhibitor', 'hmgcr_inhibitor', 'hsp_inhibitor', 'igf-1_inhibitor', 'ikk_inhibitor', 'imidazoline_receptor_agonist', 'immunosuppressant', 'insulin_secretagogue', 'insulin_sensitizer', 'integrin_inhibitor', 'jak_inhibitor', 'kit_inhibitor', 'laxative', 'leukotriene_inhibitor', 'leukotriene_receptor_antagonist', 'lipase_inhibitor', 'lipoxygenase_inhibitor', 'lxr_agonist', 'mdm_inhibitor', 'mek_inhibitor', 'membrane_integrity_inhibitor', 'mineralocorticoid_receptor_antagonist', 'monoacylglycerol_lipase_inhibitor', 'monoamine_oxidase_inhibitor', 'monopolar_spindle_1_kinase_inhibitor', 'mtor_inhibitor', 'mucolytic_agent', 'neuropeptide_receptor_antagonist', 'nfkb_inhibitor', 'nicotinic_receptor_agonist', 'nitric_oxide_donor', 'nitric_oxide_production_inhibitor', 'nitric_oxide_synthase_inhibitor', 'norepinephrine_reuptake_inhibitor', 'nrf2_activator', 'opioid_receptor_agonist', 'opioid_receptor_antagonist', 'orexin_receptor_antagonist', 'p38_mapk_inhibitor', 'p-glycoprotein_inhibitor', 'parp_inhibitor', 'pdgfr_inhibitor', 'pdk_inhibitor', 'phosphodiesterase_inhibitor', 'phospholipase_inhibitor', 'pi3k_inhibitor', 'pkc_inhibitor', 'potassium_channel_activator', 'potassium_channel_antagonist', 'ppar_receptor_agonist', 'ppar_receptor_antagonist', 'progesterone_receptor_agonist', 'progesterone_receptor_antagonist', 'prostaglandin_inhibitor', 'prostanoid_receptor_antagonist', 'proteasome_inhibitor', 'protein_kinase_inhibitor', 'protein_phosphatase_inhibitor', 'protein_synthesis_inhibitor', 'protein_tyrosine_kinase_inhibitor', 'radiopaque_medium', 'raf_inhibitor', 'ras_gtpase_inhibitor', 'retinoid_receptor_agonist', 'retinoid_receptor_antagonist', 'rho_associated_kinase_inhibitor', 'ribonucleoside_reductase_inhibitor', 'rna_polymerase_inhibitor', 'serotonin_receptor_agonist', 'serotonin_receptor_antagonist', 'serotonin_reuptake_inhibitor', 'sigma_receptor_agonist', 'sigma_receptor_antagonist', 'smoothened_receptor_antagonist', 'sodium_channel_inhibitor', 'sphingosine_receptor_agonist', 'src_inhibitor', 'steroid', 'syk_inhibitor', 'tachykinin_antagonist', 'tgf-beta_receptor_inhibitor', 'thrombin_inhibitor', 'thymidylate_synthase_inhibitor', 'tlr_agonist', 'tlr_antagonist', 'tnf_inhibitor', 'topoisomerase_inhibitor', 'transient_receptor_potential_channel_antagonist', 'tropomyosin_receptor_kinase_inhibitor', 'trpv_agonist', 'trpv_antagonist', 'tubulin_inhibitor', 'tyrosine_kinase_inhibitor', 'ubiquitin_specific_protease_inhibitor', 'vegfr_inhibitor', 'vitamin_b', 'vitamin_d_receptor_agonist', 'wnt_inhibitor']
GENES             = [col for col in TRAIN_COLUMNS if col.startswith('g-')]
CELLS             = [col for col in TRAIN_COLUMNS if col.startswith('c-')]
GENE_PCA_COMP     = 450
CELL_PCA_COMP     = 45
VARIANCE_THRESHOLD= 0.67
VERBOSE           = True
FOLDS             = 7
INPUT_SIZE        = None
OUTPUT_SIZE       = None
SAVE_FOLDS        = True
USE_SAVED_FOLDS   = True

PATH              = "../data/"
TRAIN_F           = os.path.join(PATH, "train_features.csv")
TRAIN_T           = os.path.join(PATH, "train_targets_scored.csv")
TRAIN_T_NS        = os.path.join(PATH, "train_targets_nonscored.csv")
TEST_F            = os.path.join(PATH, "test_features.csv")
SAMPLE_SUBMISSION = os.path.join(PATH, "sample_submission.csv")
PRE_FEATURES_CSV  = os.path.join(PATH, "preprocessed_train_features.csv")
PRE_TARGETS_CSV   = os.path.join(PATH, "preprocessed_train_targets.csv")
PRE_TEST_CSV      = os.path.join(PATH, "preprocessed_test_features.csv")

seed_everything(SEED)

42

In [3]:
class MoADataset(Dataset):
    def __init__(self, features, targets):
        if targets is not None:
            self.is_test = False
        else:
            self.is_test = True
        self.features = features
        self.targets = targets
        
    def __len__(self):
        return self.features.shape[0]
        
    def __getitem__(self, index):
        if self.is_test:
            return torch.tensor(self.features[index, :], dtype=torch.float)
        else:
            return (torch.tensor(self.features[index, :], dtype=torch.float), 
                    torch.tensor(self.targets[index, :], dtype=torch.float))

In [4]:
class MoADatasetModule(pl.LightningDataModule):
    def __init__(self, fold=0):
        super().__init__()
        self.batch_size = BATCH_SIZE
        self.fold = fold
        
    def prepare_data(self):
        if USE_SAVED_FOLDS and os.path.isfile(PRE_FEATURES_CSV) and os.path.isfile(PRE_TARGETS_CSV) and os.path.isfile(PRE_TEST_CSV):
            self.f = pd.read_csv(PRE_FEATURES_CSV)
            self.t = pd.read_csv(PRE_TARGETS_CSV)
            self.test_f = pd.read_csv(PRE_TEST_CSV)
            
            if VERBOSE:
                print("Saved Folds Loaded.")
            
        else:
            f = pd.read_csv(TRAIN_F)
            t = pd.read_csv(TRAIN_T)
            t_s = pd.read_csv(TRAIN_T_NS)
            test_f = pd.read_csv(TEST_F)

            f, t, test_f = self._extract_features(f, t, test_f)
            f, t, test_f = self._stratify(f, t, test_f, shuffle=False)
            self.f = f
            self.t = t
            self.test_f = test_f
            
            del f, t, test_f
            gc.collect()

            if SAVE_FOLDS:
                self.f.to_csv(PRE_FEATURES_CSV, index=False)
                self.t.to_csv(PRE_TARGETS_CSV, index=False)
                self.test_f.to_csv(PRE_TEST_CSV, index=False)
                
                if VERBOSE:
                    print("Folds Saved.")
        
        if VERBOSE:
            print("Dataset Prepared.")
        
    def _stratify(self, f, t, test_f, shuffle=True):
        f.loc[:, "kfold"] = -1
        if shuffle:
            f = f.sample(frac=1).reset_index(drop=True)        
        
        mskf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
        for fold_, (train_, val_) in enumerate(mskf.split(X=f, y=t)): 
            f.loc[val_, "kfold"] = fold_
        
        if VERBOSE:
            print("Dataset Stratified.")
        return f, t, test_f
    
    def _extract_features(self, f, t, test_f):
        f, t, test_f = self._mapping_and_filter(f, t, test_f)
#         f, t, test_f = self._bin_columns(f, t, test_f)
#         f, t, test_f = self._naive_outlier_removal(f, t, test_f)
#         f, t, test_f = self._add_stats(f, t, test_f)
        f, t, test_f = self._quantile_transform(f, t, test_f)
        f, t, test_f = self._PCA(f, t, test_f, drop_original=False)
        f, t, test_f = self._variance_thresholding(f, t, test_f)
#         f, t, test_f = self._scaling(f, t, test_f)
        f, t, test_f = self._align_features_and_targets(f, t, test_f)
    
        if VERBOSE:
            print("Features Extracted.")
        return f, t, test_f
    
    def _align_features_and_targets(self, f, t, test_f):
        columns = ["sig_id"]
        columns.extend(TARGET_COLUMNS)
        t = f.merge(t, how="inner", on="sig_id").loc[:, columns]
        if VERBOSE:
            print("Features and Targets Aligned.")
            
        del columns
        gc.collect()
        
        return f, t, test_f
    
    def _mapping_and_filter(self, f, t, test_f, drop_cp=True):
        cp_type = {'trt_cp': 0, 'ctl_vehicle': 1}
        cp_dose = {'D1': 0, 'D2': 1}
        cp_time = {24: 0, 48: 1, 72:2}
        for df in [f, test_f]:
            df['cp_type'] = df['cp_type'].map(cp_type)
            df['cp_dose'] = df['cp_dose'].map(cp_dose)
            df['cp_time'] = df['cp_time'].map(cp_time)
        if drop_cp:
            t = t[f['cp_type'] == 0].reset_index(drop = True)
            f = f[f['cp_type'] == 0].reset_index(drop = True)
            f = f.drop("cp_type", axis=1)
        
        if VERBOSE:
            print("Features Mapped to Integers.")
        
        del cp_type, cp_dose, cp_time
        gc.collect()
        
        return f, t, test_f
    
    def _scaling(self, f, t, test_f):
        features = f.columns[3:]
        scaler = RobustScaler()
        scaler.fit(pd.concat([f[features], test_f[features]], axis = 0))
        f[features] = scaler.transform(f[features])
        test_f[features] = scaler.transform(test_f[features])
        
        if VERBOSE:
            print("Features Scaled.")
            
        del features, scaler
        gc.collect()
        
        return f, t, test_f
    
    def _variance_thresholding(self, f, t, test_f):
        var_thresh = VarianceThreshold(threshold=VARIANCE_THRESHOLD)
        
        if VERBOSE:
            print(f"Before Variance Thresholding {f.shape[1]} features present.")
        
        data = f.append(test_f)
        var_thresh.fit(data.iloc[:, 3:])
        data_transformed = data.iloc[:, 3:][data.iloc[:, 3:].columns[var_thresh.get_support(indices=True)]]
        train_features_transformed = data_transformed.iloc[:f.shape[0]]
        test_features_transformed = data_transformed.iloc[-test_f.shape[0]:]

        train_features = pd.DataFrame(f[['sig_id', 'cp_time', 'cp_dose']].values.reshape(-1, 3),
                                      columns=['sig_id', 'cp_time',  'cp_dose'])
        f = pd.concat([train_features, train_features_transformed], axis=1)
        test_features = pd.DataFrame(test_f[['sig_id', 'cp_time', 'cp_dose']].values.reshape(-1, 3),
                                     columns=['sig_id', 'cp_time', 'cp_dose'])
        test_f = pd.concat([test_features, test_features_transformed], axis=1)
        
        if VERBOSE:
            print(f"After Variance Thresholding {f.shape[1]} features present.")
            print("Variance Thresholding Done.")

        del data, data_transformed, train_features_transformed, test_features_transformed, train_features, test_features
        gc.collect()
        
        return f, t, test_f
        
    def _quantile_transform(self, f, t, test_f):
        QUANTILE_COLUMNS = [c for c in f.columns if c.startswith('c-')] + [c for c in f.columns if c.startswith('g-')] 
#         + [c for c in f.columns if c.startswith('stat-')]
        for col in (QUANTILE_COLUMNS):
            transformer = QuantileTransformer(n_quantiles=100, random_state=SEED, output_distribution="normal")
            vec_len = len(f[col].values)
            vec_len_test = len(test_f[col].values)
            raw_vec = f[col].values.reshape(vec_len, 1)
            transformer.fit(raw_vec)

            f[col] = transformer.transform(raw_vec).reshape(1, vec_len)[0]
            test_f[col] = transformer.transform(test_f[col].values.reshape(vec_len_test, 1)).reshape(1, vec_len_test)[0]
        
        if VERBOSE:
            print("Features Quantile Transformed.")
            
        del transformer, vec_len, vec_len_test, raw_vec, QUANTILE_COLUMNS
        gc.collect()
        
        return f, t, test_f
    
    def _add_stats(self, f, t, test_f):    
        for df in [f, test_f]:
            df['stat-g_sum'] = df[GENES].sum(axis = 1)
            df['stat-g_mean'] = df[GENES].mean(axis = 1)
            df['stat-g_std'] = df[GENES].std(axis = 1)
            df['stat-g_kurt'] = df[GENES].kurtosis(axis = 1)
            df['stat-g_skew'] = df[GENES].skew(axis = 1)
            df['stat-c_sum'] = df[CELLS].sum(axis = 1)
            df['stat-c_mean'] = df[CELLS].mean(axis = 1)
            df['stat-c_std'] = df[CELLS].std(axis = 1)
            df['stat-c_kurt'] = df[CELLS].kurtosis(axis = 1)
            df['stat-c_skew'] = df[CELLS].skew(axis = 1)
            df['stat-gc_sum'] = df[GENES + CELLS].sum(axis = 1)
            df['stat-gc_mean'] = df[GENES + CELLS].mean(axis = 1)
            df['stat-gc_std'] = df[GENES + CELLS].std(axis = 1)
            df['stat-gc_kurt'] = df[GENES + CELLS].kurtosis(axis = 1)
            df['stat-gc_skew'] = df[GENES + CELLS].skew(axis = 1)
        
        if VERBOSE:
            print("Feature Stats Added.")
        
        return f, t, test_f
        
        
    def _bin_columns(self, f, t, test_f, drop_original=False):
        for col in GENES:
            f.loc[:, f'{col}_bin'] = pd.cut(f[col], bins=3, labels=False)
            test_f.loc[:, f'{col}_bin'] = pd.cut(test_f[col], bins=3, labels=False)

        if drop_original:
            f.drop(GENES).reset_index(drop=True)
            test_f.drop(GENES).reset_index(drop=True)

        if VERBOSE:
            print("Features Binned.")
            
        return f, t, test_f
        
    def _naive_outlier_removal(self, f, t, test_f):
        train_ = f.copy() # Didn't wanted to actually normalize, so created a copy and normalized that for further calculation
        for col in GENES:
        #     train_[col] = (train[col]-np.mean(train[col])) / (np.std(train[col]))
            mean = train_[col].mean()
            std = train_[col].std()

            std_r = mean + 4*std
            std_l = mean - 4*std

            drop = train_[col][(train_[col]>std_r) | (train_[col]<std_l)].index.values

        f = f.drop(drop).reset_index(drop=True)
        t = t.drop(drop).reset_index(drop=True)
        
        if VERBOSE:
            print("Feature Outliers Removed.")
            
        del train_, mean, std, std_r, std_l, drop
        gc.collect()
            
        return f, t, test_f
        
    def _PCA(self, f, t, test_f, drop_original=False):
        def create_pca(train, test, features, kind, n_components):
            train_ = train[features].copy()
            test_ = test[features].copy()
            data = pd.concat([train_, test_], axis = 0)
            pca = PCA(n_components = n_components,  random_state = SEED)
            data = pca.fit_transform(data)
            columns = [f'{kind}-pca-{i + 1}' for i in range(n_components)]
            data = pd.DataFrame(data, columns = columns)
            train_ = data.iloc[:train.shape[0]]
            test_ = data.iloc[train.shape[0]:].reset_index(drop = True)
            train = pd.concat([train, train_], axis = 1)
            test = pd.concat([test, test_], axis = 1)
            return train, test

        f, test_f = create_pca(f, test_f, GENES, kind = 'g', n_components = GENE_PCA_COMP)
        f, test_f = create_pca(f, test_f, CELLS, kind = 'c', n_components = CELL_PCA_COMP)
        if drop_original:
            f = f.drop(GENES).reset_index(drop=True)
            t = t.drop(CELLS).reset_index(drop=True)
                
        if VERBOSE:
            print("PCA Performed.")            
            
        return f, t, test_f
    
    def setup(self, stage=None):
        global INPUT_SIZE, OUTPUT_SIZE
        train_X = self.f[self.f.kfold != self.fold]
        self.train_X = train_X.drop(['kfold', 'sig_id'], axis=1).to_numpy().astype('float64')
        train_Y = self.t[self.f.kfold != self.fold]
        self.train_Y = train_Y.drop(['sig_id'], axis=1).to_numpy().astype('float64')
        
        INPUT_SIZE = self.train_X.shape[1]
        OUTPUT_SIZE = self.train_Y.shape[1]
        
        print(f"Dataset has {INPUT_SIZE} features.")
        
        valid_X = self.f[self.f.kfold == self.fold]
        self.valid_X = valid_X.drop(['kfold', 'sig_id'], axis=1).to_numpy().astype('float64')
        valid_Y = self.t[self.f.kfold == self.fold]
        self.valid_Y = valid_Y.drop(['sig_id'], axis=1).to_numpy().astype('float64')
        
        self.test_X = self.test_f.to_numpy()
        
        if VERBOSE:
            print("Dataset Setup.")
            
        del train_X, train_Y, valid_X, valid_Y
        gc.collect()
        
    def train_dataloader(self):
        if VERBOSE:
            print("Train Dataloader Loaded.")
        return DataLoader(MoADataset(self.train_X, self.train_Y), self.batch_size, num_workers=0, shuffle=True)
    
    def val_dataloader(self):
        if VERBOSE:
            print("Validation Dataloader Loaded.")
        return DataLoader(MoADataset(self.valid_X, self.valid_Y), self.batch_size, num_workers=0, shuffle=False)
    
    def test_dataloader(self):
        if VERBOSE:
            print("Test Dataloader Loaded.")
        return DataLoader(MoaDataset(self.test_X, None), self.batch_size, num_workers=0, shuffle=False)

In [5]:
# Just to update INPUT_SIZE and OUTPUT_SIZE


In [6]:
INPUT_SIZE

In [7]:
class SmoothBCEwLogits(_WeightedLoss):
    def __init__(self, weight=None, reduction='mean', smoothing=0.0, pos_weight=None):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        if pos_weight is not None:
            self.pos_weight = torch.tensor(pos_weight).cuda()
        else:
            self.pos_weight = pos_weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
                                           self.smoothing)
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight, pos_weight=self.pos_weight)

        if  self.reduction == 'sum':
            loss = loss.sum()
        elif  self.reduction == 'mean':
            loss = loss.mean()

        return loss

In [8]:
class MoANet1(nn.Module):
    name = "MoANet1"
    def __init__(self, num_features, num_targets, trial=None):
        super().__init__()
        
        self.LAYER_OUTPUTS = [2048, 4096, 2048, 1024, 512]
        self.DROPOUT = 0.5
        self.NUM_LAYERS = 5
        
        if trial is not None:
            f_dropout = trial.suggest_float('f_dropout', 0.2, 0.5)
        else:
            f_dropout = self.DROPOUT
        
        layers = []

        # Intermediate layers
        in_size = num_features
        for i in range(self.NUM_LAYERS):
            out_size = self.LAYER_OUTPUTS[i]
#             out_size = trial.suggest_int('n_units_{}'.format(i), 256, 4096)
            layers.append(nn.utils.weight_norm(torch.nn.Linear(in_size, self.LAYER_OUTPUTS[i], bias=False), name=f'weight'))
            layers.append(nn.BatchNorm1d(out_size))
            layers.append(nn.Dropout(f_dropout))
            layers.append(nn.PReLU())
            in_size = out_size

        # Final layer
        layers.append(torch.nn.Linear(in_size, num_targets))
    
        self.model = torch.nn.Sequential(*layers)
        
        # Initialize weights
        self.model.apply(self._init_weights)
        
    def _init_weights(self, m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
            if m.bias != None:
                m.bias.data.fill_(0.01)
        
    def forward(self, x):
        x = self.model(x)
        return x

In [9]:
class MoANet2(nn.Module):
    name = "MoANet2"
    def __init__(self, num_features, num_targets):
        super(MoANet2, self).__init__()
        
        self.HIDDEN_LAYER_SIZE = 1500
        
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, self.HIDDEN_LAYER_SIZE))
        
        self.batch_norm2 = nn.BatchNorm1d(self.HIDDEN_LAYER_SIZE)
        self.dropout2 = nn.Dropout(0.25)
        self.dense2 = nn.utils.weight_norm(nn.Linear(self.HIDDEN_LAYER_SIZE, self.HIDDEN_LAYER_SIZE))
        
        self.batch_norm3 = nn.BatchNorm1d(self.HIDDEN_LAYER_SIZE)
        self.dropout3 = nn.Dropout(0.25)
        self.dense3 = nn.utils.weight_norm(nn.Linear(self.HIDDEN_LAYER_SIZE, num_targets))
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x = F.leaky_relu(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.leaky_relu(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        
        return x

In [10]:
class MoANet3(nn.Module):
    name = "MoANet3"
    def __init__(self, num_features, num_targets):
        super(MoANet3, self).__init__()
        
        self.HIDDEN_LAYER_SIZE = 1500
        
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, self.HIDDEN_LAYER_SIZE))
        self.batch_norm1 = nn.BatchNorm1d(self.HIDDEN_LAYER_SIZE)
        self.dropout2 = nn.Dropout(0.25)
        
        self.dense2 = nn.utils.weight_norm(nn.Linear(self.HIDDEN_LAYER_SIZE, self.HIDDEN_LAYER_SIZE))
        self.batch_norm2 = nn.BatchNorm1d(self.HIDDEN_LAYER_SIZE)
        self.dropout3 = nn.Dropout(0.25)
        
        self.dense3 = nn.utils.weight_norm(nn.Linear(self.HIDDEN_LAYER_SIZE, num_targets))
        self.batch_norm3 = nn.BatchNorm1d(num_targets)
    
    def forward(self, x):
        x = self.dense1(x)
        x = self.batch_norm1(x)
        x = F.leaky_relu(x)
        x = self.dropout2(x)
        
        x = self.dense2(x)
        x = self.batch_norm2(x)
        x = F.leaky_relu(x)
        x = self.dropout3(x)
        
        x = self.dense3(x)
        
        return x

In [11]:
class MoANet4(nn.Module):
    name = "MoANet4"
    def __init__(self, num_features, num_targets):
        super(MoANet4, self).__init__()
        
        self.HIDDEN_LAYER_SIZE_1 = 1536
        self.HIDDEN_LAYER_SIZE_2 = 2048
        self.HIDDEN_LAYER_SIZE_3 = 1536
        
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, self.HIDDEN_LAYER_SIZE_1))
        
        self.batch_norm2 = nn.BatchNorm1d(self.HIDDEN_LAYER_SIZE_1)
        self.dropout2 = nn.Dropout(0.25)
        self.dense2 = nn.utils.weight_norm(nn.Linear(self.HIDDEN_LAYER_SIZE_1, self.HIDDEN_LAYER_SIZE_2))
        
        self.batch_norm3 = nn.BatchNorm1d(self.HIDDEN_LAYER_SIZE_2)
        self.dropout3 = nn.Dropout(0.25)
        self.dense3 = nn.utils.weight_norm(nn.Linear(self.HIDDEN_LAYER_SIZE_2, self.HIDDEN_LAYER_SIZE_3))
        
        self.batch_norm4 = nn.BatchNorm1d(self.HIDDEN_LAYER_SIZE_3)
        self.dropout4 = nn.Dropout(0.25)
        self.dense4 = nn.utils.weight_norm(nn.Linear(self.HIDDEN_LAYER_SIZE_3, num_targets))
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x = F.leaky_relu(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.leaky_relu(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = F.leaky_relu(self.dense3(x))
        
        x = self.batch_norm4(x)
        x = self.dropout4(x)
        x = self.dense4(x)
        
        return x

In [12]:
class LogisticRegression(nn.Module):
    name = "LogisticRegression"
    def __init__(self, num_features, num_targets):
        super(LogisticRegression, self).__init__()
                
        self.batch_norm = nn.BatchNorm1d(num_features)
        self.dense = nn.utils.weight_norm(nn.Linear(num_features, num_targets))
    
    def forward(self, x):
        x = self.batch_norm(x)
        x = self.dense(x)
        
        return x

In [13]:
class WideAndDeepLearning(nn.Module):
    name = "WideAndDeepLearning"
    def __init__(self, num_features, num_targets):
        super(WideAndDeepLearning, self).__init__()
        
        self.HIDDEN_LAYER_SIZE = 2048
        
        hidden_size = self.HIDDEN_LAYER_SIZE
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.wide   = nn.utils.weight_norm(nn.Linear(num_features, num_targets))
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))
        
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.4)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))
        
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.4)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))
        
        self.batch_norm4 = nn.BatchNorm1d(2 * num_targets)
        self.dropout4 = nn.Dropout(0.4)
        self.dense4 = nn.utils.weight_norm(nn.Linear(2 * num_targets, num_targets))
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x2 = F.leaky_relu(self.dense1(x))
        w = F.leaky_relu(self.wide(x))
        
        x = self.batch_norm2(x2)
        x = self.dropout2(x)
        x = F.leaky_relu(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        
        x = w + x
        
        return x

In [14]:
class LinearReluBnDropout(nn.Module):
    name = "LinearReluBnDropout"
    def __init__(self, in_features, out_features):
        super(LinearReluBnDropout, self).__init__()
        
        self.DROPOUT = 0.4

        self.block = nn.Sequential(
            nn.utils.weight_norm(nn.Linear(in_features, out_features)),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(out_features),
            nn.Dropout(self.DROPOUT)
        )

    def forward(self, x):
        x = self.block(x)

        return x


class TablarNet(nn.Module):
    name = "TablarNet"
    def __init__(self, num_features, num_targets):
        super(TablarNet, self).__init__()
        
        self.EMBEDDING_DIMENSIONS = [(2, 15), (3, 20), (2, 15)]
        self.DROPOUT = 0.4
        self.HIDDEN_SIZE = 2048
        self.CONTINUOUS_FEATURES = num_features - 3

        self.embedding_layer = nn.ModuleList([nn.Embedding(x, y) for x, y in self.EMBEDDING_DIMENSIONS])
        self.dropout = nn.Dropout(self.DROPOUT, inplace=True)

        self.first_bn_layer = nn.Sequential(
            nn.BatchNorm1d(self.CONTINUOUS_FEATURES),
            nn.Dropout(self.DROPOUT)
        )

        first_in_feature = self.CONTINUOUS_FEATURES + sum([y for x, y in self.EMBEDDING_DIMENSIONS])

        self.block = nn.Sequential(
            LinearReluBnDropout(in_features=first_in_feature,
                                out_features=self.HIDDEN_SIZE),
            LinearReluBnDropout(in_features=self.HIDDEN_SIZE,
                                out_features=self.HIDDEN_SIZE)
        )

        self.last = nn.Linear(self.HIDDEN_SIZE, num_targets)

    def forward(self, x):
        cont_f = x[:, 3:]
        cat_f = x[:, :3]

        cat_x = [layer(cat_f[:, i].long()) for i, layer in enumerate(self.embedding_layer)]
        cat_x = torch.cat(tuple(cat_x), 1)
        cat_x = self.dropout(cat_x)

        cont_x = self.first_bn_layer(cont_f)

        x = torch.cat([cont_x, cat_x], 1)

        x = self.block(x)
        x = self.last(x)

        return x

In [15]:
def logLoss(y_true, y_pred):
    score = 0
    for i in range(y_true.shape[1]):
        x = y_true[:, i]
        z = y_pred[:, i]
        score_ = log_loss(x, z, labels=[0, 1])
        score += score_ / y_true.shape[1]

    return score

In [24]:
# model.py

class MoALitModule(pl.LightningModule):
    def __init__(self, net, fold):
        super(MoALitModule, self).__init__()
        self.valid_criterion = nn.BCEWithLogitsLoss()
        self.net = net
        self.fold = fold
        self.train_losses = []
        self.valid_losses = []
        self.sklearn_valid_losses = []
        self.epoch = 1
        self.best_valid_loss = None
        
    def forward(self, x):
        return self.net(x)
        
#     def configure_optimizers(self):
#         optimizer = torch.optim.Adam(
#             params=self.parameters(), 
#             lr=LEARNING_RATE,
#             weight_decay=WEIGHT_DECAY
#         )
#         scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
#             optimizer,
#             patience=0,
#             factor=SCHEDULER_FACTOR,
#             verbose=LEARNING_VERBOSE
#         )
#         return {
#            'optimizer': optimizer,
#            'lr_scheduler': scheduler,
#            'monitor': 'val_loss'
#        }

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=15, eta_min=0)

        return [optimizer], [scheduler]
    
    def training_step(self, batch, batch_idx):        
        inputs, targets = batch
        outputs = self(inputs)
        
        if WEIGHT is not 1:
            self.train_criterion = SmoothBCEwLogits(smoothing=0.001, weight=((targets * (WEIGHT - 1)) + 1))
        else:
            self.train_criterion = SmoothBCEwLogits(smoothing=0.001)
        
        loss = self.train_criterion(outputs, targets)
        
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.train_losses.append(loss.item())
        return loss
    
    def training_epoch_end(self, outputs):
        losses = 0.0
        for output in outputs:
            loss = output["loss"]
            losses += loss.item()
            
        losses /= len(outputs)
        
        print(f"Training Epoch {self.epoch} Loss: {losses}")
    
    def validation_step(self, batch, batch_idx):
        inputs, targets = batch
        outputs = self(inputs)
        loss = self.valid_criterion(outputs, targets)
        sklearn_loss = logLoss(y_true=targets.detach().cpu().numpy().astype('float64'),
                               y_pred=torch.sigmoid(outputs).detach().cpu().numpy().astype('float64'))
        
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log('sklearn_loss', sklearn_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        
        self.valid_losses.append(loss.item())
        self.sklearn_valid_losses.append(sklearn_loss.item())
        
        return loss, sklearn_loss
    
    def validation_epoch_end(self, outputs):
        losses = 0.0
        sklearn_losses = 0.0
        for output in outputs:
            loss, sklearn_loss = output
            losses += loss.item()
            sklearn_losses += sklearn_loss
            
        print(f"Average Validation Loss for Fold {self.fold}: {sum(self.valid_losses) / len(self.valid_losses)}")
            
        losses /= len(outputs)
        sklearn_losses /= len(outputs)
         
        if self.best_valid_loss is not None:
            self.best_valid_loss = min(self.best_valid_loss, losses)
        else:
            self.best_valid_loss = losses
        
        print(f"Validation Epoch         {self.epoch} Loss: {losses}")
        print(f"SKLearn Validation Epoch {self.epoch} Loss: {sklearn_losses}")
        self.epoch += 1
        
    def teardown(self, stage):
        print(f"Average Training Loss for Fold           {self.fold}: {sum(self.train_losses) / len(self.train_losses)}")
        print(f"Average Validation Loss for Fold         {self.fold}: {sum(self.valid_losses) / len(self.valid_losses)}")
        print(f"Average SKLearn Validation Loss for Fold {self.fold}: {sum(self.sklearn_valid_losses) / len(self.sklearn_valid_losses)}")
        print(f"Best Validation Loss for Fold            {self.fold}: {self.best_valid_loss}")

In [25]:
LEARNING_VERBOSE  = False
BATCH_SIZE        = 128
EARLY_STOPPING    = 10
LEARNING_RATE     = 0.01
WEIGHT            = 1
WEIGHT_DECAY      = 2e-5
SCHEDULER_FACTOR  = 0.5
SAVE_TOP_K        = 1
MAX_EPOCHS        = 2000

In [26]:
from pytorch_lightning import Callback
class MetricsCallback(Callback):
    def __init__(self):
        super().__init__()
        self.metrics = []
        self.lr = LEARNING_RATE

    def on_validation_end(self, trainer, pl_module):
#         print(trainer.lr_schedulers)
#         print(trainer.optimizers)
        for scheduler in trainer.lr_schedulers:
            param_groups = scheduler['scheduler'].optimizer.param_groups
            lr = param_groups[0]["lr"]
            if lr < self.lr:
                print(f"Learning Rate Reduced to {lr}")
            if lr > self.lr:
                print(f"Learning Rate Increased to {lr}")
                
            self.lr = lr
#             print(f"Learning Rate: {lr}")
        self.metrics.append(trainer.callback_metrics)

In [27]:
def get_trainer(net, fold):
    tb_logger = pl_loggers.TensorBoardLogger(
        save_dir="tb_logs", 
        name=f"base_fold_{fold}", 
        version=0
    )

    lr_monitor = LearningRateMonitor(
        logging_interval='step'
    )

    metrics_callback = MetricsCallback()

    checkpoint_callback = ModelCheckpoint(
        filepath=f'weights/{net.name}/moa-net={net.name}-fold={fold}-' + '{epoch:03d}-{val_loss:.5f}',
        monitor='val_loss',
        save_top_k=SAVE_TOP_K,
        verbose=LEARNING_VERBOSE
    )

    early_stop_callback = EarlyStopping(
        monitor='val_loss',
        min_delta=0.00,
        mode='min',
        patience=EARLY_STOPPING,
        verbose=LEARNING_VERBOSE
    )

    trainer = pl.Trainer(
        logger=tb_logger,
        max_epochs=MAX_EPOCHS,
        gpus=1, # -1 if torch.cuda.is_available() else None,
        callbacks=[lr_monitor, metrics_callback, early_stop_callback],
        checkpoint_callback=checkpoint_callback, # Do not save any checkpoints,
    )
    
    return trainer, checkpoint_callback, metrics_callback

In [28]:
def run_fold(net, fold, prediction_ids, submission_ids, prediction, submission, metrics_and_models):
    print(f"Running on Fold #{fold}")
    trainer, checkpoint_callback, metrics_callback = get_trainer(net, fold)

    dataset = MoADatasetModule(fold=fold)
    dataset.prepare_data()
    dataset.setup()

    prediction_ids += dataset.f[dataset.f.kfold==fold]["sig_id"].to_list()
    submission_ids = dataset.test_f["sig_id"].to_list()

    model = MoALitModule(net(INPUT_SIZE, OUTPUT_SIZE), fold)
    trainer.fit(model, dataset)

    print(f"Best Model for Fold #{fold} saved at {checkpoint_callback.best_model_path}.")
    model = MoALitModule.load_from_checkpoint(checkpoint_callback.best_model_path, net=net(INPUT_SIZE, OUTPUT_SIZE), fold=fold)

    prediction = np.concatenate([
        prediction,
        model(torch.tensor(dataset.valid_X.astype("float32"))).sigmoid().detach().cpu().float().numpy()
    ], axis=0)

    if submission is not None:
        submission = submission + (model(torch.tensor(dataset.test_X[:, 1:].astype("float32"))).sigmoid().detach().cpu().float().numpy() / FOLDS)
    else:
        submission = model(torch.tensor(dataset.test_X[:, 1:].astype("float32"))).sigmoid().detach().cpu().float().numpy() / FOLDS

    best_model_score = logLoss(dataset.valid_Y.astype('float64'),
                               model(torch.tensor(dataset.valid_X.astype("float32"))).sigmoid().detach().cpu().float().numpy().astype('float64'))
    
    metrics_and_models[net.name]["fold_metrics"].append(best_model_score)
    metrics_and_models[net.name]["fold_monitor"].append(metrics_callback)
    metrics_and_models[net.name]["fold_models_path"].append(checkpoint_callback.best_model_path)
    
    hello_world()

    del tb_logger, lr_monitor, metrics_callback, checkpoint_callback, early_stop_callback, dataset, trainer, model
    gc.collect()
    
    return prediction, submission, prediction_ids, submission_ids, metrics_and_models

In [29]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

nets = [MoANet2]
folds = list(range(FOLDS))[:1]
# nets = [LogisticRegression]
metrics_and_models = {}
predictions = []
submissions = []

def display_validation_loss():
    avg_validation_loss = 0.0
    for fold in folds:
        valid_loss = metrics_and_models[net.name]["fold_metrics"][fold]
        print(f"Validation Loss for Fold {fold}: {valid_loss}")
        avg_validation_loss += valid_loss

    avg_validation_loss /= FOLDS
    print(f"Average Validation Loss: {avg_validation_loss}")

for net in nets:
    print(f"Training Model : {net.name}")
    PREDICTION = os.path.join(PATH, f"{net.name}-prediction.csv")
    SUBMISSION = os.path.join(PATH, f"{net.name}-submission.csv")
    prediction_ids = []
    submission_ids = []
    columns = ["sig_id"]
    columns.extend(TARGET_COLUMNS)
    
    prediction = np.empty((0, len(columns) - 1))
    submission = None
    
    metrics_and_models[net.name] = {}
    metrics_and_models[net.name]["fold_metrics"] = []
    metrics_and_models[net.name]["fold_models_path"]  = []
    metrics_and_models[net.name]["fold_monitor"] = []

    for fold in folds:
        prediction, submission, prediction_ids, submission_ids, metrics_and_models = run_fold(net, fold, prediction_ids, submission_ids, prediction, submission, metrics_and_models)

    prediction = pd.DataFrame(np.concatenate([np.array(prediction_ids).reshape(-1, 1), prediction], axis=1), columns=columns)
    prediction.to_csv(PREDICTION, index=False)
    submission = pd.DataFrame(np.concatenate([np.array(submission_ids).reshape(-1, 1), submission], axis=1), columns=columns)
    submission.to_csv(SUBMISSION, index=False)
    
    display_validation_loss()
        
    predictions.append(prediction)
    submissions.append(submission)
    del prediction, submission
    gc.collect()

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Training Model : MoANet2
Running on Fold #0
Saved Folds Loaded.
Dataset Prepared.
Dataset has 1092 features.
Dataset Setup.



  | Name            | Type              | Params
------------------------------------------------------
0 | valid_criterion | BCEWithLogitsLoss | 0     
1 | net             | MoANet2           | 4 M   


Validation Dataloader Loaded.


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

Average Validation Loss for Fold 0: 0.6967620551586151
Validation Epoch         1 Loss: 0.6967620551586151
SKLearn Validation Epoch 1 Loss: 0.6967620481357617
Train Dataloader Loaded.


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.07034793454739782
Validation Epoch         2 Loss: 0.020234804898500443
SKLearn Validation Epoch 2 Loss: 0.020234804724473116
Training Epoch 3 Loss: 0.0841736388171003


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.045664051798387215
Validation Epoch         3 Loss: 0.01900545842945576
SKLearn Validation Epoch 3 Loss: 0.01900545860636831
Learning Rate Reduced to 0.009890738003669028
Training Epoch 4 Loss: 0.022848630070584974


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.03670043947005814
Validation Epoch         4 Loss: 0.018056125827133656
SKLearn Validation Epoch 4 Loss: 0.018056125842320837
Learning Rate Reduced to 0.009567727288213004
Training Epoch 5 Loss: 0.021661042267469323


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.031983919825185746
Validation Epoch         5 Loss: 0.017457039318978785
SKLearn Validation Epoch 5 Loss: 0.01745703926148191
Learning Rate Reduced to 0.009045084971874737
Training Epoch 6 Loss: 0.020674473320951268


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.029057764822221176
Validation Epoch         6 Loss: 0.01711905241012573
SKLearn Validation Epoch 6 Loss: 0.017119052413890383
Learning Rate Reduced to 0.008345653031794291
Training Epoch 7 Loss: 0.019716191441327535


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.02702179070758192
Validation Epoch         7 Loss: 0.0166790422052145
SKLearn Validation Epoch 7 Loss: 0.016679041849447313
Learning Rate Reduced to 0.0075
Training Epoch 8 Loss: 0.018817411102101105


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.025560638409549908
Validation Epoch         8 Loss: 0.01667683243751526
SKLearn Validation Epoch 8 Loss: 0.016676832266223324
Learning Rate Reduced to 0.006545084971874737
Training Epoch 9 Loss: 0.017633889896833167


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.024479331102886116
Validation Epoch         9 Loss: 0.016823675371706485
SKLearn Validation Epoch 9 Loss: 0.016823675265866495
Learning Rate Reduced to 0.005522642316338268
Training Epoch 10 Loss: 0.016228030828543665


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.023660864032772143
Validation Epoch         10 Loss: 0.01704765010625124
SKLearn Validation Epoch 10 Loss: 0.017047650152906563
Learning Rate Reduced to 0.004477357683661734
Training Epoch 11 Loss: 0.014562191986510542


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.023055128073171963
Validation Epoch         11 Loss: 0.017555045560002328
SKLearn Validation Epoch 11 Loss: 0.017555045388779854
Learning Rate Reduced to 0.003454915028125264
Training Epoch 12 Loss: 0.01248985511839998


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.02258995004563125
Validation Epoch         12 Loss: 0.01790095552802086
SKLearn Validation Epoch 12 Loss: 0.01790095572890346
Learning Rate Reduced to 0.0025000000000000014
Training Epoch 13 Loss: 0.010421988123482993


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.02224330022462354
Validation Epoch         13 Loss: 0.018402420207858084
SKLearn Validation Epoch 13 Loss: 0.01840242027986453
Learning Rate Reduced to 0.0016543469682057106
Training Epoch 14 Loss: 0.008728968720807105


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.021969345942230763
Validation Epoch         14 Loss: 0.018659978210926055
SKLearn Validation Epoch 14 Loss: 0.01865997817987105
Learning Rate Reduced to 0.0009549150281252633
Training Epoch 15 Loss: 0.007642239563445858


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.02174785719464787
Validation Epoch         15 Loss: 0.01885078437626362
SKLearn Validation Epoch 15 Loss: 0.018850784445394047
Learning Rate Reduced to 0.0004322727117869951
Training Epoch 16 Loss: 0.007090769318521631


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.021561938523258075
Validation Epoch         16 Loss: 0.01894420363008976
SKLearn Validation Epoch 16 Loss: 0.018944203548227866
Learning Rate Reduced to 0.00010926199633097157
Training Epoch 17 Loss: 0.006842715820918481


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.021398267263907995
Validation Epoch         17 Loss: 0.018930104672908784
SKLearn Validation Epoch 17 Loss: 0.01893010484008929
Learning Rate Reduced to 0.0
Training Epoch 18 Loss: 0.006757365207054785


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

Average Validation Loss for Fold 0: 0.021255713920228934
Validation Epoch         18 Loss: 0.01896345615386963
SKLearn Validation Epoch 18 Loss: 0.018963456256933353
Learning Rate Increased to 0.00010926199633097155
Training Epoch 19 Loss: 0.006766599929276981

Average Training Loss for Fold           0: 0.017826823619607628
Average Validation Loss for Fold         0: 0.021255713920228934
Average SKLearn Validation Loss for Fold 0: 0.021255713870134797
Best Validation Loss for Fold            0: 0.01667683243751526
Best Model for Fold #0 saved at D:\Kevin\Machine Learning\MoA Prediction\notebooks\weights\MoANet2\moa-net=MoANet2-fold=0-epoch=006-val_loss=0.01422.ckpt.


NameError: name 'hello_world' is not defined

In [None]:
print(metrics_and_models['MoANet2']['fold_metrics'])

In [None]:
trainer.callback_metrics

In [None]:
predictions

In [None]:
avg_validation_loss = 0.0
for fold in folds:
    print(f"Validation Loss for Fold {fold}: {fold_metrics[fold]}")
    avg_validation_loss += fold_metrics[fold]

avg_validation_loss /= FOLDS
print(f"Average Validation Loss: {avg_validation_loss}")

In [None]:
# Simple : 0.022132062166929246
# PCA + RobustScaler : 0.022057733684778213
# PCA + RobustScaler + Variance Thresholding : 0.02237522266805172
# Quantile + PCA + Variance Thresholding + MoANet2 : 0.02158106155693531
# Quantile + PCA + Variance Thresholding + MoANet2 : 0.02375772302704198
# No Shuffle + TablarNet + PCA : CV = 0.015230209566652775 | LB = 0.01926
# No Shuffle + Logistic Regression + PCA : CV = 0.014702482149004936 | 

In [None]:
# predictions = np.zeros((test_f.shape[0], len(TARGET_COLUMNS)))

# for fold in folds:
#     predictions += (fold_models[fold](torch.tensor(test_f[:, 1:].astype("float"))).sigmoid().detach().cpu().numpy() / FOLDS)
    
#     fold_metrics.append(metrics_callback.metrics)
#     fold_models.append(model)
    
# columns = ["sig_id"]
# columns.extend(TARGET_COLUMNS)
# predictions = pd.DataFrame(np.concatenate((test_f[:, 0].reshape(-1, 1), predictions), axis=1), columns=columns)
# predictions.to_csv('submission.csv', index=False)

In [None]:
a = None
b = np.array([1, 2, 3, 4, 5, 6]).reshape(2, 3)
c = a + b
c

In [None]:
TARGET_COLUMNS

In [None]:
# submissions = pd.read_csv("submission.csv")
# submissions.iloc[:, 1:] = submissions.iloc[:, 1:].astype("float")
# submissions

In [None]:
# sample_submissions = pd.read_csv(SAMPLE_SUBMISSION)
# sample_submissions.columns == submissions.columns

In [None]:
# sample_submissions.dtypes

In [None]:
# submissions.dtypes

In [None]:
a = np.random.randn(4, 5)
b = np.random.randn(4, 5)
b += a / FOLDS
b

In [None]:
a.long()

In [None]:
# check: https://www.kaggle.com/bootiu/moa-pytorch-lightning-baseline