# Imports

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('/home/dg/fastai'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [75]:
from fastai.text import *
from fastai.imports import *
from fastai.structured import *
from fastai.column_data import *
from torch.nn import functional as F
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [3]:
import gc
import pdb

# Data processing func

In [4]:
def scale_vars(df, mapper, scale_col_exc):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n],StandardScaler()) for n in df.columns if is_numeric_dtype(df[n]) and n not in scale_col_exc]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper

In [5]:
def proc_df2(df, y_fld = None, skip_flds=None, do_scale=True, scale_col_exc = None, na_dict=None,
             preproc_fn=None, max_n_cat=10, subset=None, mapper=None):
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    df = df.copy()
    if preproc_fn: preproc_fn(df)
    if y_fld is not None: 
        y = df[y_fld].values
        df.drop(skip_flds+[y_fld], axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if do_scale: mapper = scale_vars(df, mapper, scale_col_exc)
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    if y_fld is not None: 
        res = [pd.get_dummies(df, dummy_na=True), y, na_dict]
    else:
        res = [pd.get_dummies(df, dummy_na=True), na_dict]
    if do_scale: res = res + [mapper]
    return res

In [9]:
def df_to_maxtrix(df, cols = cols):    
    return df[cols].values.astype(np.float32)

# app train

In [10]:
df_train = pd.read_csv('data/application_train.csv')
train_cats(df_train)

In [11]:
y_label = df_train.TARGET.values
df_train.drop(columns = 'TARGET', axis=1, inplace=True)

In [12]:
df_train, _, _ = proc_df2(df_train,  do_scale=True, scale_col_exc = ['SK_ID_CURR'])

In [13]:
sk_id_curr = df_train.SK_ID_CURR.values

In [14]:
x_train,x_valid, y_train, y_valid = train_test_split(sk_id_curr, y_label, test_size=0.2, stratify = y_label)

In [15]:
cols = [i for i in df_train.columns if i not in ['SK_ID_CURR','SK_ID_PREV']]
df_train_group = df_train.groupby(['SK_ID_CURR']).apply(lambda x: df_to_maxtrix(x, cols))

# cc train

In [16]:
cc_train = pd.read_csv('data/credit_card_balance.csv')
train_cats(cc_train)

In [17]:
cc_train.drop(columns = 'SK_ID_PREV', axis=1, inplace=True)

In [18]:
cc_train, _ = proc_df2(cc_train, do_scale = False, scale_col_exc = ['SK_ID_CURR'])

In [19]:
cc_sk_id = set(cc_train['SK_ID_CURR'].values)

In [20]:
cc_train = cc_train.sort_values(by = ['SK_ID_CURR', 'MONTHS_BALANCE'])

In [21]:
cols = [i for i in cc_train.columns if i not in ['SK_ID_CURR','SK_ID_PREV']]
cc_train_group = cc_train.groupby(['SK_ID_CURR']).apply(lambda x: df_to_maxtrix(x, cols))

In [448]:
cc_train_group[100006][:,:10]

array([[    -6. ,      0. , 270000. ,      0. ,      0. ,      0. ,      0. ,      0. ,   2702.7,      0. ],
       [    -5. ,      0. , 270000. ,      0. ,      0. ,      0. ,      0. ,      0. ,   2702.7,      0. ],
       [    -4. ,      0. , 270000. ,      0. ,      0. ,      0. ,      0. ,      0. ,   2702.7,      0. ],
       [    -3. ,      0. , 270000. ,      0. ,      0. ,      0. ,      0. ,      0. ,   2702.7,      0. ],
       [    -2. ,      0. , 270000. ,      0. ,      0. ,      0. ,      0. ,      0. ,   2702.7,      0. ],
       [    -1. ,      0. , 270000. ,      0. ,      0. ,      0. ,      0. ,      0. ,   2702.7,      0. ]],
      dtype=float32)

# Data loader

## function

In [65]:
class SortishSamplerRNN(Sampler):
    def __init__(self, data_source, key, cc_sk_id, bs):
        self.data_source, self.key, self.cc_sk_id, self.bs = data_source, key, cc_sk_id,bs
    
    def add_id(self, i): return i if len(i)%self.bs == 0 else i+[i[-1]]*(self.bs-len(i)%self.bs)
    
    def len_cal(self, i): 
        return self.key[self.data_source[i]].shape[0] if self.data_source[i] in self.cc_sk_id else 0

    def __len__(self): return len(self.data_source)

    def __iter__(self):        
        d={}
        for i in range(len(self.data_source)): d.setdefault(self.len_cal(i), []).append(i)        
        result = [d[n] for n in sorted(d, reverse=True)]
        #pdb.set_trace()
        #result = [self.add_id(i) for i in result]
        result = [i for j in result for i in j]
        return iter(result)

In [23]:
class ColumnarRNNDataset(Dataset):
    def __init__(self, df, y):
        n = len(df)
        self.df = df
        self.y = np.zeros((n,1)) if y is None else y

    def __len__(self): return len(self.y)

    def __getitem__(self, idx): return [self.df[idx], self.y[idx]] 

    @classmethod
    def from_data_frames(cls,df, y=None): return cls(df, y) 

    @classmethod
    def from_data_frame(cls, df, y=None): return cls.from_data_frames(df,y)

In [24]:
class ColumnarRNNModelData(ModelData):
    def __init__(self, path, trn_ds, val_ds, bs, tr_sampler = None, val_sampler = None, test_ds=None, shuffle=False):        
        trn_dl = DataLoader(trn_ds, bs, shuffle=shuffle, sampler = tr_sampler, num_workers=1) #
        val_dl = DataLoader(val_ds, bs, shuffle=shuffle, sampler = val_sampler, num_workers=1) #, 
        test_dl = DataLoader(test_ds, bs, shuffle=False, num_workers=1) if test_ds is not None else None
        super().__init__(path, trn_dl, val_dl, test_dl)

    @classmethod
    def from_data_frames(cls, path, 
                         trn_df, val_df, trn_y, val_y, 
                         tr_sampler = None, val_sampler = None,
                         bs=64, test_df=None, shuffle=False):
        trn_ds  = ColumnarRNNDataset.from_data_frame(trn_df,trn_y)
        val_ds  = ColumnarRNNDataset.from_data_frame(val_df, val_y)
        test_ds = ColumnarRNNDataset.from_data_frame(test_df, None) if test_df is not None else None
        return cls(path, 
                   trn_ds, val_ds, bs, test_ds=test_ds, 
                   tr_sampler = tr_sampler, val_sampler = val_sampler, 
                   shuffle=shuffle)

## build

In [66]:
bs = 64; PATH = './'

In [67]:
trn_samp = SortishSamplerRNN(x_train, cc_train_group, cc_sk_id, bs=bs)
val_samp = SortishSamplerRNN(x_valid, cc_train_group, cc_sk_id, bs=bs)

In [68]:
md  = ColumnarRNNModelData.from_data_frames(PATH, 
                                            trn_df = x_train, val_df = x_valid, 
                                            trn_y = y_train, val_y = y_valid,
                                            tr_sampler = trn_samp, val_sampler = val_samp,
                                            shuffle=False, bs=bs)

# Learner

## Class

In [144]:
class RNNModel(nn.Module):
    def __init__(self, app_ref, cc_ref, cc_sk_id, szs, app_drop, cat_drop, rnn_drop, lin_drops, bs):        
        super().__init__()
        self.bs, self.cc_sk_id = bs, cc_sk_id       
        self.app_ref, self.cc_ref = app_ref, cc_ref
        
        szs = [309] + szs
        self.rnn = nn.GRU(input_size = 37, hidden_size = 64, num_layers = 2, dropout=rnn_drop)
        
        #linear layer
        self.lins = nn.ModuleList([nn.Linear(szs[i], szs[i+1]) for i in range(len(szs)-1)])
        for o in self.lins: kaiming_normal(o.weight.data)
        self.l_outp = nn.Linear(szs[-1], 1)
        kaiming_normal(self.l_outp.weight.data)
        #batchnorm layer
        self.bns_app = nn.BatchNorm1d(245)
        self.bns_lins = nn.ModuleList([nn.BatchNorm1d(sz) for sz in szs[1:]])
        #dropout
        self.app_drop = nn.Dropout(app_drop)
        self.cat_drop = nn.Dropout(cat_drop)
        self.drops_lins = nn.ModuleList([nn.Dropout(drop) for drop in lin_drops])
        
        self.zeros = V(torch.zeros(1, 37))

    def forward(self, x_in):
        x_inp = x_in.cpu().data.numpy()
        
        app_input = torch.stack([V(i[0]) for i in self.app_ref[x_inp]])
        app_input = self.app_drop(self.bns_app(app_input))
        
        self.rnn.flatten_parameters()
        cc_inp = [V(self.cc_ref[i]) if i in self.cc_sk_id else self.zeros for i in x_inp]
        lengths = [i.size()[0] for i in cc_inp] 
        max_length = cc_inp[0].size()[0]
        pad_inp = torch.stack([nn.ConstantPad2d((0,0,0,max_length-i.size()[0]), 0)(i) for i in cc_inp])
        cc_inp_pack = pack_padded_sequence(pad_inp, lengths, batch_first=True)
        packed_output, _ = self.rnn(cc_inp_pack)
        outp, _ = pad_packed_sequence(packed_output, batch_first=True)        
        
        x = self.cat_drop(torch.cat([app_input, outp[:,-1,:]], 1))
                                     
        for linear,drop_out,batch_norm in zip(self.lins, self.drops_lins, self.bns_lins): 
            x = drop_out(batch_norm(F.relu(linear(x))))
        x = F.sigmoid(self.l_outp(x))
        
        return x

In [76]:
def imbalance_loss(inp,targ):
    inp_flat = inp.view(-1); targ_flat = targ.float().view(-1)
    return F.binary_cross_entropy(inp_flat, targ_flat, targ_flat + 0.8)

## Build

In [145]:
m = RNNModel(app_ref = df_train_group, cc_ref = cc_train_group, 
             szs=  [400, 200, 100], 
             lin_drops = [0.15, 0.15, 0.25],
             app_drop = 0.15, cat_drop = 0.15, rnn_drop = 0.2,
             cc_sk_id = cc_sk_id, bs = bs)

In [146]:
RNN_learner = Learner.from_model_data(m, md)
RNN_learner.crit = imbalance_loss

## fit

In [147]:
lr = 5e-2

In [148]:
RNN_learner.fit(lr, n_cycle = 5, cycle_len = 3, cycle_mult=1, use_wd_sched=True, wds=1e-5)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                                 
    0      0.351355   0.353917  
    1      0.349395   0.352241                                 
    2      0.348675   0.35142                                  
    3      0.348795   0.354273                                 
    4      0.345298   0.351427                                 
    5      0.347809   0.350964                                 
    6      0.346332   0.353053                                 
    7      0.347266   0.352368                                 
    8      0.346588   0.35072                                  
    9      0.341352   0.351263                                 
    10     0.344679   0.352288                                 
    11     0.339111   0.350885                                 
    12     0.3464     0.352109                                 
    13     0.34068    0.35142                                  
    14     0.340474   0.349628                                 


[array([0.34963])]