Reference: Deep & Cross Network for Ad Click Predictions — https://arxiv.org/pdf/1708.05123.pdf

In [1]:
import pandas as pd
import sys

In [2]:
# Show up to 100 columns and 100 rows when a DataFrame is displayed.
# The full option names are spelled out: pandas resolves abbreviated keys by
# pattern matching and raises if the abbreviation matches more than one option.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [3]:
# Add the relative fastai directory and the directory two levels up to the
# import search path, in that order (presumably where the fastai, xgb_learn
# and eda packages imported below live -- confirm).
sys.path.extend(['../../../fastai/', '../../'])

In [4]:
# xgboost
from xgb_learn.learner import *
from xgb_learn.dataset import *
from eda.simple import *

In [5]:
# embeddings
from fastai.column_data import *
from fastai.structured import *

### Batch Outer Product

In [38]:
# Scratch cell: per-sample outer product via batched matrix multiply, followed
# by a linear projection, i.e. the building blocks of one cross layer.
# NOTE(review): V is a fastai helper; presumably it wraps a tensor in an
# autograd Variable (and may move it to the GPU) -- confirm.
bs = 5 # batch size
p = 10 # vec dimension
u0 = torch.ones(bs, p)  # (bs, p)

# (bs, p, 1) x (bs, 1, p) -> (bs, p, p): outer product of each row with itself
u1 = torch.bmm(V(u0.unsqueeze(2)), V(u0.unsqueeze(1)))

# projects the last dimension p -> 1; the layer is placed on the GPU
w1 = nn.Linear(p, 1).cuda()

u1.size()

type(w1(V(u1))), type(u0.unsqueeze(2))

# w1(u1) is (bs, p, 1); adding u0 reshaped to (bs, p, 1) gives the residual form
torch.add(w1(V(u1)), V(u0.unsqueeze(2)))

### EMBEDDING MODEL

In [6]:
# Make CUDA device 0 the current device for subsequent GPU allocations.
torch.cuda.set_device(0)

In [None]:
# Load the train_small / val_small frames from Feather files; they are
# sub-sampled in the next cell and supply the target labels.
train_small_data = pd.read_feather("../../../data/talking/train_small_data.feather")
val_small_data = pd.read_feather("../../../data/talking/val_small_data.feather")

In [15]:
# Draw a reproducible 3% random sample (without replacement) from the
# train_small and val_small frames.
sample_ratio = 0.03
np.random.seed(7)
# Sample row *positions*, not index labels: the rows are fetched with .iloc
# below, which is positional, so sampling labels is only correct when the
# index happens to be the default 0..n-1. For such a default index the same
# rows are drawn as before for a given seed.
train_idx = np.random.choice(len(train_small_data), size=int(len(train_small_data)*sample_ratio), replace=False)
val_idx = np.random.choice(len(val_small_data), size=int(len(val_small_data)*sample_ratio), replace=False)

# Materialise the samples with a fresh 0..n-1 index.
train_small_sample = train_small_data.iloc[train_idx].reset_index(drop=True)
val_small_sample = val_small_data.iloc[val_idx].reset_index(drop=True)

In [7]:
# Load the combined feature frame; it is split into train / validation / test
# partitions by row position further down.
full_data = pd.read_feather("../../../data/talking/nn_small_sample_full_data.feather")

In [16]:
# Row counts used to slice full_data into train / validation / test partitions.
# NOTE(review): hard-coded; presumably these equal len(train_small_sample),
# len(val_small_sample) and the test-set length -- confirm whenever
# sample_ratio or the input files change.
train_nrows, val_nrows, test_nrows = (1916846, 687869, 18790469)

In [17]:
# Partition full_data by row position: the first train_nrows rows are the
# training set, the next val_nrows rows the validation set, and the last
# test_nrows rows the test set.
trn_df = full_data.iloc[:train_nrows]
val_df = full_data.iloc[train_nrows:train_nrows + val_nrows]
test_df = full_data.iloc[-test_nrows:]

In [18]:
# Target labels (column is_attributed) taken from the sampled frames.
# NOTE(review): the features come from full_data while the labels come from
# the *_small_sample frames; this presumably relies on both having the same
# row order -- confirm.
trn_y = train_small_sample.is_attributed
val_y = val_small_sample.is_attributed # per the original note: from_data_frames expects index 0, 1, ...

In [19]:
# Renumber the index to 0..n-1 in place, discarding the old index. val_df and
# test_df are slices of full_data that do not start at row 0, so their
# original index does not start at 0.
# NOTE(review): in-place modification of a slice may emit pandas'
# SettingWithCopyWarning -- confirm this is acceptable.
val_df.reset_index(drop=True, inplace=True)
val_y.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [24]:
# Categorical columns; these are passed as cat_flds when the model data is built.
cats = [
    'ip', 'app', 'device', 'os', 'channel',
    'click_timeDay', 'click_timeHour',
]

# (column name, number of distinct values) for each categorical column
cat_sz = [(name, len(full_data[name].unique())) for name in cats]

# (cardinality, embedding width): half the cardinality rounded up, capped at 50
# NOTE(review): the cardinality becomes the embedding table size, so this
# assumes the column values are codes in 0..cardinality-1 -- confirm.
emb_szs = [(card, min(50, (card + 1) // 2)) for _, card in cat_sz]

# every column of full_data that is not categorical is counted as continuous
n_conts = len(full_data.columns) - len(cats)

In [25]:
### Changed Validation Batch Size to be large enough for roc auc calculation

class ColumnarModelData(ModelData):
    """Model data for columnar datasets with an enlarged validation batch.

    Training and test loaders use batch size ``bs``; the validation loader
    uses ``bs*50`` so that each validation batch is large enough for a
    ROC AUC calculation.
    """

    def __init__(self, path, trn_ds, val_ds, bs, test_ds=None, shuffle=True):
        """Wrap the datasets in DataLoaders and pass them to ``ModelData``.

        Only the training loader honours ``shuffle``; the validation and test
        loaders never shuffle. ``test_ds`` is optional.
        """
        if test_ds is None:
            test_loader = None
        else:
            test_loader = DataLoader(test_ds, bs, shuffle=False, num_workers=1)
        train_loader = DataLoader(trn_ds, bs, shuffle=shuffle, num_workers=1)
        # increased validation batch size
        valid_loader = DataLoader(val_ds, bs*50, shuffle=False, num_workers=1)
        super().__init__(path, train_loader, valid_loader, test_loader)

    @classmethod
    def from_arrays(cls, path, val_idxs, xs, y, is_reg=True, is_multi=False, bs=64, test_xs=None, shuffle=True):
        """Build model data from arrays, holding out the rows in ``val_idxs``.

        Each column of ``xs`` (via ``xs.T``) becomes one positional input of a
        ``PassthruDataset``; the test dataset, if any, gets dummy zero targets.
        """
        (val_xs, trn_xs), (val_y, trn_y) = split_by_idx(val_idxs, xs, y)
        flags = dict(is_reg=is_reg, is_multi=is_multi)
        test_ds = None
        if test_xs is not None:
            test_ds = PassthruDataset(*(test_xs.T), [0] * len(test_xs), **flags)
        train_ds = PassthruDataset(*(trn_xs.T), trn_y, **flags)
        valid_ds = PassthruDataset(*(val_xs.T), val_y, **flags)
        return cls(path, train_ds, valid_ds, bs=bs, shuffle=shuffle, test_ds=test_ds)

    @classmethod
    def from_data_frames(cls, path, trn_df, val_df, trn_y, val_y, cat_flds, bs, is_reg, is_multi, test_df=None):
        """Build model data from separate train / validation (/ test) frames.

        ``cat_flds`` names the categorical columns; the test dataset is built
        with ``None`` as its target.
        """
        test_ds = None
        if test_df is not None:
            test_ds = ColumnarDataset.from_data_frame(test_df, cat_flds, None, is_reg, is_multi)
        train_ds = ColumnarDataset.from_data_frame(trn_df, cat_flds, trn_y, is_reg, is_multi)
        valid_ds = ColumnarDataset.from_data_frame(val_df, cat_flds, val_y, is_reg, is_multi)
        return cls(path, train_ds, valid_ds, bs, test_ds=test_ds)

    @classmethod
    def from_data_frame(cls, path, val_idxs, df, y, cat_flds, bs, is_reg=True, is_multi=False, test_df=None):
        """Split ``df``/``y`` by ``val_idxs`` and delegate to ``from_data_frames``."""
        (val_df, trn_df), (val_y, trn_y) = split_by_idx(val_idxs, df, y)
        return cls.from_data_frames(
            path, trn_df, val_df, trn_y, val_y, cat_flds, bs, is_reg, is_multi,
            test_df=test_df)

    def get_learner(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops,
                    y_range=None, use_bn=False, **kwargs):
        """Return a ``StructuredLearner`` around a ``MixedInputModel``.

        The model is passed through ``to_gpu`` and wrapped in a
        ``StructuredModel``; Adam is the optimiser and ``kwargs`` are
        forwarded to the learner.
        """
        net = MixedInputModel(emb_szs, n_cont, emb_drop, out_sz, szs, drops,
                              y_range, use_bn, self.is_reg, self.is_multi)
        wrapped = StructuredModel(to_gpu(net))
        return StructuredLearner(self, wrapped, opt_fn=optim.Adam, **kwargs)


### Deep & Cross Network (CrossDenseNN)

In [26]:
class CrossDenseNN(nn.Module):
    """Deep & Cross style network over categorical and continuous inputs.

    Reference: https://arxiv.org/pdf/1708.05123.pdf

    The embedded categorical columns (after dropout) and the batch-normalised
    continuous columns are concatenated into one feature vector ``x0`` that
    feeds two parallel branches:

    * a dense branch: ``Linear -> ReLU -> (BatchNorm) -> Dropout`` for every
      entry of ``szs``;
    * a cross branch of ``cross_depth`` layers, each computing
      ``x_{l+1} = x0 * (x_l . w_l) + b_l + x_l`` with a scalar bias ``b_l``.

    The outputs of both branches are concatenated and projected to ``out_sz``
    log-probabilities.

    Args:
        emb_szs: iterable of ``(cardinality, embedding_width)`` pairs, one per
            categorical column.
        n_cont: number of continuous columns.
        emb_drop: dropout probability applied to the concatenated embeddings.
        out_sz: number of output classes.
        szs: hidden layer widths of the dense branch.
        drops: dropout probability after each dense layer.
        cross_depth: number of cross layers.
        y_range: stored on the module; not used by ``forward``.
        use_bn: apply batch norm after each dense layer's activation.
        is_reg, is_multi: stored on the module; not used by ``forward``.
    """

    def __init__(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops, cross_depth=6,
                 y_range=None, use_bn=False, is_reg=True, is_multi=False):
        super().__init__()
        self.embs = nn.ModuleList([nn.Embedding(c, s) for c,s in emb_szs])
        for emb in self.embs: emb_init(emb)
        n_emb = sum(e.embedding_dim for e in self.embs)
        self.n_emb, self.n_cont = n_emb, n_cont
        self.cross_depth = cross_depth
        n_in = n_emb + n_cont  # width of the shared input vector x0

        # Dense branch.
        szs = [n_in] + szs
        self.szs = szs
        self.lins = nn.ModuleList([
            nn.Linear(szs[i], szs[i+1]) for i in range(len(szs)-1)])
        self.bns = nn.ModuleList([
            nn.BatchNorm1d(sz) for sz in szs[1:]])
        for o in self.lins: kaiming_normal(o.weight.data)

        # Cross branch and output head. They are built here rather than lazily
        # inside forward() so that their parameters are registered on the
        # module: they then follow .cuda()/.to(), appear in state_dict(), and
        # are visible to any optimizer built from model.parameters() before
        # the first forward pass.
        self.lins2 = nn.ModuleList([nn.Linear(n_in, 1) for _ in range(cross_depth)])
        # The head consumes [dense output, cross output] and honours out_sz.
        self.l_out = nn.Linear(n_in + szs[-1], out_sz)

        self.emb_drop = nn.Dropout(emb_drop)
        self.drops = nn.ModuleList([nn.Dropout(drop) for drop in drops])
        self.bn = nn.BatchNorm1d(n_cont)
        self.use_bn,self.y_range = use_bn,y_range
        self.is_reg = is_reg
        self.is_multi = is_multi

    def forward(self, x_cat, x_cont):
        """Return log-probabilities of shape ``(batch, out_sz)``.

        ``x_cat`` holds one integer code column per embedding (indexed as
        ``x_cat[:, i]``); ``x_cont`` holds the continuous columns.
        """
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embs)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x2 = self.bn(x_cont)
            x = torch.cat([x, x2], 1) if self.n_emb != 0 else x2

        # DNN
        x_dnn = x
        for l,d,b in zip(self.lins, self.drops, self.bns):
            x_dnn = F.relu(l(x_dnn))
            # Normalise the dense activations; the shared input x must stay
            # untouched because the cross branch starts from it as well.
            if self.use_bn: x_dnn = b(x_dnn)
            x_dnn = d(x_dnn)

        # CROSS NN
        x0 = x
        xl = x
        for l in self.lins2:
            # Same value as l(outer(x0, xl)).squeeze(2) + xl, without
            # materialising the (bs, p, p) outer product:
            # sum_j x0_i * xl_j * w_j + b == x0_i * (xl . w) + b
            xl = x0 * F.linear(xl, l.weight) + l.bias + xl  # bs x p

        return F.log_softmax(self.l_out(torch.cat([x_dnn, xl], 1)), dim=1)

In [27]:
# Build the cross/dense network (two hidden layers of 500 units, dropout 0.5
# everywhere, two output classes) and move it to the GPU.
model = CrossDenseNN(
    emb_szs,
    n_cont=n_conts,
    emb_drop=0.5,
    out_sz=2,
    szs=[500, 500],
    drops=[0.5, 0.5],
    is_reg=False,
    is_multi=False,
).cuda()

# Wrap the network for the learner.
bm = BasicModel(model, 'binary_classifier')

# Model data from the pre-split frames: training batch size 128,
# classification (is_reg=False), single-label (is_multi=False).
md = ColumnarModelData.from_data_frames(
    '/tmp',
    trn_df,
    val_df,
    trn_y,
    val_y,
    cat_flds=cats,
    bs=128,
    is_reg=False,
    is_multi=False,
    test_df=test_df,
)

# Learner tying the data and the model together.
learn = StructuredLearner(md, bm)

In [None]:
# Run the learner's learning-rate finder; its results are plotted in the next cell.
learn.lr_find()

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

 89%|████████▉ | 13396/14976 [08:34<01:00, 26.04it/s, loss=0.0164] 

In [None]:
# Plot what the learner's scheduler recorded (presumably loss against
# learning rate from the lr_find run -- confirm).
learn.sched.plot()

In [None]:
# Metrics reported by the learner.
# NOTE(review): `auc` is defined in a later cell of this notebook; that cell
# must have been executed before this one, otherwise this raises NameError.
learn.metrics = [accuracy, auc]

In [None]:
# Baseline accuracy: 1 minus the mean validation label, i.e. the accuracy of
# always predicting class 0 (assumes is_attributed is 0/1 -- confirm).
all_zeros = 1 - val_y.mean()
all_zeros

In [None]:
# auc metric
def auc(preds, targs):
    """Return the ROC AUC of the column-1 scores in `preds` against `targs`.

    Both arguments are converted with `to_np`; only column 1 of `preds`
    (presumably the positive-class score -- confirm) is used for ranking.
    """
    y_true = to_np(targs)
    positive_scores = to_np(preds)[:, 1]
    return roc_auc_score(y_true, positive_scores)

In [None]:
# Train with learning rate 4e-3; the second argument (20) is presumably the
# number of cycles/epochs -- confirm against the learner's fit signature.
lr = 4e-3
learn.fit(lr, 20)

In [None]:
# Grab the first validation batch; it unpacks into three items (presumably
# categorical inputs, continuous inputs and targets -- confirm).
x1,x2,y=next(iter(md.val_dl))

In [None]:
# Manual forward pass of the learner's model on that batch's two input tensors.
out = learn.model(V(x1), V(x2))

### FFM