In [1]:
import torch
from torch_geometric.data import Data
import pandas as pd
import numpy as np
from torch_geometric.loader import DataLoader


In [2]:
# Feature table; later cells rely on its CLIENT_ID and FRIEND_ID columns.
features_friends = pd.read_csv("../data/FINAL_FEATURES_FRIENDS.tsv", sep="\t")
sequences_matrix = pd.read_csv("../data/FINAL_SEQUENCES_MATRIX.tsv", sep="\t")
# Targets table: rows whose TARGET equals the string 'test' are the ones to be scored.
targets_traintest = pd.read_csv("../data/FINAL_TARGETS_DATES_TRAINTEST.tsv", sep="\t")
# The same TSV used to be parsed a second time under this name; keep the name
# as an alias (same object) so existing references still work without re-reading.
target_traintest = targets_traintest

In [3]:
# Keep only the labelled rows: rows to be scored carry TARGET == 'test'.
is_test_row = targets_traintest['TARGET'] == 'test'
targets_train = targets_traintest[~is_test_row]

In [4]:
# Left join on CLIENT_ID: every feature row is kept; rows without a training
# target get NaN in the target columns.
tmp_df = pd.merge(features_friends, targets_train, how='left', on='CLIENT_ID')


In [5]:
# Drop, in place, every row that contains at least one missing value
# (this includes feature rows that found no training target in the left join).
tmp_df.dropna(axis=0, how='any', inplace=True)

In [6]:
# Convert the TARGET labels to integers; Graphs.__getitem__ uses the value
# as an index into np.eye(2) to build the one-hot label.
tmp_df['TARGET'] = tmp_df['TARGET'].astype(int)

In [7]:
# Everything except the identifier, date and label columns is a model feature.
non_feature_columns = ['CLIENT_ID', 'FRIEND_ID', 'RETRO_DT', 'TARGET']
features_columns = tmp_df.columns.drop(non_feature_columns)
# Distinct clients in order of first appearance.
clients_ids = tmp_df['CLIENT_ID'].unique()

In [20]:
import torch.nn.functional as F


class Graphs:
    """Indexable collection that builds one fully connected graph per client.

    ``graphs[i]`` selects the rows of ``df`` whose ``CLIENT_ID`` equals
    ``clients_ids[i]`` and turns every row into a node whose features are the
    ``future_columns`` values. Nodes are connected to every other node
    (no self-loops).

    In training mode (``test=False``) one extra node holding the column-wise
    mean of the client's rows is appended as the LAST node, and ``y`` is the
    one-hot encoding (length 2) of the client's first ``TARGET`` value.

    NOTE(review): in test mode no mean node is appended, so the last node is
    the client's last row rather than a mean node. Code that reads the last
    node's output therefore sees different kinds of nodes in the two modes;
    confirm this is intended.
    """

    def __init__(self, clients_ids, df, future_columns, test=False):
        """Store the inputs; no graph is built until an item is requested.

        Args:
            clients_ids: indexable sequence of client identifiers.
            df: DataFrame with a ``CLIENT_ID`` column, the feature columns
                and, unless ``test`` is True, a ``TARGET`` column holding 0/1.
            future_columns: labels of the columns used as node features.
            test: when True, no label is read and no mean node is appended.
        """
        self.clients_ids = clients_ids
        self.df = df
        self.future_columns = future_columns
        self.test = test

    def normalize(self):
        """Placeholder; currently does nothing."""
        pass

    def __len__(self):
        """Number of clients, i.e. number of graphs."""
        return len(self.clients_ids)

    def __getitem__(self, index):
        """Build the ``Data`` graph for ``clients_ids[index]``.

        Returns:
            ``Data`` with ``x`` (float32, one row per node), ``edge_index``
            (long, shape ``(2, n * (n - 1))``) and ``y`` (long; one-hot of
            length 2 in training mode, empty in test mode).
        """
        rows = self.df.loc[self.df['CLIENT_ID'] == self.clients_ids[index]]
        features = rows[self.future_columns].values

        if self.test:
            # No label at inference time; y stays an empty tensor.
            y = torch.empty(0, dtype=torch.long)
        else:
            label = rows['TARGET'].iloc[0]
            y = torch.as_tensor(np.eye(2, dtype=int)[label], dtype=torch.long)
            # Append the per-feature mean of the client's rows as the last node.
            mean_row = np.array(Graphs.get_mean(features.T)).reshape(1, -1)
            features = np.concatenate((features, mean_row))

        # Keep the features as floats: casting to an integer dtype would
        # truncate every fractional value, including the mean row.
        x = torch.from_numpy(features).to(torch.float)
        return Data(x=x, y=y, edge_index=Graphs.build_edge_idx(len(x)))

    @staticmethod
    def build_edge_idx(num_nodes):
        """Edge index of the complete directed graph on ``num_nodes`` nodes.

        Returns a long tensor of shape ``(2, num_nodes * (num_nodes - 1))``.
        Row 0 holds source nodes and row 1 target nodes; edges are ordered by
        source, then by target, and self-loops are excluded.
        """
        idx = torch.arange(num_nodes, dtype=torch.long)
        src = idx.unsqueeze(1).expand(num_nodes, num_nodes)
        dst = idx.unsqueeze(0).expand(num_nodes, num_nodes)
        # Boolean masking walks the grid row by row, which yields the
        # (source, target) ordering described above.
        off_diagonal = src != dst
        return torch.stack((src[off_diagonal], dst[off_diagonal]))

    @staticmethod
    def get_mean(vector_features):
        """Return a list with ``np.mean`` of each item of ``vector_features``.

        Callers in this file pass a (features x rows) array, so the result
        has one mean per feature.
        """
        return [np.mean(item) for item in vector_features]
    
        
        
        
        
        
        
    

In [14]:
# Training dataset: one graph per labelled client (test defaults to False).
g = Graphs(clients_ids=clients_ids, df=tmp_df, future_columns=features_columns)

In [15]:
# Build the graph of the first client as a quick check of the resulting Data layout.
g[0]

Data(x=[81, 1014], edge_index=[2, 6480], y=[2])

In [54]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch_geometric.nn import GCNConv

class TwoLayerGCN(nn.Module):
    """Two-layer graph convolutional network that returns per-node class logits.

    The forward pass is dropout -> GCNConv -> ReLU -> dropout -> GCNConv.
    The output is NOT passed through a sigmoid: ``nn.CrossEntropyLoss``, which
    this notebook uses for training, expects raw (unnormalized) logits, and
    the prediction section applies its own sigmoid to the model output.
    Squashing the scores here as well would distort the loss and apply the
    sigmoid twice at prediction time.

    Args:
        input_size: number of features per node.
        hidden_size: width of the hidden representation.
        output_size: number of output scores per node.
        dropout: dropout probability applied before each convolution.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.1):
        super().__init__()

        self.conv1 = GCNConv(input_size, hidden_size)
        self.conv2 = GCNConv(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        # Not applied in forward(); kept so that code reading `model.sgm`
        # keeps working. It has no parameters, so checkpoints are unaffected.
        self.sgm = nn.Sigmoid()

    def forward(self, data):
        """Return a ``(num_nodes, output_size)`` tensor of logits.

        ``data`` must expose ``x`` (node features, cast to float here) and
        ``edge_index``.
        """
        x, ed_idx = data.x.float(), data.edge_index
        x = self.dropout(x)
        x = self.conv1(x, ed_idx)
        x = self.relu(x)
        x = self.dropout(x)
        return self.conv2(x, ed_idx)


In [34]:
# 1014 input features per node (matches the x=[81, 1014] shape shown for g[0]),
# 16 hidden units, 2 output scores.
model_nn = TwoLayerGCN(input_size=1014, hidden_size=16, output_size=2)

In [507]:
from tqdm import tqdm
def train(model, DataClass, loss_fn, epochs = 10):
    """Train ``model`` with Adam, one graph per optimisation step.

    For every sample the loss is computed between the model output of the
    LAST node (``out[-1]``) and the sample's ``y``, both cast to float.

    Args:
        model: module called as ``model(sample)``; returns per-node outputs.
        DataClass: dataset supporting ``len()`` and integer indexing; each
            item must expose ``y``.
        loss_fn: callable ``loss_fn(prediction, target)`` returning a scalar tensor.
        epochs: number of passes over the dataset.

    Returns:
        ``(model, train_loss)`` where ``train_loss`` is the list of per-step
        loss values in the order they were computed.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    # Make sure dropout is active even if the model was left in eval mode
    # (predict() switches the model to eval mode).
    model.train()
    train_loss = []
    for epoch in range(epochs):
        for index in range(len(DataClass)):
            # Fetch the sample once: indexing the dataset may rebuild the
            # whole graph on every access.
            sample = DataClass[index]
            optimizer.zero_grad()
            out = model(sample)
            loss = loss_fn(out[-1].float(), sample.y.float())
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
        print(epoch)
    return model,train_loss

In [508]:
# Cross-entropy criterion. It applies log-softmax internally, so its first
# argument should be raw (unnormalized) class scores. train() passes the
# sample's one-hot y cast to float as the target.
loss = nn.CrossEntropyLoss()

In [509]:
# Train with the default number of epochs; `model` is the same object as model_nn.
model, train_losses = train(model=model_nn, DataClass=g, loss_fn=loss)

240.784423828125
-0.0
0.5522956252098083
0.6407848596572876
0.5657762885093689
0
0.5601586103439331
0.5433701872825623
0.45990580320358276
0.6327509880065918
0.5548149943351746
1
0.5505769848823547
0.539155900478363
0.4521910548210144
0.6317821145057678
0.5534007549285889
2
0.5493013858795166
0.5384495854377747
0.4570218324661255
0.638260543346405
0.5548508763313293
3
0.550449013710022
0.5386836528778076
0.4603298008441925
0.6410010457038879
0.5553633570671082
4
0.5508409142494202
0.5387226939201355
0.45702844858169556
0.6380674839019775
0.5543151497840881
5
0.5499603748321533
0.5383859276771545
0.45615842938423157
0.640110969543457
0.5510740876197815
6
0.547279417514801
0.5374847054481506
0.45580819249153137
0.632197380065918
0.5524407625198364
7
0.5484054088592529
0.5378463864326477
0.4585231840610504
0.6414981484413147
0.5513993501663208
8
0.5475409030914307
0.537548303604126
0.45436233282089233
0.6397098898887634
0.5545995235443115
9


In [510]:
# Persist only the trained weights (state_dict) to 'model.pt'; the PREDICT
# section loads them back into model_nn.
torch.save(model.state_dict(), 'model.pt')



## CATBOOST

In [58]:
%%time
X = np.array([Graphs.get_mean(tmp_df.loc[tmp_df.CLIENT_ID == item][features_columns].values.T) for item in pd.unique(tmp_df.CLIENT_ID)])
y = np.array([tmp_df.loc[tmp_df.CLIENT_ID == item]['TARGET'].iloc[0] for item in pd.unique(tmp_df.CLIENT_ID)])

CPU times: user 2.99 s, sys: 3.87 ms, total: 3 s
Wall time: 3 s


In [59]:
from catboost import CatBoostClassifier

In [60]:
# Hyper-parameters of the gradient-boosting baseline, collected in one place.
catboost_params = dict(
    learning_rate=0.05,
    n_estimators=2000,
    max_depth=5,
    verbose=False,
)
catboostmodel = CatBoostClassifier(**catboost_params)

In [61]:

# Fit on whatever X and y currently hold. NOTE: X is reassigned to the test
# features further down, so this cell must run before that one.
catboostmodel.fit(X,y)

<catboost.core.CatBoostClassifier at 0x7f5e74a8aa30>

In [77]:
# Save the fitted CatBoost model to the file 'catboost_model' in the working directory.
catboostmodel.save_model('catboost_model') 

# PREDICT

In [36]:
# Restore the weights written to 'model.pt' in the training section.
# model_nn must already exist and have the same parameter names and shapes.
checkpoint = torch.load('model.pt')
model_nn.load_state_dict(checkpoint)


<All keys matched successfully>

In [37]:
# Display the module structure to check which layers the loaded model has.
model_nn

TwoLayerGCN(
  (conv1): GCNConv(1014, 16)
  (conv2): GCNConv(16, 2)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
)

In [62]:
# Rows to be scored are the ones whose TARGET is the literal string 'test'.
test_row_mask = targets_traintest['TARGET'] == 'test'
targets_test = targets_traintest[test_row_mask]

In [63]:
# Left join keeps every test row, including clients that have no rows in
# features_friends (their feature columns come out as NaN).
merged = targets_test.merge(features_friends, on= 'CLIENT_ID', how = 'left')
# fillna returns a new frame. Assign it, otherwise `merged` keeps its NaNs and
# every later cell (feature means, test graphs) works on missing values.
merged = merged.fillna(0)
merged

Unnamed: 0,CLIENT_ID,RETRO_DT,TARGET,i1047,i1048,i1056,i1058,i1059,i1060,i1065,...,u8=18,u8=19,u8=2,u8=20,u8=21,u8=6,u8=7,u8=8,u8=9,FRIEND_ID
0,1025140,20210501,test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1029732,20210501,test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1079794,20210501,test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1116331,20210501,test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1136822,20210501,test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39749,70294,20210731,test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39750,71247,20210731,test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39751,71657,20210731,test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39752,72631,20210731,test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
# Number of distinct test clients (merged can hold several rows per client).
len(merged['CLIENT_ID'].unique())

31858

In [66]:
%%time
from tqdm import tqdm
X = np.array([Graphs.get_mean(merged.loc[merged.CLIENT_ID == item][features_columns].values.T) for item in tqdm(pd.unique(merged.CLIENT_ID))])


100%|██████████| 31858/31858 [03:19<00:00, 159.57it/s]

CPU times: user 3min 20s, sys: 1.41 s, total: 3min 21s
Wall time: 3min 21s





In [15]:
import gc

In [42]:
# Number of test graphs.
# NOTE(review): GTest is created in the next cell (In [41]); on a fresh
# top-to-bottom run this cell raises NameError unless that cell runs first.
len(GTest)

31858

In [41]:
# Test dataset: one graph per distinct test client, built without labels.
GTest = Graphs(
    clients_ids=pd.unique(merged.CLIENT_ID),
    df=merged,
    future_columns=features_columns,
    test=True,
)

In [38]:
def predict(model, GraphX):
    """Score every graph in ``GraphX`` with ``model``.

    The model is switched to eval mode and run without gradient tracking.
    For each graph, the score is the element at row -1 (last node), column 1
    of the model output, converted to a Python number.

    Args:
        model: module called as ``model(graph)``; returns a 2-D tensor.
        GraphX: dataset supporting ``len()`` and integer indexing.

    Returns:
        List with one score per graph, in dataset order.
    """
    model.eval()
    scores = []
    with torch.no_grad():
        for position in range(len(GraphX)):
            node_outputs = model(GraphX[position])
            scores.append(node_outputs[-1, 1].item())
    return scores

In [44]:
# Raw GCN scores for the test graphs, in the order of GTest.
predict_nn = predict(model_nn, GTest)

In [51]:
import math

def sigmoid(x):
    """Logistic function ``1 / (1 + e**-x)`` for a scalar ``x``.

    Uses the algebraically equivalent form ``e**x / (1 + e**x)`` for negative
    inputs so that ``math.exp`` is never called with a large positive
    argument (which raises ``OverflowError``).
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

In [52]:
# Squash every GCN score into (0, 1).
# NOTE: this rebinds predict_nn, so re-running the cell applies sigmoid again.
predict_nn = list(map(sigmoid, predict_nn))

In [69]:
# Weighted blend of the two models, paired by position: 0.4 * GCN + 0.6 * CatBoost.
# Both sequences must follow the same client order.
# NOTE(review): predict_catboost is assigned in the cells below (In [67] / In [68]);
# on a fresh top-to-bottom run this cell raises NameError unless they run first.
ensemble_predict = [(predict_nn[i] * 0.4) + (predict_catboost[i] * 0.6) for i in range(len(predict_catboost))]

In [67]:
# Class probabilities from CatBoost for the current X (one column per class).
predict_catboost = catboostmodel.predict_proba(X)

In [68]:
# Keep only the column at index 1 of the predict_proba output.
# NOTE: this rebinds the name to a 1-D array, so re-running the cell fails
# (a 1-D array cannot be indexed with [:, 1]).
predict_catboost = predict_catboost[:,1]

In [71]:
# Submission frame: one row per distinct test client, in order of first appearance.
sub = pd.DataFrame({'CLIENT_ID': pd.unique(merged.CLIENT_ID)})

In [74]:
# Attach the blended scores by position; this is only correct if
# ensemble_predict is in the same client order as sub['CLIENT_ID'].
sub['TARGET'] = ensemble_predict

In [76]:
# Write the submission file without the DataFrame index column.
sub.to_csv('Submit.csv', index = False)