In [49]:
import os
import gc
import itertools

import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import random
import collections
import time

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
import torch
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data.sampler import SubsetRandomSampler
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR
import random

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [5]:
seed_everything(19)

#Preprocessing

This preprocessing method is more careful with RAM usage, which avoids crashing the kernel when you switch from CPU to GPU.

In [6]:
%%time
train_transaction = pd.read_csv('../input/train_transaction.csv', index_col='TransactionID')
test_transaction = pd.read_csv('../input/test_transaction.csv', index_col='TransactionID')

train_identity = pd.read_csv('../input/train_identity.csv', index_col='TransactionID')
test_identity = pd.read_csv('../input/test_identity.csv', index_col='TransactionID')

sample_submission = pd.read_csv('../input/sample_submission.csv', index_col='TransactionID')

train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)

print(train.shape)
print(test.shape)

y_train = train['isFraud'].copy()
del train_transaction, train_identity, test_transaction, test_identity
gc.collect()

# Drop target, fill in NaNs
X_train = train.drop('isFraud', axis=1)
X_test = test.copy()

del train, test
gc.collect()

X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# Label Encoding
for f in X_train.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))   

(590540, 433)
(506691, 432)
CPU times: user 1min 35s, sys: 12.9 s, total: 1min 48s
Wall time: 1min 48s


# RAM optimization

In [7]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [8]:
scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.fit_transform(X_test)
del X_train, X_test
gc.collect()

21

In [64]:
y_train.reset_index(drop = True, inplace=True)

In [34]:
"""%%time
X_train_norm = reduce_mem_usage(X_train_norm)
X_test_norm = reduce_mem_usage(X_test_norm)"""

'%%time\nX_train_norm = reduce_mem_usage(X_train_norm)\nX_test_norm = reduce_mem_usage(X_test_norm)'

In [14]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [91]:
class GetData(Dataset):
    def __init__(self, datatype = "train"):
        self.datatype = datatype
        if self.datatype == "train":
            self.labels = torch.FloatTensor(y_train.values).reshape(-1,1)
            self.features = torch.FloatTensor(X_train_norm)
        else:
            self.labels = np.zeros((X_test_norm.shape[0],1))
            self.features = torch.FloatTensor(X_test_norm)
            
    def __len__(self):
        return len(self.features)
    
    
    def __getitem__(self, idx):
        row = self.features[idx]
        label = self.labels[idx]
        
        return row, label
        

In [92]:
dataset = GetData(datatype = "train")
test_set = GetData(datatype = "test")
tr, val = train_test_split(y_train, stratify= y_train, test_size = 0.2)
train_sampler = SubsetRandomSampler(list(tr.index))
valid_sampler = SubsetRandomSampler(list(val.index))
batch_size = 500
num_workers = 0

train_loader = torch.utils.data.DataLoader(dataset, batch_size = batch_size, 
                                           sampler = train_sampler,
                                           num_workers = num_workers )
valid_loader = torch.utils.data.DataLoader(dataset, batch_size = batch_size, 
                                           sampler = valid_sampler,
                                           num_workers = num_workers )
test_loader = torch.utils.data.DataLoader(test_set, batch_size = batch_size, 
                                          num_workers = num_workers)

In [90]:
y_train.shape

(590540,)

In [86]:
class NeuralFraud(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(X_train_norm.shape[1], 500)
        self.relu1 = nn.ReLU()
        self.dp1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(500, 2048)
        self.relu2 = nn.ReLU()
        self.dp2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(2048, 512)
        self.relu3 = nn.ReLU()
        self.dp3 = nn.Dropout(0.2)
        self.fc4 = nn.Linear(512, 64)
        self.relu4 = nn.ReLU()
        self.dp4 = nn.Dropout(0.2)
        self.last_linear = nn.Linear(64, 1)
        self.out_act = nn.Sigmoid()
        
        
    def forward(self, input):
        a1 = self.fc1(input)
        h1 = self.relu1(a1)
        drp1 = self.dp1(h1)
        a2 = self.fc2(drp1)
        h2 = self.relu2(a2)
        drp2 = self.dp2(h2)
        a3 = self.fc3(drp2)
        h3 = self.relu3(a3)
        drp3 = self.dp3(h3)
        a4 = self.fc4(drp3)
        h4 = self.relu4(a4)
        drp4 = self.dp4(h4)
        a5 = self.last_linear(drp4)
        y = self.out_act(a5)
        return y
    
    
    def predict(self, x):
        
        pred = F.softmax(self.forward(x))
        return torch.tensor(pred)
      
        
        
        

In [97]:
def train_model(model, train_loader, valid_loader, criterion, optimizer, n_epochs=15):
    model.to(device)
    valid_loss_min = np.Inf
    patience = 5
    # current number of epochs, where validation loss didn't increase
    p = 0
    # whether training should be stopped
    stop = False

    # number of epochs to train the model
    for epoch in range(1, n_epochs+1):
        print(time.ctime(), 'Epoch:', epoch)

        train_loss = []
        train_auc = []

        for batch_i, (data, target) in enumerate(train_loader):

            data, target = data.cuda(), target.cuda()

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target.float())
            train_loss.append(loss.item())

            a = target.data.cpu().numpy()
            b = output[:,-1].detach().cpu().numpy()
            #train_auc.append(roc_auc_score(a, b))
            loss.backward()
            optimizer.step()

        model.eval()
        val_loss = []
        val_auc = []
        for batch_i, (data, target) in enumerate(valid_loader):
            data, target = data.cuda(), target.cuda()
            output = model(data)

            loss = criterion(output, target.float())

            val_loss.append(loss.item()) 
            a = target.data.cpu().numpy()
            b = output[:,-1].detach().cpu().numpy()
            #val_auc.append(roc_auc_score(a, b))

        # print(f'Epoch {epoch}, train loss: {np.mean(train_loss):.4f}, valid loss: {np.mean(val_loss):.4f}, train auc: {np.mean(train_auc):.4f}, valid auc: {np.mean(val_auc):.4f}')
        print(f'Epoch {epoch}, train loss: {np.mean(train_loss):.4f}, valid loss: {np.mean(val_loss):.4f}.')

        valid_loss = np.mean(val_loss)
        scheduler.step(valid_loss)
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
            torch.save(model.state_dict(), 'model.pt')
            valid_loss_min = valid_loss
            p = 0

        # check if validation loss didn't improve
        if valid_loss > valid_loss_min:
            p += 1
            print(f'{p} epochs of increasing val loss')
            if p > patience:
                print('Stopping training')
                stop = True
                break        

        if stop:
            break
    return model

In [143]:
model = NeuralFraud()
loss = nn.MSELoss()
#loss = nn.BCELoss()
lr = 0.001
n_epochs = 50
optimizer = optim.SGD(model.parameters(), lr = lr, momentum = 0.99)
#optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = lr_scheduler.StepLR(optimizer, step_size = 3, gamma = 0.1)
#scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, factor = 0.8, patience=2)

In [141]:
trained_model = train_model(model, train_loader, valid_loader, criterion = loss, 
                              optimizer = optimizer, n_epochs=n_epochs)

Sun Sep  8 17:49:54 2019 Epoch: 1
Epoch 1, train loss: 0.1209, valid loss: 0.1085.
Validation loss decreased (inf --> 0.108524).  Saving model ...
Sun Sep  8 17:50:04 2019 Epoch: 2
Epoch 2, train loss: 0.1059, valid loss: 0.1024.
Validation loss decreased (0.108524 --> 0.102445).  Saving model ...
Sun Sep  8 17:50:13 2019 Epoch: 3
Epoch 3, train loss: 0.1021, valid loss: 0.1014.
Validation loss decreased (0.102445 --> 0.101357).  Saving model ...
Sun Sep  8 17:50:22 2019 Epoch: 4
Epoch 4, train loss: 0.0997, valid loss: 0.0959.
Validation loss decreased (0.101357 --> 0.095889).  Saving model ...
Sun Sep  8 17:50:31 2019 Epoch: 5
Epoch 5, train loss: 0.0978, valid loss: 0.0957.
Validation loss decreased (0.095889 --> 0.095653).  Saving model ...
Sun Sep  8 17:50:40 2019 Epoch: 6
Epoch 6, train loss: 0.0956, valid loss: 0.0944.
Validation loss decreased (0.095653 --> 0.094377).  Saving model ...
Sun Sep  8 17:50:50 2019 Epoch: 7
Epoch 7, train loss: 0.0939, valid loss: 0.0973.
1 epochs o

In [132]:
def test_model(model, test_loader):
    result = np.array([])
    sub = sample_submission
    model.eval()
    for (data, target) in test_loader:
        data = data.cuda()
        output = model(data)
        output = output.cpu().detach().numpy()
        result = np.concatenate((result, output), axis=None)
    print( "done")
    return result

In [133]:
output = test_model(model, test_loader)

done


In [135]:
sample_submission['isFraud'] = output
sample_submission.to_csv('NNfraud.csv')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
