In [1]:
# load packages
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0' 

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

#device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False

import random
random.seed(777)
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

In [2]:
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')
sample = pd.read_csv('./data/sample_submission.csv')

In [3]:
train_data.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,target
0,1,Female,disloyal Customer,22,Business travel,Eco,1599,3,0,3,...,4,5,4,4,4,5,4,0,0.0,0
1,2,Female,Loyal Customer,37,Business travel,Business,2810,2,4,4,...,5,5,4,2,1,5,2,18,18.0,0
2,3,Male,Loyal Customer,46,Business travel,Business,2622,1,1,1,...,4,4,4,4,5,4,3,0,0.0,1
3,4,Female,disloyal Customer,24,Business travel,Eco,2348,3,3,3,...,3,2,4,5,3,4,3,10,2.0,0
4,5,Female,Loyal Customer,58,Business travel,Business,105,3,3,3,...,4,4,4,4,4,4,5,0,0.0,1


In [4]:
X = train_data.drop(columns = ['id','target'], axis = 1)

In [5]:
X.shape

(3000, 22)

In [6]:
binary_obj_columns = ['Gender', 'Customer Type', 'Type of Travel']
numerical_columns = ['Age', 'Departure Delay in Minutes', 'Arrival Delay in Minutes', 'Flight Distance']
multical_obj_columns = list(set(X.columns) - set(binary_obj_columns) - set(numerical_columns))

# Set Data

## Binary Data => label mapping

In [7]:
X_train_num = pd.DataFrame()
X_test_num = pd.DataFrame()

for col in binary_obj_columns : 
    map_dict = {key : num for num,key in enumerate(train_data[col].unique())}
    X_train_num[col] = train_data[col].map(map_dict)
    X_test_num[col] = test_data[col].map(map_dict)

In [8]:
X_train_num.head()

Unnamed: 0,Gender,Customer Type,Type of Travel
0,0,0,0
1,0,1,0
2,1,1,0
3,0,0,0
4,0,1,0


## Numerical Data => 4 Grouping

In [9]:
X_train_group = pd.DataFrame()
X_test_group = pd.DataFrame()

for col in numerical_columns :     
    X_train_num[col] = train_data[col]
    X_test_num[col] = test_data[col]
    
    data = train_data[col]
    _, bins = pd.qcut(data, 4, retbins=True, labels=False, duplicates='drop')
    X_train_group[col+'_group'] = train_data[col].apply(lambda x : sum([x >= a for a in bins]))
    X_test_group[col+'_group'] = test_data[col].apply(lambda x : sum([x >= a for a in bins]))

## Multical Data => Label Encoding & MTE

In [10]:
for col in multical_obj_columns : 
    map_dict = {key : num for num, key in enumerate(sorted(train_data[col].unique()))}
    X_train_group[col] = train_data[col].map(map_dict)
    X_test_group[col] = test_data[col].map(map_dict)

# Modeling

## Set Data Loader

In [11]:
num_cols = list(X_train_num.columns)
cat_cols = list(X_train_group.columns)

X_train = pd.concat([X_train_num, X_train_group], axis = 1)
X_test = pd.concat([X_test_num, X_test_group], axis = 1)
Y = train_data['target'].values

inp_oup_dims = [[x , x//2] for x in X_train[cat_cols].nunique()]

print(X_train.shape, X_test.shape, Y.shape)

(3000, 26) (2000, 26) (3000,)


In [12]:
class CustomDataset(Dataset) : 
    
    def __init__(self, x, y, cat_cols, num_cols) : 
        
        self.x_cat = x[cat_cols].copy().values.astype(np.int64)
        self.x_num = x[num_cols].copy().values.astype(np.float32)
        self.y = y.astype(np.float32)
        
    def __len__(self) : 
        return len(self.y)
    
    def __getitem__(self, idx) : 
        return self.x_cat[idx], self.x_num[idx], self.y[idx]

In [13]:
def return_dataloaders(batch_size, random_state = 0) :
    from sklearn.model_selection import train_test_split

    data_range = X_train_group.index

    train_idx, valid_idx = train_test_split(data_range, shuffle = True, stratify = Y, test_size = .5, random_state = random_state)

    X_tr,y_tr = X_train.iloc[train_idx],  Y[train_idx]
    X_val,y_val = X_train.iloc[valid_idx], Y[valid_idx]

    train_ds = CustomDataset(X_tr, y_tr, cat_cols, num_cols)
    valid_ds = CustomDataset(X_val, y_val, cat_cols, num_cols)

    train_dl = DataLoader(train_ds, batch_size=batch_size)
    valid_dl = DataLoader(valid_ds, batch_size=batch_size)

    dataloaders = {}
    dataloaders['train'] = train_dl
    dataloaders['valid'] = valid_dl

    return dataloaders

In [14]:
batch_size = 1024
dataloaders = return_dataloaders(batch_size)

## Set Model

In [15]:
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('BatchNorm') != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)

In [16]:
class FC_Block(nn.Module) : 
    def __init__(self, inp_dim, out_dim) : 
        super(FC_Block, self).__init__()
        self.linear = nn.Linear(inp_dim, out_dim)
        self.batch = nn.BatchNorm1d(out_dim)
    
    def forward(self, x) : 
        x = self.linear(x)
        x = self.batch(x)
        return x

In [17]:
class EMBNN(nn.Module) : 
    def __init__(self, inp_oup_dims, num_continuous) : 
        super(EMBNN, self).__init__()
        
        self.embeddings = nn.ModuleList([
            nn.Embedding(inp_dim+1, out_dim) for inp_dim ,out_dim in inp_oup_dims])
        self.n_emb = sum(e.embedding_dim for e in self.embeddings)
        self.emb_drop = nn.Dropout(0.3)
    
        self.cont_norm = nn.BatchNorm1d(num_continuous)
        self.n_con = num_continuous
        
        self.FFC = nn.Sequential(
                                                                FC_Block(self.n_emb + self.n_con, 32),
                                                                nn.Dropout(0.2),
                                                                FC_Block(32, 8),
                                                                nn.Dropout(0.2),
                                                                nn.Linear(8,1),
                                                                )
        
    def forward(self, x_cat, x_cont) : 
        x_cat = [e(x_cat[:,i]) for i , e in enumerate(self.embeddings)]
        x_cat = torch.cat(x_cat, 1)
        x_cat = self.emb_drop(x_cat)
        x_cont = self.cont_norm(x_cont)
        x = torch.cat([x_cat, x_cont], 1)
        x = self.FFC(x)
        
        return F.sigmoid(x)

## Train

In [18]:
def train_model(model, dataloader, optimizer, criterion, num_epoch, early_stop, model_path) : 
    
    best_val_loss = np.float('inf')
    early_stop_epoch  = 0
    
    for epoch in range(num_epoch) : 
        for phase in ['train','valid'] : 
            if phase == 'train' : 
                model.train()
            elif phase == 'valid' : 
                model.eval()
            running_loss = 0
            running_corr = 0
            total = 0

            for x_cat, x_num ,y in dataloaders[phase] : 
                x_cat = x_cat.to(device)
                x_num = x_num.to(device)
                y = y.to(device)

                optimizer.zero_grad()
                total += x_cat.size(0)

                with torch.set_grad_enabled(phase == 'train') : 
                    output = model(x_cat, x_num)
                    loss =criterion(output.squeeze(), y)

                    if phase == 'train' : 
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item()
                running_corr += (output.round() == y.unsqueeze(1)).sum().item()

            epoch_loss = running_loss / total
            epoch_acc = running_corr/total


            if phase == 'valid' and epoch_loss < best_val_loss : 
                
                best_val_loss = epoch_loss
                best_acc = epoch_acc
                torch.save(model.state_dict(), model_path)
                early_stop_epoch = 0
                best_epoch = epoch

            elif phase == 'valid' : 
                early_stop_epoch += 1

        if (early_stop_epoch >= early_stop) or (epoch == num_epoch-1) : 
            "Early Stop Occured on epoch" + str(epoch)
            print(f'On Epoch {best_epoch}, Best Model Saved with Valid Loss {round(epoch_loss, 6)} and Acc {round(epoch_acc, 4)*100}%')
            break;
            
    model.load_state_dict(torch.load(model_path))
    return model

# Total

In [19]:
def predict(model) : 
    with torch.no_grad() : 
        test_cat = torch.LongTensor(X_test[cat_cols].values)
        test_num = torch.FloatTensor(X_test[num_cols].values)

        pred = model(test_cat, test_num).squeeze()
        
        return pred.cpu().detach().numpy()

In [20]:
def return_pred_with_random_state(rs) : 
    batch_size = 1024
    model =  EMBNN(inp_oup_dims, len(num_cols)).to(device)
    model.apply(weights_init)
    dataloader = return_dataloaders(batch_size, rs)
    optimizer = optim.Adam(model.parameters(), lr = 0.005)
    criterion = nn.BCELoss()
    

    trained_model = train_model(model, dataloader, optimizer, criterion, num_epoch = 300, early_stop = 10, model_path = 'EMBNN.pth')
    pred  = predict(trained_model)
    return pred

In [21]:
preds = []
for i in range(10) : 
    rs = np.random.randint(1,2**15)
    print(f'===ROUND {i} Initiated with rs {rs} === ', end = '\n\t')

    pred = return_pred_with_random_state(rs)
    preds.append(pred)
    print('')

===ROUND 0 Initiated with rs 6354 === 
	On Epoch 90, Best Model Saved with Valid Loss 0.000249 and Acc 92.86999999999999%

===ROUND 1 Initiated with rs 1200 === 
	On Epoch 71, Best Model Saved with Valid Loss 0.00025 and Acc 92.47%

===ROUND 2 Initiated with rs 6187 === 
	On Epoch 82, Best Model Saved with Valid Loss 0.000251 and Acc 92.27%

===ROUND 3 Initiated with rs 20466 === 
	On Epoch 64, Best Model Saved with Valid Loss 0.000272 and Acc 92.0%

===ROUND 4 Initiated with rs 25038 === 
	On Epoch 74, Best Model Saved with Valid Loss 0.000277 and Acc 91.2%

===ROUND 5 Initiated with rs 29534 === 
	On Epoch 56, Best Model Saved with Valid Loss 0.000279 and Acc 91.53%

===ROUND 6 Initiated with rs 25350 === 
	On Epoch 77, Best Model Saved with Valid Loss 0.000258 and Acc 92.13%

===ROUND 7 Initiated with rs 9292 === 
	On Epoch 78, Best Model Saved with Valid Loss 0.000267 and Acc 92.33%

===ROUND 8 Initiated with rs 16615 === 
	On Epoch 68, Best Model Saved with Valid Loss 0.000271 and

In [22]:
pred = np.mean(preds, axis = 0).round().astype(int)
sample['target'] = pred
sample.to_csv('./Jay Hong EMBEDNN PYTORCH.csv', index=False)