In [1]:
from io import StringIO
import pandas as pd
import boto3
import random
import missingno as msno
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from category_encoders import TargetEncoder

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn, optim

from imblearn.over_sampling import SMOTE
import torch.nn.functional as F

# TRAIN dataset

In [2]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


  return torch._C._cuda_getDeviceCount() > 0


In [3]:
chunk_size = 100000
fileObj = pd.read_csv('train.csv', chunksize=chunk_size)
train_df = next(fileObj)
train_df

Unnamed: 0,ID,Click,F01,F02,F03,F04,F05,F06,F07,F08,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TRAIN_00000000,1,NSLHFNS,AVKQTCL,DTZFPRW,114.0,ISVXFVA,1,PQZBVMG,LPYPUNA,...,NZGEZLW,GTISJWW,380.0,2.0,AXQFZWC,IRUDRFB,,TFJMLCZ,0.0,AURZYDY
1,TRAIN_00000001,0,VGIVWZQ,LSUSMVO,PQGWFJZ,26.0,NFRVLWS,43,IMPIGJT,MIGYEEG,...,NZGEZLW,GTISJWW,466.0,1.0,DRVVDHZ,IRUDRFB,19.0,AUGTURV,0.0,LUZRMLU
2,TRAIN_00000002,0,JCDXFYU,PILDDJU,IAGJDOH,119.0,LFPUEOV,0,FFUTIRZ,OFKQGTY,...,VHXETCF,KHZNEZF,197.0,0.0,QMOULXS,IRUDRFB,8.0,ZVSTLNM,0.0,MHBRSQK
3,TRAIN_00000003,1,PSMFWTP,ZYAVJHP,,15.0,ATQPZSJ,26,ZDTZNSB,THBWWCD,...,IVIRTPR,GTISJWW,8640.0,0.0,IZLJUJS,IRUDRFB,14.0,ZBSRLCQ,0.0,GAZBSSZ
4,TRAIN_00000004,0,SLCRICD,QPQWGXA,,13.0,CHZGJZR,20,PQZBVMG,MIGYEEG,...,NZGEZLW,WHSRKIM,41774.0,0.0,BHBIZCL,IRUDRFB,13.0,QHYLSBX,0.0,QTATWAY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,TRAIN_00099995,0,JCDXFYU,PILDDJU,IAGJDOH,1.0,LFPUEOV,2,VGRLUPY,FTPHMPQ,...,SLXYBBG,KHZNEZF,318.0,0.0,QMOULXS,IRUDRFB,3.0,WNSPQGR,0.0,YXVIFAG
99996,TRAIN_00099996,0,JCDXFYU,PILDDJU,IAGJDOH,2.0,LFPUEOV,73,YVEGUNH,FTPHMPQ,...,EFSXYXY,KHZNEZF,7.0,0.0,QMOULXS,IRUDRFB,1.0,ZVSTLNM,0.0,KRLTMNT
99997,TRAIN_00099997,0,LLKAVMO,EKJSVRG,,3.0,YKHABYT,0,ZGTXJTG,LPYPUNA,...,NZGEZLW,GTISJWW,3960.0,0.0,ABEHJLN,IRUDRFB,8.0,NOTFWKW,0.0,WSHFVYH
99998,TRAIN_00099998,0,JCDXFYU,PILDDJU,IAGJDOH,16.0,LFPUEOV,243,UVDZYOW,VAWXMCR,...,NZGEZLW,KHZNEZF,2884.0,0.0,QMOULXS,IRUDRFB,8.0,ZVSTLNM,0.0,BBWWNVH


In [4]:
# train_df = pd.read_csv('train.csv')
# train_df.head()

# Neural Network PreProcess

In [5]:
# handle missing data (data imputation)
# Imputation strategies
# 1. if column is string, fill up with mode value (NOPE!)
# 1. if column is string, fill up with "UNKNOWN"
# 2. else column is numerical, fill up with median values
for col in train_df.columns:
    if train_df[col].dtype == 'object':
        train_df[col] = train_df[col].fillna('UNKNOWN')
    else:
        train_df[col] = train_df[col].fillna(train_df[col].median())

In [6]:
# exclude true label Click 
numerical_features = train_df.select_dtypes(include=['int64', 'float64', 'int32', 'int8', 'int16']).columns.drop('Click')
# exclude unique label, ID
categorical_features = train_df.select_dtypes(include=['object']).columns.drop('ID')

categorical_features, numerical_features

(Index(['F01', 'F02', 'F03', 'F05', 'F07', 'F08', 'F09', 'F10', 'F12', 'F13',
        'F15', 'F16', 'F17', 'F20', 'F21', 'F22', 'F23', 'F25', 'F26', 'F28',
        'F30', 'F31', 'F34', 'F35', 'F37', 'F39'],
       dtype='object'),
 Index(['F04', 'F06', 'F11', 'F14', 'F18', 'F19', 'F24', 'F27', 'F29', 'F32',
        'F33', 'F36', 'F38'],
       dtype='object'))

In [7]:
# Scale numerical features
scaler = StandardScaler()
train_df[numerical_features] = scaler.fit_transform(train_df[numerical_features])

In [8]:
numerical_features

Index(['F04', 'F06', 'F11', 'F14', 'F18', 'F19', 'F24', 'F27', 'F29', 'F32',
       'F33', 'F36', 'F38'],
      dtype='object')

In [9]:
# Encode categorical features
for feature in categorical_features:
    train_df[feature] = train_df[feature].astype('category').cat.codes


In [10]:
train_df

Unnamed: 0,ID,Click,F01,F02,F03,F04,F05,F06,F07,F08,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TRAIN_00000000,1,14174,281,4,0.294828,9188,-0.289449,8322,13,...,4179,4,-0.275644,0.077961,783,0,-0.251444,2330,-0.08282,132
1,TRAIN_00000001,0,21909,4322,24,-0.013410,14026,-0.176483,4598,14,...,4179,4,-0.274516,-0.064013,3224,0,0.702442,100,-0.08282,2060
2,TRAIN_00000002,0,9424,5678,12,0.312342,11878,-0.292138,2848,17,...,6426,7,-0.278044,-0.205988,14593,0,0.002925,3183,-0.08282,2137
3,TRAIN_00000003,1,16199,9579,35,-0.051939,812,-0.222207,13342,30,...,2675,4,-0.167323,-0.205988,7893,0,0.384480,3076,-0.08282,1074
4,TRAIN_00000004,0,19025,6125,35,-0.058945,2428,-0.238345,8322,14,...,4179,13,0.267194,-0.205988,1099,0,0.320887,1991,-0.08282,2962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,TRAIN_00099995,0,9424,5678,12,-0.100977,11878,-0.286759,11298,4,...,5542,7,-0.276457,-0.205988,14593,0,-0.315037,2739,-0.08282,4397
99996,TRAIN_00099996,0,9424,5678,12,-0.097475,11878,-0.095794,13169,4,...,1241,7,-0.280535,-0.205988,14593,0,-0.442222,3183,-0.08282,1874
99997,TRAIN_00099997,0,11887,1562,35,-0.093972,26028,-0.292138,13400,13,...,4179,4,-0.228696,-0.205988,36,0,0.002925,1660,-0.08282,4030
99998,TRAIN_00099998,0,9424,5678,12,-0.048437,11878,0.361447,11064,35,...,4179,7,-0.242806,-0.205988,14593,0,0.002925,3183,-0.08282,182


In [11]:
# Train 0.8 / Validate 0.2 split

X = train_df.drop(columns=['Click', 'ID'])
y = train_df['Click']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,random_state=42)

In [12]:
categorical_features

Index(['F01', 'F02', 'F03', 'F05', 'F07', 'F08', 'F09', 'F10', 'F12', 'F13',
       'F15', 'F16', 'F17', 'F20', 'F21', 'F22', 'F23', 'F25', 'F26', 'F28',
       'F30', 'F31', 'F34', 'F35', 'F37', 'F39'],
      dtype='object')

In [13]:
# smote = SMOTE(random_state=42)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [14]:
# Convert data to PyTorch tensors
X_train_cat = torch.tensor(X_train[categorical_features].values, dtype=torch.long).to(device)
X_train_num = torch.tensor(X_train[numerical_features].values, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(device)

X_val_cat = torch.tensor(X_val[categorical_features].values, dtype=torch.long).to(device)
X_val_num = torch.tensor(X_val[numerical_features].values, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1).to(device)

In [15]:
# DataLoader
train_dataset = TensorDataset(X_train_cat, X_train_num, y_train_tensor)
val_dataset = TensorDataset(X_val_cat, X_val_num, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=4096, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4096, shuffle=False)

# implement neural network using pytorch

In [16]:
class GateCorssLayer(nn.Module):
    def __init__(self, input_dim, cn_layers=3):
        super(GateCorssLayer, self).__init__()
        self.cn_layers = cn_layers
        self.w = nn.ModuleList([nn.Linear(input_dim, input_dim, bias=False) for _ in range(cn_layers)])
        self.wg = nn.ModuleList([nn.Linear(input_dim, input_dim, bias=False) for _ in range(cn_layers)])
        self.b = nn.ParameterList([nn.Parameter(torch.zeros((input_dim,))) for _ in range(cn_layers)])
        for i in range(cn_layers):
            nn.init.uniform_(self.b[i].data)
        self.activation = nn.Sigmoid()

    def forward(self, x):
        x0 = x
        for i in range(self.cn_layers):
            xw = self.w[i](x)
            xg = self.activation(self.wg[i](x))
            x = x0 * (xw + self.b[i]) * xg + x
        return x

In [17]:
class MultiLayerPerceptron(nn.Module):
    def __init__(self, input_dim, embed_dims, dropout=0.5, output_layer=True):
        super().__init__()
        layers = list()
        for embed_dim in embed_dims:
            layers.append(torch.nn.Linear(input_dim, embed_dim))
            layers.append(torch.nn.BatchNorm1d(embed_dim))
            layers.append(torch.nn.ReLU())
            layers.append(torch.nn.Dropout(p=dropout))
            input_dim = embed_dim

        if output_layer:
            layers.append(torch.nn.Linear(input_dim, 1))
        self.mlp = torch.nn.Sequential(*layers)
        self._init_weight_()

    def _init_weight_(self):
        for m in self.mlp:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)

    def forward(self, x):
        return self.mlp(x)

In [18]:
# Embedding Layer
class FeaturesEmbedding1(torch.nn.Module):
    def __init__(self, field_dims, embed_dim, initializer="xavier"):
        super().__init__()
        self.field_dims = field_dims
        self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.long)
        torch.nn.init.xavier_uniform_(self.embedding.weight)

    def forward(self, x):
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return self.embedding(x)

In [19]:
class GDCN(nn.Module):
    def __init__(self, field_dims, embed_dim, cn_layers=3, mlp_layers=(400, 400, 400), dropout=0.5):
        super(GDCN, self).__init__()
        self.embedding = FeaturesEmbedding1(field_dims, embed_dim)
        self.embed_output_dim = len(field_dims) * embed_dim
        self.cross_net = GateCorssLayer(self.embed_output_dim + len(numerical_features), cn_layers)
        self.mlp = MultiLayerPerceptron(self.embed_output_dim + len(numerical_features), mlp_layers, output_layer=False, dropout=dropout)
        self.fc = nn.Linear(mlp_layers[-1] + self.embed_output_dim + len(numerical_features), 1)

    def forward(self, cat_data, num_data):
        x_cat = self.embedding(cat_data)
        x_cat = x_cat.view(x_cat.size(0), -1)  # Ensure x_cat is reshaped to (batch_size, embed_output_dim)
        #print(f"x_cat shape: {x_cat.shape}")  # Debugging shape
        x = torch.cat([x_cat, num_data], dim=1)
        #print(f"x shape after concat: {x.shape}")  # Debugging shape
        cross_cn = self.cross_net(x)
        #print(f"cross_cn shape: {cross_cn.shape}")  # Debugging shape
        cross_mlp = self.mlp(x)
        #print(f"cross_mlp shape: {cross_mlp.shape}")  # Debugging shape
        pred_y = self.fc(torch.cat([cross_cn, cross_mlp], dim=1))
        #print(f"pred_y shape: {pred_y.shape}")  # Debugging shape
        return pred_y

# Debugging shapes before the model
for cat_inputs, num_inputs, targets in train_loader:
    cat_inputs = cat_inputs.to(device)
    num_inputs = num_inputs.to(device)
    print(f"cat_inputs shape: {cat_inputs.shape}")  # Debugging shape
    print(f"num_inputs shape: {num_inputs.shape}")  # Debugging shape
    break

cat_inputs shape: torch.Size([4096, 26])
num_inputs shape: torch.Size([4096, 13])


In [20]:
class EarlyStopping:
    def __init__(self, patience=5, delta=0.001):
        self.patience = patience
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = float('inf')

    def __call__(self, val_loss, model):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        torch.save(model.state_dict(), 'checkpoint.pt')
        self.val_loss_min = val_loss

In [21]:
# Initialize model, loss function, optimizer, scheduler, and early stopping
field_dims = [train_df[col].nunique() for col in categorical_features]
model = GDCN(field_dims=field_dims, embed_dim=16, cn_layers=3).to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-6)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=3, verbose=True)
early_stopping = EarlyStopping(patience=5, delta=0.001)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.long)


In [22]:
# Evaluate the model
def evaluate_model(model, val_loader, criterion):
    model.eval()
    val_loss = 0.0
    all_outputs = []
    all_targets = []
    with torch.no_grad():
        for cat_inputs, num_inputs, targets in val_loader:
            raw_outputs = model(cat_inputs, num_inputs)
            outputs = torch.sigmoid(raw_outputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
            all_outputs.extend(outputs.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    val_loss /= len(val_loader)
    val_roc_auc = roc_auc_score(all_targets, all_outputs)
    return val_loss, val_roc_auc

In [23]:
# Debugging shapes before the model
for cat_inputs, num_inputs, targets in train_loader:
    cat_inputs = cat_inputs.to(device)
    num_inputs = num_inputs.to(device)
    print(f"cat_inputs shape: {cat_inputs.shape}")  # Debugging shape
    print(f"num_inputs shape: {num_inputs.shape}")  # Debugging shape
    break

cat_inputs shape: torch.Size([4096, 26])
num_inputs shape: torch.Size([4096, 13])


In [None]:
# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for cat_inputs, num_inputs, targets in train_loader:
        optimizer.zero_grad()
        raw_outputs = model(cat_inputs, num_inputs)
        outputs = torch.sigmoid(raw_outputs)
#         print(f"outputs shape: {outputs.shape}")  # Debugging shape
#         print(f"outputs range: min={outputs.min().item()}, max={outputs.max().item()}")  # Debugging range
#         print(f"targets shape: {targets.shape}")  # Debugging shape
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    val_loss, val_roc_auc = evaluate_model(model, val_loader, criterion)
    scheduler.step(val_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {running_loss/len(train_loader):.4f}, Validation Loss: {val_loss:.4f}, Validation ROC-AUC: {val_roc_auc:.4f}")
    early_stopping(val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

# Load the best model
model.load_state_dict(torch.load('checkpoint.pt'))


In [None]:
# Evaluation
model.eval()
with torch.no_grad():
    y_pred_probs = model(X_val_cat, X_val_num)
    y_pred = (y_pred_probs > 0.5).float()
    accuracy = (y_pred.eq(y_val_tensor).sum() / y_val_tensor.shape[0]).item()
    print(f'Accuracy: {accuracy * 100:.2f}%')
    val_roc_auc = roc_auc_score(y_val_tensor.cpu(), y_pred_probs.cpu())
    print(f'Validation ROC-AUC: {val_roc_auc:.4f}')

In [None]:
raise 

# TEST dataset

In [None]:
test_df = pd.read_csv('test.csv')

In [None]:
test_df_id = test_df['ID']
test_df = test_df.drop(['ID'], axis = 1)

In [None]:
# Data Preprocessing - imputation
for col in test_df.columns:
    if test_df[col].dtype == 'object':
        test_df[col] = test_df[col].fillna('UNKNOWN')
    else:
        test_df[col] = test_df[col].fillna(test_df[col].median())
        
for feature in categorical_features:
    test_df[feature] = test_df[feature].astype('category').cat.codes


In [None]:
# exclude true label Click 
numerical_features = train_df.select_dtypes(include=['int64', 'float64', 'int32', 'int8', 'int16']).columns.drop('Click')
# exclude unique label, ID
categorical_features = train_df.select_dtypes(include=['object']).columns.drop('ID')

categorical_features, numerical_features

In [None]:
test_df_id

In [None]:
# Scale the numerical features using the same scaler fitted on the train data
test_df[numerical_features] = scaler.transform(test_df[numerical_features])

# Define categorical and numerical features for the test set
X_test_cat = test_df[categorical_features].values
X_test_num = test_df[numerical_features].values

# Convert to PyTorch tensors and move to GPU
X_test_cat_tensor = torch.tensor(X_test_cat, dtype=torch.long).to(device)
X_test_num_tensor = torch.tensor(X_test_num, dtype=torch.float32).to(device)


In [None]:
# DataLoader
test_dataset = TensorDataset(X_test_cat_tensor, X_test_num_tensor)

test_loader = DataLoader(test_dataset, batch_size=4096, shuffle=False)

In [None]:
test_loader.dataset[0]

In [None]:
# Make predictions on test data
model.eval()
all_outputs = []
with torch.no_grad():
    for cat_inputs, num_inputs in test_loader:
        print(cat_inputs.shape, num_inputs.shape)
        raw_outputs = model(cat_inputs, num_inputs)
        outputs = torch.sigmoid(raw_outputs)
        print(outputs)
        all_outputs.extend(outputs.cpu().numpy())
        
test_preds = np.concatenate(all_outputs, axis=0)


# Create submission file
submission_df = pd.DataFrame({'ID': test_df_id, 'Click': test_preds.flatten()})
submission_df

In [None]:
# Output value counts and save to CSV
submission_df['Click'].value_counts(), submission_df['Click'].mean()

In [None]:
submission_df.to_csv('submission1.csv', index=False)