### Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import os
import glob
import math

from shutil import rmtree

In [2]:
tf.random.set_seed(2023)
import random
random.seed(2023)
np.random.seed(2023)

In [3]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [4]:
tf.config.list_physical_devices('GPU')

[]

### Clean folder

In [5]:
chckpt_path = './models/'
results_path = './results/'
if os.path.exists(chckpt_path):
    rmtree(chckpt_path)
if os.path.exists(results_path):
    rmtree(results_path)

### Load data

In [6]:
path = "../data/train/"
all_files = glob.glob(os.path.join(path, "*.csv"))

data = pd.concat((pd.read_csv(f, sep="\t") for f in all_files), ignore_index=True)

In [7]:
train_data = data

### Preprocess data

In [8]:
cat_train = train_data.iloc[:,2:33]
del cat_train["f_7"] #Only one value --> useless
bin_train = train_data.iloc[:,33:42]
num_train = train_data.iloc[:,42:80]
labels_train = train_data.iloc[:,80:82]

In [9]:
# Categorical variables
cat_train_selected = cat_train.to_numpy()

# Numerical variables : estimate missing values and normalize 
imputer = IterativeImputer(max_iter=10, random_state=0)
num_train_selected = imputer.fit_transform(num_train)
scaler = MinMaxScaler()
num_train_selected = scaler.fit_transform(num_train_selected)

# Binary variables
bin_train_selected = bin_train.to_numpy()

# Output variables
y_train = labels_train
y_train_is_installed = y_train.iloc[:,1]

In [10]:
for col_ind in range(cat_train_selected.shape[1]):

    unique_values = np.unique(cat_train_selected[:, col_ind][~np.isnan(cat_train_selected[:,col_ind])]).astype(int)

    # Make categorical variables from 1 to n (n corresponding to the number of unique values for the corresponding categorical feature)
    replacement_dict = dict()
    for index, val in enumerate(unique_values):
        index+=1
        replacement_dict[val] = index

    # Process training data (categorical)
    for line_ind in range(len(cat_train_selected[:,col_ind])):
        if math.isnan(cat_train_selected[line_ind, col_ind]): # 0 used for missing values
            cat_train_selected[line_ind, col_ind] = 0
        else:
            cat_train_selected[line_ind, col_ind] = replacement_dict[int(cat_train_selected[line_ind, col_ind])] # Use the new value (from 1 to n)

cat_train_selected = cat_train_selected.astype(int)

In [11]:
# Compute bias to help the model with imbalanced dataset
neg, pos = np.bincount(y_train_is_installed) 
total = neg + pos 
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format( total, pos, 100 * pos / total)) 
initial_bias = np.log([pos/neg]) 
print(initial_bias)

Examples:
    Total: 3485852
    Positive: 606602 (17.40% of total)

[-1.55741223]


In [12]:
embed_size = 256

# Inputs
cat_input_layer = layers.Input(shape=(cat_train_selected.shape[1],), dtype=tf.int64)
bin_input_layer = layers.Input(shape=(bin_train_selected.shape[1],), dtype=tf.int64)
num_input_layer = layers.Input(shape=(num_train_selected.shape[1],), dtype=tf.float64)

embedding_layers = []
for i in range(cat_train_selected.shape[1]):
    num_values = len(set(cat_train_selected[:,i]))
    if num_values <= embed_size:
        embedding_layers.append(layers.Embedding(input_dim=num_values+1, output_dim=num_values, input_length=1, mask_zero=True)(cat_input_layer[:,i]))
    else:
        embedding_layers.append(layers.Embedding(input_dim=num_values+1, output_dim=embed_size, input_length=1, mask_zero=True)(cat_input_layer[:,i]))

bin_dense_layer = layers.Dense(64, activation='relu')(bin_input_layer)
num_dense_layer = layers.Dense(64, activation='relu')(num_input_layer)

# Concat all inputs
concatted = tf.keras.layers.Concatenate()([bin_dense_layer, num_dense_layer, *embedding_layers])

# Hidden layers
hidden_layer_1 = layers.Dense(500, activation='relu')(concatted)
hidden_layer_2 = layers.Dense(250, activation='relu')(hidden_layer_1)
hidden_layer_3 = layers.Dense(50, activation='relu')(hidden_layer_2)
hidden_layer_4 = layers.Dense(100, activation='relu')(hidden_layer_3)
hidden_layer_5 = layers.Dense(40, activation='relu')(hidden_layer_4)

# Outputs
output_bias = tf.keras.initializers.Constant(initial_bias)

#output_1 = layers.Dense(1, activation='sigmoid', name="is_clicked", bias_initializer=output_bias)(hidden_layer_5) # If we want to predict "is_clicked", use this output (give two outputs to the model instead of one).
output_2 = layers.Dense(1, activation="sigmoid", name="is_installed", bias_initializer=output_bias)(hidden_layer_5)

# Create model
model = keras.Model(inputs=[cat_input_layer, bin_input_layer, num_input_layer], outputs=[output_2])
# model = keras.Model(inputs=[cat_input_layer, bin_input_layer, num_input_layer], outputs=[output_1, output_2]) # If we want to predict "is_clicked" and "is_installed"

# Compile model
batch_size = 5000 
learning_rate=0.001
optimizer = keras.optimizers.Adam(lr=learning_rate)

if len(model.outputs)>1:
    monitor_name = 'val_is_installed_loss'
else:
    monitor_name = "val_loss"
# early_stopping = tf.keras.callbacks.EarlyStopping(monitor=monitor_name, patience=3)
mcp_save = tf.keras.callbacks.ModelCheckpoint(filepath= chckpt_path + '{epoch:04d}', save_best_only=False, save_weights_only=False, save_freq='epoch')

acc = tf.metrics.BinaryAccuracy(threshold=0.5)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy', acc])  



In [13]:
model.fit((cat_train_selected, bin_train_selected, num_train_selected), y_train_is_installed, epochs = 3, batch_size=batch_size, callbacks=[mcp_save])

Epoch 1/3


INFO:tensorflow:Assets written to: ./models/0001/assets


Epoch 2/3


INFO:tensorflow:Assets written to: ./models/0002/assets


Epoch 3/3


INFO:tensorflow:Assets written to: ./models/0003/assets




<keras.src.callbacks.History at 0x117770250>

In [14]:
list_of_models= glob.glob(chckpt_path+"/*") 
latest_model = max(list_of_models, key=os.path.getctime)
# print(latest_file)
model = tf.keras.models.load_model(latest_model)

In [15]:
def compute_metrics(y_true, y_pred):
    y_pred[y_pred>=0.5]=1
    y_pred[y_pred<0.5]=0

    cm = confusion_matrix(y_true, y_pred)

    tn, fp, fn, tp = cm.ravel()

    tpr = round(tp / (tp+fn),4)
    fpr = round(tp / (fp+tn),4)
    tnr = round(tn / (tn+fp),4)
    fnr = round(fn / (fn+tp),4)
    acc = round((tp + tn) / (tp+fn+fp+tn),4)
    precision = round(tp / (tp + fp),4)
    f1 = round(2 * (precision * tpr) / (precision + tpr),4)
    
    return tpr, fpr, tnr, fnr, acc, precision, f1

In [16]:
# Train predictions
y_pred_train = model.predict((cat_train_selected, bin_train_selected, num_train_selected), batch_size=batch_size)
tpr_train, fpr_train, tnr_train, fnr_train, acc_train, precision_train, f1_train = compute_metrics(y_train_is_installed, y_pred_train)

# Train dumb predictions
y_pred_train_dumb = np.zeros(y_train_is_installed.shape)
tpr_train_dumb, fpr_train_dumb, tnr_train_dumb, fnr_train_dumb, acc_train_dumb, precision_train_dumb, f1_train_dumb = compute_metrics(y_train_is_installed, y_pred_train_dumb)



  precision = round(tp / (tp + fp),4)


In [17]:
tpr_train, fpr_train, tnr_train, fnr_train, acc_train, precision_train, f1_train, acc_train_dumb

(0.5318, 0.112, 0.8888, 0.4682, 0.8266, 0.5018, 0.5164, 0.826)