### Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import os
import glob
import math

from shutil import rmtree

In [2]:
tf.random.set_seed(2023)
import random
random.seed(2023)
np.random.seed(2023)

In [3]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [4]:
tf.config.list_physical_devices('GPU')

[]

### Clean folder

In [5]:
chckpt_path = './models/'
results_path = './results/'
if os.path.exists(chckpt_path):
    rmtree(chckpt_path)
if os.path.exists(results_path):
    rmtree(results_path)

### Load data

In [6]:
path = "data/train/"
all_files = glob.glob(os.path.join(path, "*.csv"))

data = pd.concat((pd.read_csv(f, sep="\t") for f in all_files), ignore_index=True)

In [7]:
train_data, val_data = train_test_split(data, test_size=0.25)

### Preprocess data

In [8]:
cat_train = train_data.iloc[:,2:33]
del cat_train["f_7"] #Only one value --> useless
bin_train = train_data.iloc[:,33:42]
num_train = train_data.iloc[:,42:80]
labels_train = train_data.iloc[:,80:82]

cat_val = val_data.iloc[:,2:33]
del cat_val["f_7"] #Only one value --> useless
bin_val = val_data.iloc[:,33:42]
num_val = val_data.iloc[:,42:80]
labels_val = val_data.iloc[:,80:82]

In [9]:
#num_train.max()

In [10]:
# Categorical variables
cat_train_selected = cat_train.to_numpy()
cat_val_selected = cat_val.to_numpy()
# cat_test_selected = cat_test.to_numpy()

# Numerical variables : estimate missing values and normalize 
imputer = IterativeImputer(max_iter=10, random_state=0)
num_train_selected = imputer.fit_transform(num_train)
scaler = MinMaxScaler()
num_train_selected = scaler.fit_transform(num_train_selected)

num_val_selected = scaler.transform(imputer.transform(num_val))
# num_test_selected = scaler.transform(imputer.transform(num_test))

# Binary variables
bin_train_selected = bin_train.to_numpy()
bin_val_selected = bin_val.to_numpy()
# bin_test_selected = bin_test.to_numpy()

# Output variables
y_train = labels_train
y_val = labels_val
# y_is_clicked = y.iloc[:,0]
y_train_is_installed = y_train.iloc[:,1]
y_val_is_installed = y_val.iloc[:,1]

In [11]:
for col_ind in range(cat_train_selected.shape[1]):

    unique_values = np.unique(cat_train_selected[:, col_ind][~np.isnan(cat_train_selected[:,col_ind])]).astype(int)
    # test_unique_values = np.unique(cat_test_selected[:, col_ind]).astype(int)

    # Make categorical variables from 1 to n (n corresponding to the number of unique values for the corresponding categorical feature)
    replacement_dict = dict()
    for index, val in enumerate(unique_values):
        index+=1
        replacement_dict[val] = index

    # Process training data (categorical)
    for line_ind in range(len(cat_train_selected[:,col_ind])):
        if math.isnan(cat_train_selected[line_ind, col_ind]): # 0 used for missing values
            cat_train_selected[line_ind, col_ind] = 0
        else:
            cat_train_selected[line_ind, col_ind] = replacement_dict[int(cat_train_selected[line_ind, col_ind])] # Use the new value (from 1 to n)

    # Process validation data (categorical)
    for line_ind in range(len(cat_val_selected[:,col_ind])):
        try:
            if math.isnan(cat_val_selected[line_ind, col_ind]):
                cat_val_selected[line_ind, col_ind] = 0 # 0 used for missing values
            else:
                cat_val_selected[line_ind, col_ind] = replacement_dict[int(cat_val_selected[line_ind, col_ind])] # Use the new value (from 1 to n)
        except KeyError:
            cat_val_selected[line_ind, col_ind] = 0 # If the value was not in the training data, treat as a missing value (because we can't train on it)
    
    # # Process test data (categorical)
    # for line_ind in range(len(cat_test_selected[:,col_ind])):
    #     try:
    #         if math.isnan(cat_test_selected[line_ind, col_ind]):
    #             cat_test_selected[line_ind, col_ind] = 0 # 0 used for missing values
    #         else:
    #             cat_test_selected[line_ind, col_ind] = replacement_dict[int(cat_test_selected[line_ind, col_ind])] # Use the new value (from 1 to n)
    #     except KeyError:
    #         cat_test_selected[line_ind, col_ind] = 0 # If the value was not in the training data, treat as a missing value (because we can't train on it)

cat_train_selected = cat_train_selected.astype(int)
cat_val_selected = cat_val_selected.astype(int)
# cat_test_selected = cat_test_selected.astype(int)

In [12]:
# Compute bias to help the model with imbalanced dataset
neg, pos = np.bincount(y_train_is_installed) 
total = neg + pos 
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format( total, pos, 100 * pos / total)) 
initial_bias = np.log([pos/neg]) 
print(initial_bias)

Examples:
    Total: 2614389
    Positive: 455360 (17.42% of total)

[-1.55632555]


In [13]:
# import tensorflow.keras.backend as K
# tp = tf.keras.metrics.TruePositives()
# fp = tf.keras.metrics.FalsePositives()

# def pos(y_true, y_pred):
#     fp = tf.keras.metrics.FalsePositives()
#     fp.update_state(bigy[test], pred)
#     fp = fp.result().numpy()

#     fn = tf.keras.metrics.FalseNegatives()
#     fn.update_state(bigy[test], pred)
#     fn = fn.result().numpy()

#     tn = tf.keras.metrics.TrueNegatives()
#     tn.update_state(bigy[test], pred)
#     tn = tn.result().numpy()

# def neg():
#     return tf.keras.metrics.TrueNegatives(y_true, y_pred) + tf.keras.metrics.FalseNegatives(y_true, y_pred)

# # def neg(y_true, y_pred):
# #     return K.sum(y_true[y_true==0], axis=0)

# def fpr(y_true, y_pred):
#     return K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0) #/ pos(y_true, y_pred)

# def fnr(y_true, y_pred):
#     return K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0) / neg(y_true, y_pred)

# def tpr(y_true, y_pred):
#     return K.sum(K.cast(y_true*y_pred, 'float'), axis=0) / pos(y_true, y_pred)

# def tnr(y_true, y_pred):
#     return K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0) / neg(y_true, y_pred)

In [14]:
embed_size = 256

# Inputs
cat_input_layer = layers.Input(shape=(cat_train_selected.shape[1],), dtype=tf.int64)
bin_input_layer = layers.Input(shape=(bin_train_selected.shape[1],), dtype=tf.int64)
num_input_layer = layers.Input(shape=(num_train_selected.shape[1],), dtype=tf.float64)

embedding_layers = []
for i in range(cat_train_selected.shape[1]):
    num_values = len(set(cat_train_selected[:,i]))
    if num_values <= embed_size:
        embedding_layers.append(layers.Embedding(input_dim=num_values+1, output_dim=num_values, input_length=1, mask_zero=True)(cat_input_layer[:,i]))
    else:
        embedding_layers.append(layers.Embedding(input_dim=num_values+1, output_dim=embed_size, input_length=1, mask_zero=True)(cat_input_layer[:,i]))

bin_dense_layer = layers.Dense(64, activation='relu')(bin_input_layer)
num_dense_layer = layers.Dense(64, activation='relu')(num_input_layer)

# Concat all inputs
concatted = tf.keras.layers.Concatenate()([bin_dense_layer, num_dense_layer, *embedding_layers])

# Hidden layers
hidden_layer_1 = layers.Dense(500, activation='relu')(concatted)
hidden_layer_2 = layers.Dense(250, activation='relu')(hidden_layer_1)
hidden_layer_3 = layers.Dense(50, activation='relu')(hidden_layer_2)
hidden_layer_4 = layers.Dense(100, activation='relu')(hidden_layer_3)
hidden_layer_5 = layers.Dense(40, activation='relu')(hidden_layer_4)

# Outputs
output_bias = tf.keras.initializers.Constant(initial_bias)

#output_1 = layers.Dense(1, activation='sigmoid', name="is_clicked", bias_initializer=output_bias)(hidden_layer_5) # If we want to predict "is_clicked", use this output (give two outputs to the model instead of one).
output_2 = layers.Dense(1, activation="sigmoid", name="is_installed", bias_initializer=output_bias)(hidden_layer_5)

# Create model
model = keras.Model(inputs=[cat_input_layer, bin_input_layer, num_input_layer], outputs=[output_2])
# model = keras.Model(inputs=[cat_input_layer, bin_input_layer, num_input_layer], outputs=[output_1, output_2]) # If we want to predict "is_clicked" and "is_installed"

# Compile model
batch_size = 5000 
learning_rate=0.001
optimizer = keras.optimizers.Adam(lr=learning_rate)

if len(model.outputs)>1:
    monitor_name = 'val_is_installed_loss'
else:
    monitor_name = "val_loss"
early_stopping = tf.keras.callbacks.EarlyStopping(monitor=monitor_name, patience=3)
# mcp_save = tf.keras.callbacks.ModelCheckpoint(filepath= chckpt_path + '{epoch:04d}', save_best_only=True, save_weights_only=False, monitor='val_loss', mode='min', save_freq='epoch')
mcp_save = tf.keras.callbacks.ModelCheckpoint(filepath= chckpt_path + '{epoch:04d}', save_best_only=False, save_weights_only=False, save_freq='epoch')

model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])  



In [15]:
# tf.keras.utils.plot_model(model, "model.png")

In [16]:
# model.fit((cat_train_selected, bin_train_selected, num_train_selected), y_train_is_installed, epochs = 50, batch_size=batch_size, 
#           validation_data=((cat_val_selected, bin_val_selected, num_val_selected), y_val_is_installed),
#           callbacks=[early_stopping, mcp_save])

model.fit((cat_train_selected, bin_train_selected, num_train_selected), y_train_is_installed, epochs = 50, batch_size=batch_size, 
          validation_data=((cat_val_selected, bin_val_selected, num_val_selected), y_val_is_installed),
          callbacks=[mcp_save])

Epoch 1/50


INFO:tensorflow:Assets written to: ./models/0001/assets


Epoch 2/50


INFO:tensorflow:Assets written to: ./models/0002/assets


Epoch 3/50


INFO:tensorflow:Assets written to: ./models/0003/assets


Epoch 4/50


INFO:tensorflow:Assets written to: ./models/0004/assets


Epoch 5/50


INFO:tensorflow:Assets written to: ./models/0005/assets


Epoch 6/50


INFO:tensorflow:Assets written to: ./models/0006/assets


Epoch 7/50


INFO:tensorflow:Assets written to: ./models/0007/assets


Epoch 8/50


INFO:tensorflow:Assets written to: ./models/0008/assets


Epoch 9/50


INFO:tensorflow:Assets written to: ./models/0009/assets


Epoch 10/50


INFO:tensorflow:Assets written to: ./models/0010/assets


Epoch 11/50


INFO:tensorflow:Assets written to: ./models/0011/assets


Epoch 12/50


INFO:tensorflow:Assets written to: ./models/0012/assets


Epoch 13/50


INFO:tensorflow:Assets written to: ./models/0013/assets


Epoch 14/50


INFO:tensorflow:Assets written to: ./models/0014/assets


Epoch 15/50


INFO:tensorflow:Assets written to: ./models/0015/assets


Epoch 16/50


INFO:tensorflow:Assets written to: ./models/0016/assets


Epoch 17/50


INFO:tensorflow:Assets written to: ./models/0017/assets


Epoch 18/50


INFO:tensorflow:Assets written to: ./models/0018/assets


Epoch 19/50


INFO:tensorflow:Assets written to: ./models/0019/assets


Epoch 20/50


INFO:tensorflow:Assets written to: ./models/0020/assets


Epoch 21/50


INFO:tensorflow:Assets written to: ./models/0021/assets


Epoch 22/50


INFO:tensorflow:Assets written to: ./models/0022/assets


Epoch 23/50


INFO:tensorflow:Assets written to: ./models/0023/assets


Epoch 24/50


INFO:tensorflow:Assets written to: ./models/0024/assets


Epoch 25/50


INFO:tensorflow:Assets written to: ./models/0025/assets


Epoch 26/50


INFO:tensorflow:Assets written to: ./models/0026/assets


Epoch 27/50


INFO:tensorflow:Assets written to: ./models/0027/assets


Epoch 28/50


INFO:tensorflow:Assets written to: ./models/0028/assets


Epoch 29/50


INFO:tensorflow:Assets written to: ./models/0029/assets


Epoch 30/50


INFO:tensorflow:Assets written to: ./models/0030/assets


Epoch 31/50


INFO:tensorflow:Assets written to: ./models/0031/assets


Epoch 32/50


INFO:tensorflow:Assets written to: ./models/0032/assets


Epoch 33/50


INFO:tensorflow:Assets written to: ./models/0033/assets


Epoch 34/50


INFO:tensorflow:Assets written to: ./models/0034/assets


Epoch 35/50


INFO:tensorflow:Assets written to: ./models/0035/assets


Epoch 36/50


INFO:tensorflow:Assets written to: ./models/0036/assets


Epoch 37/50


INFO:tensorflow:Assets written to: ./models/0037/assets


Epoch 38/50


INFO:tensorflow:Assets written to: ./models/0038/assets


Epoch 39/50


INFO:tensorflow:Assets written to: ./models/0039/assets


Epoch 40/50


INFO:tensorflow:Assets written to: ./models/0040/assets


Epoch 41/50


INFO:tensorflow:Assets written to: ./models/0041/assets


Epoch 42/50


INFO:tensorflow:Assets written to: ./models/0042/assets


Epoch 43/50


INFO:tensorflow:Assets written to: ./models/0043/assets


Epoch 44/50


INFO:tensorflow:Assets written to: ./models/0044/assets


Epoch 45/50


INFO:tensorflow:Assets written to: ./models/0045/assets


Epoch 46/50


INFO:tensorflow:Assets written to: ./models/0046/assets


Epoch 47/50


INFO:tensorflow:Assets written to: ./models/0047/assets


Epoch 48/50


INFO:tensorflow:Assets written to: ./models/0048/assets


Epoch 49/50


INFO:tensorflow:Assets written to: ./models/0049/assets


Epoch 50/50


INFO:tensorflow:Assets written to: ./models/0050/assets




<keras.src.callbacks.History at 0x34f3bca60>

In [33]:
list_of_models= glob.glob(chckpt_path+"/*") # * means all if need specific format then *.csv
latest_model = max(list_of_models, key=os.path.getctime)
# print(latest_file)
# model = tf.keras.models.load_model(latest_model)
model = tf.keras.models.load_model("./models/0010/")

In [34]:
def compute_metrics(y_true, y_pred):
    y_pred[y_pred>=0.5]=1
    y_pred[y_pred<0.5]=0

    cm = confusion_matrix(y_true, y_pred)

    tn, fp, fn, tp = cm.ravel()

    tpr = round(tp / (tp+fn),4)
    fpr = round(tp / (fp+tn),4)
    tnr = round(tn / (tn+fp),4)
    fnr = round(fn / (fn+tp),4)
    acc = round((tp + tn) / (tp+fn+fp+tn),4)
    precision = round(tp / (tp + fp),4)
    f1 = round(2 * (precision * tpr) / (precision + tpr),4)
    
    return tpr, fpr, tnr, fnr, acc, precision, f1

In [37]:
# Val predictions
y_pred_val = model.predict((cat_val_selected, bin_val_selected, num_val_selected), batch_size=batch_size)
tpr_val, fpr_val, tnr_val, fnr_val, acc_val, precision_val, f1_val = compute_metrics(y_val_is_installed, y_pred_val)


# Val dumb predictions
y_pred_val_dumb = np.zeros(y_val_is_installed.shape)
tpr_val_dumb, fpr_val_dumb, tnr_val_dumb, fnr_val_dumb, acc_val_dumb, precision_val_dumb, f1_val_dumb = compute_metrics(y_val_is_installed, y_pred_val_dumb)

# Train predictions
y_pred_train = model.predict((cat_train_selected, bin_train_selected, num_train_selected), batch_size=batch_size)
tpr_train, fpr_train, tnr_train, fnr_train, acc_train, precision_train, f1_train = compute_metrics(y_train_is_installed, y_pred_train)

# Train dumb predictions
y_pred_train_dumb = np.zeros(y_train_is_installed.shape)
tpr_train_dumb, fpr_train_dumb, tnr_train_dumb, fnr_train_dumb, acc_train_dumb, precision_train_dumb, f1_train_dumb = compute_metrics(y_train_is_installed, y_pred_train_dumb)



  precision = round(tp / (tp + fp),4)




  precision = round(tp / (tp + fp),4)


In [40]:
tpr_val, fpr_val, tnr_val, fnr_val, acc_val, precision_val, f1_val, acc_val_dumb

(0.5679, 0.1193, 0.8507, 0.4321, 0.8016, 0.4441, 0.4984, 0.8265)

In [41]:
tpr_train, fpr_train, tnr_train, fnr_train, acc_train, precision_train, f1_train, acc_train_dumb

(0.6626, 0.1398, 0.8668, 0.3374, 0.8313, 0.5121, 0.5777, 0.8258)