### Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import os
import glob
import math

from shutil import rmtree

In [2]:
tf.random.set_seed(2023)
import random
random.seed(2023)
np.random.seed(2023)

In [3]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [4]:
tf.config.list_physical_devices('GPU')

[]

### Clean folder

In [5]:
chckpt_path = './models/'
results_path = './results/'
if os.path.exists(chckpt_path):
    rmtree(chckpt_path)
if os.path.exists(results_path):
    rmtree(results_path)

### Load data

In [6]:
path = "data/train/"
all_files = glob.glob(os.path.join(path, "*.csv"))

data = pd.concat((pd.read_csv(f, sep="\t") for f in all_files), ignore_index=True)

In [7]:
train_data, val_data = train_test_split(data, test_size=0.25)

In [8]:
test_data = pd.read_csv("data/test/000000000000.csv", sep="\t")

### Preprocess data

In [9]:
cat_train = train_data.iloc[:,2:33]
del cat_train["f_7"] #Only one value --> useless
bin_train = train_data.iloc[:,33:42]
num_train = train_data.iloc[:,42:80]
labels_train = train_data.iloc[:,80:82]

cat_val = val_data.iloc[:,2:33]
del cat_val["f_7"] #Only one value --> useless
bin_val = val_data.iloc[:,33:42]
num_val = val_data.iloc[:,42:80]
labels_val = val_data.iloc[:,80:82]

cat_test = test_data.iloc[:,2:33]
del cat_test["f_7"] #Only one value --> useless
bin_test = test_data.iloc[:,33:42]
num_test = test_data.iloc[:,42:80]

In [10]:
#num_train.max()

In [11]:
# Categorical variables
cat_train_selected = cat_train.to_numpy()
cat_val_selected = cat_val.to_numpy()
cat_test_selected = cat_test.to_numpy()

# Numerical variables : estimate missing values and normalize 
imputer = IterativeImputer(max_iter=10, random_state=0)
num_train_selected = imputer.fit_transform(num_train)
scaler = MinMaxScaler()
num_train_selected = scaler.fit_transform(num_train_selected)

num_val_selected = scaler.transform(imputer.transform(num_val))
num_test_selected = scaler.transform(imputer.transform(num_test))

# Binary variables
bin_train_selected = bin_train.to_numpy()
bin_val_selected = bin_val.to_numpy()
bin_test_selected = bin_test.to_numpy()

# Output variables
y_train = labels_train
y_val = labels_val
# y_is_clicked = y.iloc[:,0]
y_train_is_installed = y_train.iloc[:,1]
y_val_is_installed = y_val.iloc[:,1]

In [12]:
for col_ind in range(cat_train_selected.shape[1]):

    unique_values = np.unique(cat_train_selected[:, col_ind][~np.isnan(cat_train_selected[:,col_ind])]).astype(int)
    # test_unique_values = np.unique(cat_test_selected[:, col_ind]).astype(int)

    # Make categorical variables from 1 to n (n corresponding to the number of unique values for the corresponding categorical feature)
    replacement_dict = dict()
    for index, val in enumerate(unique_values):
        index+=1
        replacement_dict[val] = index

    # Process training data (categorical)
    for line_ind in range(len(cat_train_selected[:,col_ind])):
        if math.isnan(cat_train_selected[line_ind, col_ind]): # 0 used for missing values
            cat_train_selected[line_ind, col_ind] = 0
        else:
            cat_train_selected[line_ind, col_ind] = replacement_dict[int(cat_train_selected[line_ind, col_ind])] # Use the new value (from 1 to n)

    # Process validation data (categorical)
    for line_ind in range(len(cat_val_selected[:,col_ind])):
        try:
            if math.isnan(cat_val_selected[line_ind, col_ind]):
                cat_val_selected[line_ind, col_ind] = 0 # 0 used for missing values
            else:
                cat_val_selected[line_ind, col_ind] = replacement_dict[int(cat_val_selected[line_ind, col_ind])] # Use the new value (from 1 to n)
        except KeyError:
            cat_val_selected[line_ind, col_ind] = 0 # If the value was not in the training data, treat as a missing value (because we can't train on it)
    
    # Process test data (categorical)
    for line_ind in range(len(cat_test_selected[:,col_ind])):
        try:
            if math.isnan(cat_test_selected[line_ind, col_ind]):
                cat_test_selected[line_ind, col_ind] = 0 # 0 used for missing values
            else:
                cat_test_selected[line_ind, col_ind] = replacement_dict[int(cat_test_selected[line_ind, col_ind])] # Use the new value (from 1 to n)
        except KeyError:
            cat_test_selected[line_ind, col_ind] = 0 # If the value was not in the training data, treat as a missing value (because we can't train on it)

cat_train_selected = cat_train_selected.astype(int)
cat_val_selected = cat_val_selected.astype(int)
cat_test_selected = cat_test_selected.astype(int)

In [13]:
# Compute bias to help the model with imbalanced dataset
neg, pos = np.bincount(y_train_is_installed) 
total = neg + pos 
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format( total, pos, 100 * pos / total)) 
initial_bias = np.log([pos/neg]) 
print(initial_bias)

Examples:
    Total: 2614389
    Positive: 455360 (17.42% of total)

[-1.55632555]


In [14]:
# import tensorflow.keras.backend as K
# tp = tf.keras.metrics.TruePositives()
# fp = tf.keras.metrics.FalsePositives()

# def pos(y_true, y_pred):
#     fp = tf.keras.metrics.FalsePositives()
#     fp.update_state(bigy[test], pred)
#     fp = fp.result().numpy()

#     fn = tf.keras.metrics.FalseNegatives()
#     fn.update_state(bigy[test], pred)
#     fn = fn.result().numpy()

#     tn = tf.keras.metrics.TrueNegatives()
#     tn.update_state(bigy[test], pred)
#     tn = tn.result().numpy()

# def neg():
#     return tf.keras.metrics.TrueNegatives(y_true, y_pred) + tf.keras.metrics.FalseNegatives(y_true, y_pred)

# # def neg(y_true, y_pred):
# #     return K.sum(y_true[y_true==0], axis=0)

# def fpr(y_true, y_pred):
#     return K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0) #/ pos(y_true, y_pred)

# def fnr(y_true, y_pred):
#     return K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0) / neg(y_true, y_pred)

# def tpr(y_true, y_pred):
#     return K.sum(K.cast(y_true*y_pred, 'float'), axis=0) / pos(y_true, y_pred)

# def tnr(y_true, y_pred):
#     return K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0) / neg(y_true, y_pred)

In [36]:
embed_size = 256

# Inputs
cat_input_layer = layers.Input(shape=(cat_train_selected.shape[1],), dtype=tf.int64)
bin_input_layer = layers.Input(shape=(bin_train_selected.shape[1],), dtype=tf.int64)
num_input_layer = layers.Input(shape=(num_train_selected.shape[1],), dtype=tf.float64)

embedding_layers = []
for i in range(cat_train_selected.shape[1]):
    num_values = len(set(cat_train_selected[:,i]))
    if num_values <= embed_size:
        embedding_layers.append(layers.Embedding(input_dim=num_values+1, output_dim=num_values, input_length=1, mask_zero=True)(cat_input_layer[:,i]))
    else:
        embedding_layers.append(layers.Embedding(input_dim=num_values+1, output_dim=embed_size, input_length=1, mask_zero=True)(cat_input_layer[:,i]))

bin_dense_layer = layers.Dense(64, activation='relu')(bin_input_layer)
num_dense_layer = layers.Dense(64, activation='relu')(num_input_layer)

# Concat all inputs
concatted = tf.keras.layers.Concatenate()([bin_dense_layer, num_dense_layer, *embedding_layers])

# Hidden layers
hidden_layer_1 = layers.Dense(500, activation='relu')(concatted)
hidden_layer_2 = layers.Dense(250, activation='relu')(hidden_layer_1)
hidden_layer_3 = layers.Dense(50, activation='relu')(hidden_layer_2)
hidden_layer_4 = layers.Dense(100, activation='relu')(hidden_layer_3)
hidden_layer_5 = layers.Dense(40, activation='relu')(hidden_layer_4)

# Outputs
output_bias = tf.keras.initializers.Constant(initial_bias)

#output_1 = layers.Dense(1, activation='sigmoid', name="is_clicked", bias_initializer=output_bias)(hidden_layer_5) # If we want to predict "is_clicked", use this output (give two outputs to the model instead of one).
output_2 = layers.Dense(1, activation="sigmoid", name="is_installed", bias_initializer=output_bias)(hidden_layer_5)

# Create model
model = keras.Model(inputs=[cat_input_layer, bin_input_layer, num_input_layer], outputs=[output_2])
# model = keras.Model(inputs=[cat_input_layer, bin_input_layer, num_input_layer], outputs=[output_1, output_2]) # If we want to predict "is_clicked" and "is_installed"

# Compile model
batch_size = 5000 
learning_rate=0.001
optimizer = keras.optimizers.Adam(lr=learning_rate)

if len(model.outputs)>1:
    monitor_name = 'val_is_installed_loss'
else:
    monitor_name = "val_loss"
early_stopping = tf.keras.callbacks.EarlyStopping(monitor=monitor_name, patience=3)
mcp_save = tf.keras.callbacks.ModelCheckpoint(filepath= chckpt_path + '{epoch:04d}', save_best_only=True, save_weights_only=False, monitor='val_loss', mode='min', save_freq='epoch')
# mcp_save = tf.keras.callbacks.ModelCheckpoint(filepath= chckpt_path + '{epoch:04d}', save_best_only=False, save_weights_only=False, monitor='val_loss', mode='min', save_freq='epoch')

acc = tf.metrics.BinaryAccuracy(threshold=0.5)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy', acc])  



In [16]:
# tf.keras.utils.plot_model(model, "model.png")

In [37]:
model.fit((cat_train_selected, bin_train_selected, num_train_selected), y_train_is_installed, epochs = 50, batch_size=batch_size, 
          validation_data=((cat_val_selected, bin_val_selected, num_val_selected), y_val_is_installed),
          callbacks=[early_stopping, mcp_save])

# model.fit((cat_train_selected, bin_train_selected, num_train_selected), y_train_is_installed, epochs = 50, batch_size=batch_size, 
#           validation_data=((cat_val_selected, bin_val_selected, num_val_selected), y_val_is_installed),
#           callbacks=[mcp_save])

Epoch 1/50


INFO:tensorflow:Assets written to: ./models/0001/assets


Epoch 2/50


INFO:tensorflow:Assets written to: ./models/0002/assets


Epoch 3/50

KeyboardInterrupt: 

In [18]:
list_of_models= glob.glob(chckpt_path+"/*") # * means all if need specific format then *.csv
latest_model = max(list_of_models, key=os.path.getctime)
# print(latest_file)
model = tf.keras.models.load_model(latest_model)

In [19]:
model.evaluate((cat_val_selected, bin_val_selected, num_val_selected), np.expand_dims(y_val_is_installed.values.astype('float32'), axis=1), batch_size=batch_size)



[0.3177054226398468, 0.8645771145820618, 0.8645771145820618]

In [20]:
y_pred = model.predict((cat_val_selected, bin_val_selected, num_val_selected), batch_size=batch_size)



In [21]:
ba = tf.metrics.BinaryAccuracy()
ba(np.expand_dims(y_val_is_installed.values, axis=1),y_pred)

<tf.Tensor: shape=(), dtype=float32, numpy=0.82069117>

In [22]:
y_true = y_val_is_installed
y_pred = model.predict((cat_val_selected, bin_val_selected, num_val_selected), batch_size=batch_size)
y_pred[y_pred>=0.5]=1
y_pred[y_pred<0.5]=0
y_pred=np.squeeze(y_pred)
y_true=np.squeeze(y_true.values.astype('float32'))
np.sum(np.equal(y_pred,y_true))/len(y_pred)



0.8206911825286902

In [23]:
sum(y_true==0)/len(y_true)

0.8264504631866184

In [24]:
sum(y_train_is_installed==0)/len(y_train_is_installed)

0.8258254605569408

In [25]:
y_pred_train=model.predict((cat_train_selected,bin_train_selected,num_train_selected), batch_size=batch_size)
ba = tf.metrics.BinaryAccuracy()
ba(np.expand_dims(y_train_is_installed.values, axis=1),y_pred_train)



<tf.Tensor: shape=(), dtype=float32, numpy=0.82366127>

In [26]:
def compute_metrics(y_true, y_pred):
    y_pred[y_pred>=0.5]=1
    y_pred[y_pred<0.5]=0

    cm = confusion_matrix(y_true, y_pred)

    tn, fp, fn, tp = cm.ravel()

    tpr = round(tp / (tp+fn),4)
    fpr = round(tp / (fp+tn),4)
    tnr = round(tn / (tn+fp),4)
    fnr = round(fn / (fn+tp),4)
    acc = round((tp + tn) / (tp+fn+fp+tn),4)
    precision = round(tp / (tp + fp),4)
    f1 = round(2 * (precision * tpr) / (precision + tpr),4)
    
    return tpr, fpr, tnr, fnr, acc, precision, f1

In [27]:
from sklearn.metrics import confusion_matrix

# Val predictions
y_pred_val = model.predict((cat_val_selected, bin_val_selected, num_val_selected), batch_size=batch_size)
tpr_val, fpr_val, tnr_val, fnr_val, acc_val, precision_val, f1_val = compute_metrics(y_val_is_installed, y_pred_val)


# Val dumb predictions
y_pred_val_dumb = np.zeros(y_val_is_installed.shape)
tpr_val_dumb, fpr_val_dumb, tnr_val_dumb, fnr_val_dumb, acc_val_dumb, precision_val_dumb, f1_val_dumb = compute_metrics(y_val_is_installed, y_pred_val_dumb)

# Train predictions
y_pred_train = model.predict((cat_train_selected, bin_train_selected, num_train_selected), batch_size=batch_size)
tpr_train, fpr_train, tnr_train, fnr_train, acc_train, precision_train, f1_train = compute_metrics(y_train_is_installed, y_pred_train)

# Train dumb predictions
y_pred_train_dumb = np.zeros(y_train_is_installed.shape)
tpr_train_dumb, fpr_train_dumb, tnr_train_dumb, fnr_train_dumb, acc_train_dumb, precision_train_dumb, f1_train_dumb = compute_metrics(y_train_is_installed, y_pred_train_dumb)

# FP = cm.sum(axis=0) - np.diag(cm)  
# FN = cm.sum(axis=1) - np.diag(cm)
# TP = np.diag(cm)
# TN = cm.sum() - (FP + FN + TP)

# # Sensitivity, hit rate, recall, or true positive rate
# TPR = TP/(TP+FN)
# # Specificity or true negative rate
# TNR = TN/(TN+FP) 
# # Precision or positive predictive value
# PPV = TP/(TP+FP)
# # Negative predictive value
# NPV = TN/(TN+FN)
# # Fall out or false positive rate
# FPR = FP/(FP+TN)
# # False negative rate
# FNR = FN/(TP+FN)
# # False discovery rate
# FDR = FP/(TP+FP)

# # Overall accuracy
# ACC = (TP+TN)/(TP+FP+FN+TN)



  precision = round(tp / (tp + fp),4)




  precision = round(tp / (tp + fp),4)


In [32]:
tpr_val, fpr_val, tnr_val, fnr_val, acc_val, precision_val, f1_val

(0.5324, 0.1118, 0.8812, 0.4676, 0.8207, 0.4849, 0.5075)

In [34]:
tpr_train, fpr_train, tnr_train, fnr_train, acc_train, precision_train, f1_train, acc_train_dumb

(0.5425, 0.1144, 0.883, 0.4575, 0.8237, 0.4943, 0.5173, 0.8258)

In [33]:
tpr_val_dumb, tnr_val_dumb, acc_val_dumb, precision_val_dumb, f1_val_dumb

(0.0, 1.0, 0.8265, nan, nan)

In [29]:
print(tpr_val, fnr_val, tnr_val, fpr_val, acc_val)
print(tpr_val_dumb, fnr_val_dumb, tnr_val_dumb, fpr_val_dumb, acc_val_dumb)
print(tpr_train, fnr_train, tnr_train, fpr_train, acc_train)
print(tpr_train_dumb, fnr_train_dumb, tnr_train_dumb, fpr_train_dumb, acc_train_dumb)

0.5324 0.4676 0.8812 0.1118 0.8207
0.0 1.0 1.0 0.0 0.8265
0.5425 0.4575 0.883 0.1144 0.8237
0.0 1.0 1.0 0.0 0.8258
