In [1]:
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, DenseFeatures, Embedding, LSTM, concatenate, Conv1D, Flatten, Dropout
from tensorflow.keras.layers.experimental import preprocessing
import tensorflow as tf
import pickle

# Load data

In [2]:
filepath_train =  r'C:\Users\piotr\OneDrive\Pulpit\Julia\D80M\D80M.tsv'

Functions to save and load objects

In [3]:
def save_enc(enc, name ):
    with open('AdClick_obj/'+ name + '.pkl', 'wb+') as f:     
        pickle.dump({'config': enc.get_config(),
             'weights': enc.get_weights()}
            , f)

In [4]:
def load_enc( name ):
    with open('AdClick_obj/'+ name + '.pkl', 'rb') as f:     
        load_enc = pickle.load(f)
    new_vec = preprocessing.TextVectorization.from_config(load_enc['config'])
    new_vec.set_weights(load_enc['weights'])
    return new_vec

In [5]:
def load_obj(name ):
    with open('AdClick_obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

Load list of boundaries that will be used to bucketize data from the DisplayURL column. I have determined them in a different script with the code: seq=np.linspace(min(X_train["DisplayURL"]), max(X_train["DisplayURL"]), 12000), where X_train was the train set turned into a pandas dataframe

In [6]:
seq = load_obj('DisplayURLbucketboundaries')

Load data into tensorflow dataset in batches using make_csv_dataset, add numerical column pos_per_depth and divide the dataset into train and holdout set.

In [7]:
def add_col(X, y):
    pos_per_depth = (X['Depth'] - X['Position']) / X['Depth']
    X.update({'pos_per_depth': pos_per_depth})
    return (X, y)

In [8]:
def get_dataset(file_path, batch_size, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=batch_size,
        label_name='Click',
        num_epochs=1,
        field_delim='\t',
        shuffle=False,
        ignore_errors=True,
        **kwargs)
    new_ds = dataset.map(add_col)
    return new_ds

I use only 4 epochs because tensorflow datasets gave me some unexpected problems, so ultimately once the code was working I didn't have the time to learn the neural net for more epochs. If I had more time I would add way more epochs (ca 100)

In [9]:
NUM_EPOCHS = 4
DATASET_SIZE = 80000000
BATCH_SIZE = 256*4
train_size = int(0.98 * DATASET_SIZE / BATCH_SIZE)
SELECT_COLUMNS = ['Gender', 'Position', 'Depth', 'Age', 'AdvertiserId', 'AdId', 'DisplayURL', 'AdKeyword_tokens', 'AdDescription_tokens',
                  'AdTitle_tokens', 'Query_tokens', 'Click']

full_dataset = get_dataset(filepath_train, batch_size=BATCH_SIZE, select_columns=SELECT_COLUMNS)
#full_dataset = full_dataset.shuffle(reshuffle_each_iteration=False, buffer_size=DATASET_SIZE // BATCH_SIZE)
train_dataset = full_dataset.take(train_size)
val_dataset = full_dataset.skip(train_size)

got into make_csv_dataset_v2
None
Click
['Click', 'DisplayURL', 'AdId', 'AdvertiserId', 'Depth', 'Position', 'Gender', 'Age', 'AdKeyword_tokens', 'AdTitle_tokens', 'AdDescription_tokens', 'Query_tokens']


In order to make the multiple input model work with the data read using csv_make_dataset I had to first map each column to a dataset containing only this column and then zip them

In [10]:
#Text columns
adkeyword_train_ds = train_dataset.map(lambda x, y: x['AdKeyword_tokens'])
adkeyword_val_ds = val_dataset.map(lambda x, y: x['AdKeyword_tokens'])
addescription_train_ds = train_dataset.map(lambda x, y: x['AdDescription_tokens'])
addescription_val_ds = val_dataset.map(lambda x, y: x['AdDescription_tokens'])
adtitle_train_ds = train_dataset.map(lambda x, y: x['AdTitle_tokens'])
adtitle_val_ds = val_dataset.map(lambda x, y: x['AdTitle_tokens'])
query_train_ds = train_dataset.map(lambda x, y: x['Query_tokens'])
query_val_ds = val_dataset.map(lambda x, y: x['Query_tokens'])

#Categorical Variable columns
gender_train_ds = train_dataset.map(lambda x, y: x['Gender'])
gender_val_ds = val_dataset.map(lambda x, y: x['Gender'])
age_train_ds = train_dataset.map(lambda x, y: x['Age'])
age_val_ds = val_dataset.map(lambda x, y: x['Age'])
depth_train_ds = train_dataset.map(lambda x, y: x['Depth'])
depth_val_ds = val_dataset.map(lambda x, y: x['Depth'], )
position_train_ds = train_dataset.map(lambda x, y: x['Position'])
position_val_ds = val_dataset.map(lambda x, y: x['Position'])
AdvertiserId_train_ds = train_dataset.map(lambda x, y: x['AdvertiserId'])
AdvertiserId_val_ds = val_dataset.map(lambda x, y: x['AdvertiserId'])
AdId_train_ds = train_dataset.map(lambda x, y: x['AdId'])
AdId_val_ds = val_dataset.map(lambda x, y: x['AdId'])
url_train_ds = train_dataset.map(lambda x, y: x['DisplayURL'])
url_val_ds = val_dataset.map(lambda x, y: x['DisplayURL'])

#Numerical Columns
pospdepth_train_ds = train_dataset.map(lambda x, y: x['pos_per_depth'])
pospdepth_val_ds = val_dataset.map(lambda x, y: x['pos_per_depth'])

#Label column
y_train_ds = train_dataset.map(lambda x, y: y)
y_val_ds = val_dataset.map(lambda x, y: y)

# Create Preprocessing Layers

One hot encode categorical variables that only take a few values

In [11]:
CATEGORIES = {
    'Gender': [0, 1, 2],
    'Position': [1, 2, 3],
    'Age': [1, 2, 3, 4, 5, 6],
    'Depth': [1,2 ,3]
}

In [12]:
categorical_columns = []
for feature, vocab in CATEGORIES.items():
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=feature, vocabulary_list=vocab)
    categorical_columns.append(tf.feature_column.indicator_column(cat_col))

Use hashing for categorical variables that take a lot of values to decrease the number of columns + account for values that are present in the test set but not in the train set

In [13]:
HASH_CATEGORIES = {'AdId':250000, 'AdvertiserId':8000}

In [14]:
for feature, hash_bucket_size in HASH_CATEGORIES.items():
    cat_hashed = tf.feature_column.categorical_column_with_hash_bucket(
      feature, hash_bucket_size=hash_bucket_size, dtype=tf.int64)
    categorical_columns.append(tf.feature_column.indicator_column(cat_hashed))

I had trouble hashing DisplayURL because categorical_column_with_hash_bucket only accepts integer or string columns, so I decided to bucketize it instead.

In [15]:
buckets =seq.tolist()

In [16]:
url = tf.feature_column.numeric_column('DisplayURL')
cat_bucketed=tf.feature_column.bucketized_column(
    url, buckets)
categorical_columns.append(tf.feature_column.indicator_column(cat_bucketed))

Now I create the preprocessing layer for the columns containing tokens. I tokenize them using TextVectorization and then embed them

In [17]:
def my_split(input):
    return tf.strings.split(input, sep='|')

In [18]:
#checked the maximum number of tokens per record in a different script
word_count_keyword= 15
word_count_description = 50
word_count_title = 30
word_count_query = 128

# Zmniejszyc do 1000
VOCAB_SIZE = 10000

In [19]:
def vectorize_ds(text_ds, word_count):
    encoder = preprocessing.TextVectorization(split=my_split, output_mode="int", max_tokens=VOCAB_SIZE, output_sequence_length=word_count)
    encoder.adapt(text_ds)
    return encoder

In [20]:
encoder_keyword = vectorize_ds(adkeyword_train_ds, word_count_keyword)
encoder_description = vectorize_ds(addescription_train_ds, word_count_description)
encoder_title= vectorize_ds(adtitle_train_ds, word_count_title)
encoder_query = vectorize_ds(query_train_ds, word_count_query)

In [21]:
def create_text_preprocessing_layers(name, encoder):
    text_input = Input(shape=(None,), name=name, dtype=tf.string)
    text_features = encoder(text_input)
    text_features = Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        mask_zero=True)(text_features)

# #     Zmienic to na flatten bez convow
#     text_features=Conv1D(64, 3, padding='same')(text_features)
#     text_features=Conv1D(64, 3, padding='same')(text_features)
#     text_features=Conv1D(64, 3, padding='same')(text_features)
    text_features = Flatten()(text_features)
    text_features = tf.keras.Model(inputs=text_input, outputs=text_features)
#     filepath = 'AdClick_obj/'+ name
#     text_features.save(filepath, save_format="tf")
    return text_features

# Build model

Now I build a multiple input model. String variables go through the text preprocessing layers, while categorical variables go through the preprocessing layer (one hot encoding/ hashing). The layers are concatenated and then I just use Dense layers and a few Dropout layers to avoid overfitting.

In [22]:
def build_model():
    keywords_model = create_text_preprocessing_layers("Adkeyword", encoder_keyword)
    description_model = create_text_preprocessing_layers("Addescription", encoder_description)
    title_model = create_text_preprocessing_layers("Adtitle", encoder_title)
    query_model = create_text_preprocessing_layers("Query", encoder_query)

    
    feature_layer_inputs = {}
    for header in ['Gender', 'Position', 'Age', 'Depth', 'AdvertiserId', 'AdId', 'DisplayURL']:
        feature_layer_inputs[header] = Input(shape=(1,), name=header, dtype=tf.int64)

    cat_features = DenseFeatures(categorical_columns)(feature_layer_inputs)
    cat_features = tf.keras.Model(inputs=feature_layer_inputs, outputs=cat_features)

    combined = concatenate([keywords_model.output, description_model.output, title_model.output, query_model.output,
                            cat_features.output])


    x = Dense(128, activation ='relu')(combined)
    x = Dropout(0.5)(x)
    x = Dense(64)(x)
    x = Dropout(0.5)(x)
#     x = Dense(32, activation = 'relu')(combined)
    x = Dense(10, activation = 'relu')(x)
    x = Dense(1, activation='sigmoid')(x)

    model = tf.keras.Model(
        inputs=[keywords_model.input, description_model.input, title_model.input, query_model.input, feature_layer_inputs],
        outputs=x)

    model.compile(loss=tf.keras.losses.binary_crossentropy,
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=['AUC'])
    model.summary()
    return model


In [23]:
model = build_model()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Adkeyword (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
Addescription (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Adtitle (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
Query (InputLayer)              [(None, None)]       0                                            
____________________________________________________________________________________________

In [24]:
#tf.keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True)

In [25]:
# The early stopping callback isn't important anymore because I set the number of epochs to 4
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
        # Stop training when `val_loss` is no longer improving
        monitor="val_auc",
        min_delta=1e-5,
        mode='max',
        patience=5,
        verbose=1,
        restore_best_weights=True)

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    r"C:\Users\piotr\OneDrive\Pulpit\Julia\saved_models\weights2.{epoch:02d}-{val_auc:.2f}",
    monitor="val_auc",
    save_best_only=True,
    verbose=1,
    mode="max"
    
)

# Fit model and make predictions on test set

In [26]:
data_in = tf.data.Dataset.zip((adkeyword_train_ds, addescription_train_ds, adtitle_train_ds, query_train_ds, gender_train_ds, position_train_ds, age_train_ds, depth_train_ds, AdvertiserId_train_ds, AdId_train_ds, url_train_ds))
dataset = tf.data.Dataset.zip((data_in, y_train_ds))

In [27]:
data_val = tf.data.Dataset.zip((adkeyword_val_ds, addescription_val_ds, adtitle_val_ds, query_val_ds, gender_val_ds, position_val_ds, age_val_ds, depth_val_ds, AdvertiserId_val_ds, AdId_val_ds, url_val_ds))
dataset_val = tf.data.Dataset.zip((data_val, y_val_ds))

In [28]:
print("Num GPUs Available", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available 1


In [29]:
model.fit(dataset,
          epochs=NUM_EPOCHS,
          validation_data = dataset_val,
          callbacks=[early_stopping_callback, model_checkpoint_callback])

Epoch 1/4

Epoch 00001: val_auc improved from -inf to 0.72188, saving model to C:\Users\piotr\OneDrive\Pulpit\Julia\saved_models\weights2.01-0.72
INFO:tensorflow:Assets written to: C:\Users\piotr\OneDrive\Pulpit\Julia\saved_models\weights2.01-0.72\assets
Epoch 2/4

Epoch 00002: val_auc improved from 0.72188 to 0.72335, saving model to C:\Users\piotr\OneDrive\Pulpit\Julia\saved_models\weights2.02-0.72
INFO:tensorflow:Assets written to: C:\Users\piotr\OneDrive\Pulpit\Julia\saved_models\weights2.02-0.72\assets
Epoch 3/4

Epoch 00003: val_auc did not improve from 0.72335
Epoch 4/4

Epoch 00004: val_auc did not improve from 0.72335


<tensorflow.python.keras.callbacks.History at 0x246d8406100>

In [30]:
filepath_test =  r'C:\Users\piotr\OneDrive\Pulpit\Julia\D5M_test_x\D5M_test_x.tsv'

I think shuffling is turned off :) I had to add column_defaults here because tensorflow would recognize some token columns as int instead of string

In [31]:
test_dataset = get_dataset(filepath_test, batch_size=BATCH_SIZE, select_columns=SELECT_COLUMNS, column_defaults = [tf.int32, tf.float32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.string, tf.string, tf.string, tf.string])

got into make_csv_dataset_v2
None
Click
['Click', 'DisplayURL', 'AdId', 'AdvertiserId', 'Depth', 'Position', 'Gender', 'Age', 'AdKeyword_tokens', 'AdTitle_tokens', 'AdDescription_tokens', 'Query_tokens']


In [32]:
test_dataset

<MapDataset shapes: (OrderedDict([(DisplayURL, (None,)), (AdId, (None,)), (AdvertiserId, (None,)), (Depth, (None,)), (Position, (None,)), (Gender, (None,)), (Age, (None,)), (AdKeyword_tokens, (None,)), (AdTitle_tokens, (None,)), (AdDescription_tokens, (None,)), (Query_tokens, (None,)), (pos_per_depth, (None,))]), (None,)), types: (OrderedDict([(DisplayURL, tf.float32), (AdId, tf.int32), (AdvertiserId, tf.int32), (Depth, tf.int32), (Position, tf.int32), (Gender, tf.int32), (Age, tf.int32), (AdKeyword_tokens, tf.string), (AdTitle_tokens, tf.string), (AdDescription_tokens, tf.string), (Query_tokens, tf.string), (pos_per_depth, tf.float64)]), tf.int32)>

In [33]:
adkeyword_test_ds = test_dataset.map(lambda x, y: x['AdKeyword_tokens'])
addescription_test_ds = test_dataset.map(lambda x, y: x['AdDescription_tokens'])
adtitle_test_ds = test_dataset.map(lambda x, y: x['AdTitle_tokens'])
query_test_ds = test_dataset.map(lambda x, y: x['Query_tokens'])

#Categorical Variable columns
gender_test_ds = test_dataset.map(lambda x, y: x['Gender'])
age_test_ds = test_dataset.map(lambda x, y: x['Age'])
depth_test_ds = test_dataset.map(lambda x, y: x['Depth'])
position_test_ds = test_dataset.map(lambda x, y: x['Position'])
AdvertiserId_test_ds = test_dataset.map(lambda x, y: x['AdvertiserId'])
AdId_test_ds = test_dataset.map(lambda x, y: x['AdId'])
url_test_ds = test_dataset.map(lambda x, y: x['DisplayURL'])

#Numerical Columns
pospdepth_test_ds = test_dataset.map(lambda x, y: x['pos_per_depth'])

#Label column
y_test_ds = test_dataset.map(lambda x, y: y)

In [34]:
data_test = tf.data.Dataset.zip((adkeyword_test_ds, addescription_test_ds, adtitle_test_ds, query_test_ds, gender_test_ds, position_test_ds, age_test_ds, depth_test_ds, AdvertiserId_test_ds, AdId_test_ds, url_test_ds))
dataset_test = tf.data.Dataset.zip((data_test, y_test_ds))

In [35]:
score = model.predict(dataset_test)


In [36]:
with open('test_predicitons_1702', 'a') as file:
    for item in score:
        file.write("%s\n" % item[0])