In [None]:
# First attempt at building a recommendation system model.


"""imports"""

import pandas as pd
import numpy as np
import re
import time
from logger import logger
import os
import string

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from keras import models, layers, utils, Model, optimizers, activations, callbacks, losses, metrics
from keras import backend as K

# Show every column of a DataFrame when it is displayed (None = no column limit).
pd.options.display.max_columns = None
# The row limit is left at the pandas default; uncomment to lift it as well.
# pd.options.display.max_rows = None


In [None]:
"""load"""

# Folder that contains the three BX-CSV-Dump files (note the trailing slash).
base_path = r"F:/large_data/BX-CSV-Dump/"

# Ratings and users are read as plain ";"-separated tables.
rating_data = pd.read_csv(f"{base_path}BX-Book-Ratings.csv", sep=";")
user_data = pd.read_csv(f"{base_path}BX-Users.csv", sep=";")

# The books file is split on the multi-character separator `";"` with the python engine.
# With that separator the first header is read as `"ISBN` (leading quote included),
# which is why the column is requested under that name here.
books_data = pd.read_csv(
    f"{base_path}BX-Books.csv",
    sep=r'";"',
    engine="python",
    low_memory=True,
    usecols=['"ISBN', "Book-Title", "Book-Author", "Year-Of-Publication", "Publisher"],
)

# show some data
# print(rating_data)
# print(rating_data.describe())
# print(user_data)
# print(user_data.describe())
# print(books_data)
# print(books_data.describe())


"""pre-processing of data"""


# merge
# rating_data = rating_data.merge(books_data, how="left", left_on="ISBN", right_index=True)
# print(rating_data)


"""books_data processing"""

# Clean the "Year-Of-Publication" column.
def _parse_publication_year(value):
    """Return ``value`` as an int when it is a 4-digit year, otherwise 0.

    The value is accepted only if its text form consists of exactly four ASCII digits.
    Counting "non-letter" characters instead (the previous check) let strings such as
    "Simon & Schuster Inc" through to ``int()``, which then raised ValueError.
    """
    # Integral floats (e.g. 2002.0, which appear when pandas infers a float column)
    # are reduced to their integer form first so that they are not rejected.
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    text = str(value).strip()
    return int(text) if re.fullmatch(r"[0-9]{4}", text) else 0


# Missing years become 0, the same marker that is used for unparsable values.
books_data["Year-Of-Publication"] = books_data["Year-Of-Publication"].fillna(0)
books_data["Year-Of-Publication"] = books_data["Year-Of-Publication"]\
    .apply(_parse_publication_year).astype("int64")


# The first column was read under the header `"ISBN`; give it the clean name ISBN.
books_data = books_data.rename(columns={'"ISBN': "ISBN"})

# Drop the excess first character of every ISBN value.
books_data["ISBN"] = books_data["ISBN"].map(lambda raw_isbn: str(raw_isbn)[1:])

# Use the row index of books_data as the numeric id of each book.
books_data["ISBN-Encoded"] = books_data.index.astype("int64")


# Missing authors are replaced by the literal string "None".
books_data["Book-Author"] = books_data["Book-Author"].fillna("None")

# Number the distinct authors in the order returned by unique() and keep both lookup directions.
unique_authors = books_data["Book-Author"].unique()
number_to_author = dict(enumerate(unique_authors))
author_to_number = {author: number for number, author in number_to_author.items()}
del unique_authors

# Store the numeric author id next to the author name.
books_data["Book-Author-Encoded"] = books_data["Book-Author"].map(author_to_number).astype("int64")
# print(books_data["Book-Author-Encoded"])


# Missing publishers are replaced by the literal string "None".
books_data["Publisher"] = books_data["Publisher"].fillna("None")

# Number the distinct publishers in the order returned by unique() and keep both lookup directions.
unique_publisher = books_data["Publisher"].unique()
number_to_publisher = dict(enumerate(unique_publisher))
publisher_to_number = {publisher: number for number, publisher in number_to_publisher.items()}

# Store the numeric publisher id next to the publisher name.
books_data["Publisher-Encoded"] = books_data["Publisher"].map(publisher_to_number).astype("int64")
# print(books_data["Publisher-Encoded"])


# Missing titles are replaced by the literal string "None".
books_data["Book-Title"] = books_data["Book-Title"].fillna("None")

# Character vocabulary: the lowercase ASCII letters plus the space, numbered from 1
# (0 is the value used for padding below).
number_to_symbol = dict(enumerate(string.ascii_lowercase + " ", start=1))
symbol_to_number = {symbol: number for number, symbol in number_to_symbol.items()}

# Lowercase each title, remove everything except a-z and spaces, strip the ends,
# and translate the remaining characters to their numbers.
books_data["Book-Title-Encoded"] = books_data["Book-Title"].map(
    lambda title: [symbol_to_number[char] for char in re.sub(r"[^a-z ]", "", title.lower()).strip()]
)

# Pad all encoded titles with 0 to one common length.
books_data["Book-Title-Encoded"] = list(
    utils.pad_sequences(sequences=books_data["Book-Title-Encoded"], value=0)
)
# print(books_data["Book-Title-Encoded"])


# show all processed books_data in one
# print(books_data[["Year-Of-Publication", "ISBN-Encoded", "Book-Author-Encoded", "Publisher-Encoded", "Book-Title-Encoded"]])
# print(books_data[["Year-Of-Publication", "ISBN-Encoded", "Book-Author-Encoded", "Publisher-Encoded", "Book-Title-Encoded"]].describe())


"""user_data processing"""

# Unknown ages are stored as 0.
user_data["Age"] = user_data["Age"].fillna(0)

# Min-max scale the ages (MinMaxScaler with its default feature range) and store them as a flat column.
age_scaler = preprocessing.MinMaxScaler()
age_values = np.asarray(user_data["Age"]).reshape(-1, 1)
user_data["Age-Encoded"] = age_scaler.fit_transform(age_values).ravel()
# print(user_data["Age-Encoded"])


# Users without a location get the placeholder "n/a".
user_data["Location"] = user_data["Location"].fillna("n/a")

# Lowercase and strip every location string, then split it into its ", "-separated parts.
location_parts = user_data["Location"].map(lambda raw: raw.lower().strip().split(", "))

# Collect every part of every user into one flat array and reduce it to the unique values.
all_parts = np.asarray([part for parts in location_parts for part in parts])
unique_parts = np.unique(all_parts.reshape(-1,).astype("str"), axis=-1)
# "blank_value" is appended and the array reversed, so the blank value ends up at position 0.
user_data_unique_locations = np.append(unique_parts, ["blank_value"], axis=-1)[::-1]

# Lookup tables in both directions (position in the array <-> location part).
number_to_location = dict(enumerate(user_data_unique_locations))
location_to_number = {location: number for number, location in number_to_location.items()}
# del user_data_unique_locations

# Translate the parts of every user to their numbers.
user_data["Location-Encoded"] = location_parts.map(
    lambda parts: [location_to_number[part] for part in parts]
)
# Pad all encoded locations with the blank value to one common length.
user_data["Location-Encoded"] = list(
    utils.pad_sequences(
        sequences=user_data["Location-Encoded"],
        value=location_to_number["blank_value"],
    )
)


# show all processed user_data in one
# print(user_data[["Age-Encoded", "Location-Encoded"]])
# print(user_data[["Age-Encoded", "Location-Encoded"]].describe())


"""rating_data processing and merging it all together"""

# Min-max scale the book ratings to the range [0, 1].
rating_scaler = preprocessing.MinMaxScaler(feature_range=(0., 1.))
rating_values = np.asarray(rating_data["Book-Rating"]).reshape(-1, 1)
rating_data["Book-Rating-Encoded"] = rating_scaler.fit_transform(rating_values).reshape(-1,)
# print(rating_data["Book-Rating-Encoded"])


# Give every column an explicit dtype before the frames are merged.
# rating_data
rating_data["ISBN"] = rating_data["ISBN"].astype("str")
rating_data["User-ID"] = rating_data["User-ID"].astype("int64")
rating_data["Book-Rating"] = rating_data["Book-Rating"].astype("float64")
rating_data["Book-Rating-Encoded"] = rating_data["Book-Rating-Encoded"].astype("float64")
# user_data
user_data["User-ID"] = user_data["User-ID"].astype("int64")
user_data["Location"] = user_data["Location"].astype("str")
user_data["Age"] = user_data["Age"].astype("float64")
user_data["Age-Encoded"] = user_data["Age-Encoded"].astype("float64")
user_data["Location-Encoded"] = user_data["Location-Encoded"].astype("object")
# books_data
books_data["ISBN"] = books_data["ISBN"].astype("str")
books_data["Book-Title"] = books_data["Book-Title"].astype("str")
books_data["Book-Author"] = books_data["Book-Author"].astype("str")
books_data["Publisher"] = books_data["Publisher"].astype("str")

# Attach the user columns (by User-ID) and then the book columns (by ISBN) to every rating.
# Both are left joins, so ratings without a matching user or book get NaNs in the new columns.
rating_data = rating_data.merge(user_data, how="left", on="User-ID")
rating_data = rating_data.merge(books_data, how="left", on="ISBN")

# The separate frames are no longer needed.
del user_data, books_data

# Keep only the rows that have no NaN in any column.
complete_rows = rating_data.notnull().all(axis=1)
rating_data = rating_data.loc[complete_rows]



In [None]:
"""final data"""

# Keep only the rows whose scaled rating is at least 0.1.
has_rating = rating_data["Book-Rating-Encoded"] >= 0.1
rating_data = rating_data.loc[has_rating]

# Keep only the books that have 20 or more rows left ...
rows_per_book = rating_data.groupby("ISBN-Encoded")["ISBN-Encoded"].transform("size")
rating_data = rating_data.loc[rows_per_book >= 20]
# ... and then only the users that have 20 or more rows left (counted after the book filter).
rows_per_user = rating_data.groupby("User-ID")["User-ID"].transform("size")
rating_data = rating_data.loc[rows_per_user >= 20]

# # take part of data with only threshold_value users and threshold_value books
# threshold_value = 50000
# rating_data = rating_data.loc[rating_data['ISBN-Encoded'] <= threshold_value]
# rating_data = rating_data.loc[rating_data['User-ID'] <= threshold_value]


# Shuffle all rows and renumber the index from 0.
rating_data = rating_data.sample(frac=1)
rating_data = rating_data.reset_index(drop=True)

# Show the final frame and its summary statistics.
print(rating_data)
print(rating_data.describe())

# # take and show only useful for work of model data
# useful_data = rating_data[["User-ID", "ISBN-Encoded", "Book-Rating-Encoded", "Age-Encoded", "Location-Encoded",
#                            "Year-Of-Publication", "Book-Author-Encoded", "Publisher-Encoded", "Book-Title-Encoded"]]
# print(useful_data)




In [None]:
"""collaborative filtering recommendation system model creation"""


# # # model params

# # invariable model params from data
# Every "*_count" below is used as `input_dim` of an Embedding layer, so it has to be
# strictly larger than the biggest id that is fed to that layer. The model is fed the raw
# ids (User-ID, ISBN-Encoded, Year-Of-Publication, author / publisher codes), which are not
# contiguous once rows have been filtered out, so "number of unique values + 1" is too
# small and leads to out-of-range embedding lookups. The sizes are therefore derived from
# the largest id that is present.

# embedding size for books: largest ISBN-Encoded id + 1
product_count = int(rating_data["ISBN-Encoded"].max()) + 1
# embedding size for users: largest User-ID + 1
users_count = int(rating_data["User-ID"].max()) + 1
# user location shape (all rows were padded to one length, so the first row is representative)
location_shape = np.asarray(rating_data["Location-Encoded"].iloc[0]).shape
# number of unique location parts (ids are 0 .. len - 1; the + 1 only leaves one spare row)
location_count = len(location_to_number) + 1
# embedding size for publication years: largest year value + 1 (the year itself is the id)
years_count = int(rating_data["Year-Of-Publication"].max()) + 1
# embedding size for authors: largest author code + 1
author_count = int(rating_data["Book-Author-Encoded"].max()) + 1
# embedding size for publishers: largest publisher code + 1
publisher_count = int(rating_data["Publisher-Encoded"].max()) + 1
# book title shape (all rows were padded to one length, so the first row is representative)
title_shape = np.asarray(rating_data["Book-Title-Encoded"].iloc[0]).shape
# book title unique chars count: char ids start at 1, id 0 is the padding value
title_chars_count = len(symbol_to_number) + 1

# # variable model params (values chosen by hand)

# units of the dense layer applied to the scaled user age
age_dims = 1

# embedding widths of the two id inputs
user_embedding = 16
product_embedding = 16

# embedding widths of the context inputs
location_embedding = 8
author_embedding = 4
publisher_embedding = 4
year_embedding = 2
title_embedding = 2

# units of the (currently commented-out) dense layer over the merged metadata
metadata_dense_dim = 1
# units of the dense layer applied to all merged data
final_dense_dim = 16

# report the data-derived sizes ...
print(f"product_count: {product_count} ; users_count: {users_count} ; location_shape: {location_shape} ; "
      f"location_count: {location_count} ; years_count: {years_count}; author_count: {author_count} ; "
      f"publisher_count: {publisher_count} ; title_shape: {title_shape} ; title_chars_count: {title_chars_count}")

# ... and the hand-chosen ones
print(f"age_dims: {age_dims} ; year_embedding: {year_embedding} ; user_embedding: {user_embedding} ; "
      f"product_embedding: {product_embedding} ; "
      f"location_embedding: {location_embedding} ; author_embedding: {author_embedding} ; "
      f"publisher_embedding: {publisher_embedding} ; title_embedding: {title_embedding} ; "
      f"final_dense_dim: {final_dense_dim}")


# # # user-product part

# # inputs: one integer user id and one integer product id per sample
user_id_input = layers.Input(shape=(1,), dtype="int64", name="user_id_input")
product_id_input = layers.Input(shape=(1,), dtype="int64", name="product_id_input")

# # Matrix Factorization branch: embed both ids and take the normalized dot product

# user embedding, reshaped from (1, user_embedding) to (user_embedding,)
mf_user_x = layers.Embedding(
    name="mf_user_embedding",
    input_dim=users_count,
    output_dim=user_embedding,
    input_length=1,
)(user_id_input)
mf_user_x = layers.Reshape(name="mf_user_reshape", target_shape=(user_embedding,))(mf_user_x)

# product embedding, reshaped from (1, product_embedding) to (product_embedding,)
mf_product_x = layers.Embedding(
    name="mf_product_embedding",
    input_dim=product_count,
    output_dim=product_embedding,
    input_length=1,
)(product_id_input)
mf_product_x = layers.Reshape(name="mf_product_reshape", target_shape=(product_embedding,))(mf_product_x)

# dot product of the two L2-normalized vectors (normalize=True)
mf_output = layers.Dot(name='mf_dot_output', axes=-1, normalize=True)([mf_user_x, mf_product_x])

# # Neural Network branch: separate embeddings of the same ids, concatenated and fed to a dense layer

# user embedding and reshape
nn_user_x = layers.Embedding(
    name="nn_user_embedding",
    input_dim=users_count,
    output_dim=user_embedding,
    input_length=1,
)(user_id_input)
nn_user_x = layers.Reshape(name="nn_user_reshape", target_shape=(user_embedding,))(nn_user_x)

# product embedding and reshape
nn_product_x = layers.Embedding(
    name="nn_product_embedding",
    input_dim=product_count,
    output_dim=product_embedding,
    input_length=1,
)(product_id_input)
nn_product_x = layers.Reshape(name="nn_product_reshape", target_shape=(product_embedding,))(nn_product_x)

# concatenate both vectors and reduce them with a dense layer
nn_x = layers.Concatenate(name="nn_concatenate", axis=-1)([nn_user_x, nn_product_x])
nn_output = layers.Dense(
    name="nn_output",
    units=product_embedding // 2,
    activation="selu",
    kernel_initializer="lecun_normal",
)(nn_x)


# # # context data part

# # age branch: the scaled age goes through a single dense layer

age_input = layers.Input(shape=(1,), dtype="float32", name="user_age_input")
age_output = layers.Dense(
    name="user_age_output",
    units=age_dims,
    activation="selu",
    kernel_initializer="lecun_normal",
)(age_input)

# # location branch: padded sequence of location-part ids

location_input = layers.Input(shape=location_shape, dtype="int64", name="user_location_input")
# embed every id of the sequence; mask_zero=True marks id 0 as padding
location_x = layers.Embedding(
    name="location_embedding",
    input_dim=location_count,
    output_dim=location_embedding,
    input_length=location_shape[0],
    mask_zero=True,
)(location_input)
# kernel_size=1 convolution over the sequence, average over the sequence axis, then a dense layer
location_x = layers.Conv1D(
    name="user_location_Conv1D",
    filters=location_embedding // 2,
    kernel_size=1,
    padding="same",
    activation="selu",
    kernel_initializer="lecun_normal",
)(location_x)
location_x = layers.GlobalAveragePooling1D()(location_x)
location_output = layers.Dense(
    name="user_location_output",
    units=location_embedding // 2,
    activation="selu",
    kernel_initializer="lecun_normal",
)(location_x)

# # Year Of Publication branch: the year value is embedded as an id, reshaped and fed to a dense layer

year_input = layers.Input(shape=(1,), dtype="int64", name="book_year_input")
year_x = layers.Embedding(
    name="book_year_embedding",
    input_dim=years_count,
    output_dim=year_embedding,
    input_length=1,
)(year_input)
year_x = layers.Reshape(name="book_year_reshape", target_shape=(year_embedding,))(year_x)
year_output = layers.Dense(
    name="book_year_output",
    units=year_embedding // 2,
    activation="selu",
    kernel_initializer="lecun_normal",
)(year_x)

# # Book Author branch: same layout for the author id

author_input = layers.Input(shape=(1,), dtype="int64", name="book_author_input")
author_x = layers.Embedding(
    name="book_author_embedding",
    input_dim=author_count,
    output_dim=author_embedding,
    input_length=1,
)(author_input)
author_x = layers.Reshape(name="book_author_reshape", target_shape=(author_embedding,))(author_x)
author_output = layers.Dense(
    name="book_author_output",
    units=author_embedding // 2,
    activation="selu",
    kernel_initializer="lecun_normal",
)(author_x)

# # book Publisher branch: same layout for the publisher id

publisher_input = layers.Input(shape=(1,), dtype="int64", name="book_publisher_input")
publisher_x = layers.Embedding(
    name="book_publisher_embedding",
    input_dim=publisher_count,
    output_dim=publisher_embedding,
    input_length=1,
)(publisher_input)
publisher_x = layers.Reshape(name="book_publisher_reshape", target_shape=(publisher_embedding,))(publisher_x)
publisher_output = layers.Dense(
    name="book_publisher_output",
    units=publisher_embedding // 2,
    activation="selu",
    kernel_initializer="lecun_normal",
)(publisher_x)

# # Book Title branch: padded sequence of character ids

title_input = layers.Input(shape=title_shape, dtype="int64", name="book_title_input")
# embed every character id of the sequence; mask_zero=True marks id 0 as padding
title_x = layers.Embedding(
    name="book_title_embedding",
    input_dim=title_chars_count,
    output_dim=title_embedding,
    input_length=title_shape[0],
    mask_zero=True,
)(title_input)
# kernel_size=1 convolution over the sequence, average over the sequence axis, then a dense layer
title_x = layers.Conv1D(
    name="book_title_Conv1D",
    filters=title_embedding // 2,
    kernel_size=1,
    padding="same",
    activation="selu",
    kernel_initializer="lecun_normal",
)(title_x)
title_x = layers.GlobalAveragePooling1D()(title_x)
title_output = layers.Dense(
    name="book_title_output",
    units=title_embedding // 2,
    activation="selu",
    kernel_initializer="lecun_normal",
)(title_x)


# # # final concatenation and rating prediction part

# # join the outputs of all context (metadata) branches into one vector
metadata_outputs = [age_output, location_output, year_output, author_output, publisher_output, title_output]
merged_x = layers.Concatenate(name="concatenate_metadata", axis=-1)(metadata_outputs)
# merged_x = layers.Dense(units=metadata_dense_dim, activation="selu", kernel_initializer="lecun_normal", name="merged_metadata_dense")(merged_x)

# # join the MF output, the NN output and the metadata vector
merged_x = layers.Concatenate(name="concatenate_all_data", axis=-1)([mf_output, nn_output, merged_x])
print(f"merged_x.shape: {merged_x.shape}")

# # one dense layer over everything
merged_x = layers.Dense(
    name="merged_data_dense",
    units=final_dense_dim,
    activation="selu",
    kernel_initializer="lecun_normal",
)(merged_x)

# dropout
# merged_x = layers.Dropout(0.25)(merged_x)

# # rating prediction: a single sigmoid unit
rating_output = layers.Dense(name="model_output", units=1, activation="sigmoid")(merged_x)  # , activation="sigmoid"


# # # model building and compilation

# # build: inputs and outputs are passed as dicts; the same keys are used when data is fed to the model
model_inputs = {
    "user_id_input": user_id_input,
    "product_id_input": product_id_input,
    "age_input": age_input,
    "location_input": location_input,
    "year_input": year_input,
    "author_input": author_input,
    "publisher_input": publisher_input,
    "title_input": title_input,
}
model_outputs = {"rating_output": rating_output}
rs_model = Model(inputs=model_inputs, outputs=model_outputs, name="recommendation_system_model")
rs_model.summary()

# # compile: MAE is the loss; MAPE, RMSE and MSE are tracked as additional metrics
rmse = metrics.RootMeanSquaredError(name="rmse")
rs_model.compile(
    optimizer=optimizers.Adam(learning_rate=0.001),
    loss="mae",
    metrics=["mape", rmse, "mse"],
)






In [None]:
"""model train"""

# Shuffle and split the rows 80 / 20 into a train and a test part.
train_data, test_data = train_test_split(rating_data, test_size=0.2, shuffle=True)  # .iloc[:200000]
# Renumber both parts from 0 and keep at most the first 10000 test rows.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True).head(10000)

# callbacks
# Save the weights (weights only) whenever val_loss improves.
checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="models/model_1.0.h5",
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True,
)
# Write TensorBoard logs for this run.
tensorboard_callback = callbacks.TensorBoard(log_dir="tensorboard/model_1.0_3")

callbacks_list = [
    checkpoint_callback,
    # callbacks.EarlyStopping(monitor="val_loss", patience=10),
    # callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=5, min_delta=0.0001, min_lr=0.00000001),
    tensorboard_callback,
]
# train process
# Model inputs taken from the train rows; the keys are those used when the model was built.
# The two sequence columns hold one array per row and are stacked into a 2-D array.
train_inputs = {
    "user_id_input": train_data["User-ID"].to_numpy().astype("int64"),
    "product_id_input": train_data["ISBN-Encoded"].to_numpy().astype("int64"),
    "age_input": train_data["Age-Encoded"].to_numpy().astype("float32"),
    "location_input": np.stack(train_data["Location-Encoded"].to_numpy()).astype("int64"),
    "year_input": train_data["Year-Of-Publication"].to_numpy().astype("int64"),
    "author_input": train_data["Book-Author-Encoded"].to_numpy().astype("int64"),
    "publisher_input": train_data["Publisher-Encoded"].to_numpy().astype("int64"),
    "title_input": np.stack(train_data["Book-Title-Encoded"].to_numpy()).astype("int64"),
}
# Target: the scaled rating.
train_targets = {
    "rating_output": train_data["Book-Rating-Encoded"].to_numpy().astype("float32"),
}

# validation_split=0.2: a fifth of the passed train rows is used for validation.
history = rs_model.fit(
    x=train_inputs,
    y=train_targets,
    batch_size=75,
    epochs=20,
    validation_split=0.2,
    callbacks=callbacks_list,
)

# show train history
plt.suptitle("history of model training")

# seaborn context and style
sns.set_context(context="notebook", font_scale=1.0)
sns.set_style(style="darkgrid", rc={'grid.color': '.5'})

# one train curve and one validation curve per tracked quantity, all on the same axes
for history_key in ("loss", "mape", "rmse", "mse"):
    plt.plot(history.history[history_key])
    plt.plot(history.history[f"val_{history_key}"])

# legend entries follow the plotting order above
plt.legend([
    'train_loss', 'validation_loss',
    'train_mape', 'validation_mape',
    'train_rmse', 'validation_rmse',
    'train_mse', 'validation_mse',
])

plt.show()




In [None]:
# load model weights
# rs_model.load_weights("models/model_1.0.h5")

# Model inputs and targets of the test rows, built once and shared by evaluate() and predict().
# The two sequence columns hold one array per row and are stacked into a 2-D array.
test_inputs = {
    "user_id_input": np.asarray(test_data["User-ID"]).astype("int64"),
    "product_id_input": np.asarray(test_data["ISBN-Encoded"]).astype("int64"),
    "age_input": np.asarray(test_data["Age-Encoded"]).astype("float32"),
    "location_input": np.stack(np.asarray(test_data["Location-Encoded"])).astype("int64"),
    "year_input": np.asarray(test_data["Year-Of-Publication"]).astype("int64"),
    "author_input": np.asarray(test_data["Book-Author-Encoded"]).astype("int64"),
    "publisher_input": np.asarray(test_data["Publisher-Encoded"]).astype("int64"),
    "title_input": np.stack(np.asarray(test_data["Book-Title-Encoded"])).astype("int64"),
}
test_targets = {
    "rating_output": np.asarray(test_data["Book-Rating-Encoded"]).astype("float32"),
}

# model evaluation
# The four values are unpacked as the loss (named mae here) followed by mape, rmse and mse.
# NOTE: this rebinds the name `rmse` to a float.
mae, mape, rmse, mse = rs_model.evaluate(x=test_inputs, y=test_targets)
print(f"mae: {mae} ; mape: {mape} ; rmse: {rmse} ; mse: {mse}")


# show predictions examples
prediction = rs_model.predict(x=test_inputs)

# Flatten the predictions to 1-D first: calling float() on a 1-element array
# (ndim > 0) is deprecated since NumPy 1.25, so each value is converted as a scalar.
predicted_ratings = np.asarray(prediction["rating_output"]).reshape(-1)
real_ratings = test_targets["rating_output"]

for predicted_rating, real_rating in zip(predicted_ratings, real_ratings):
    print(f"prediction: {round(float(predicted_rating), 4)}")
    print(f"real: {round(float(real_rating), 4)}")