In [None]:
# First attempt at building a recommendation-system model


"""imports"""

import pandas as pd
import numpy as np
import re
import time
from logger import logger
import os
import string

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from keras import models, layers, utils, Model, optimizers, activations, callbacks, losses, metrics
from keras import backend as K

# display ALL columns when printing a dataframe
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)


In [None]:
"""load"""

# Base directory of the CSV dump. It can be overridden with the BX_DATA_DIR
# environment variable; the default is the location this notebook was written for.
base_path = os.environ.get("BX_DATA_DIR", r"F:/large_data/BX-CSV-Dump/")

# load all files
# os.path.join is used so that a base path without a trailing separator still works.
rating_data = pd.read_csv(
    os.path.join(base_path, "BX-Book-Ratings.csv"), sep=";",
)
user_data = pd.read_csv(
    os.path.join(base_path, "BX-Users.csv"), sep=";",
)
# The books file is split on the three-character delimiter '";"'. A multi-character
# separator is handled by the python engine, and it leaves the opening quote attached
# to the first header, hence the '"ISBN' column name (cleaned up further below).
# `low_memory` is a C-parser option and is therefore not passed here.
books_data = pd.read_csv(
    os.path.join(base_path, "BX-Books.csv"), sep=r'";"', engine="python",
    usecols=['"ISBN', "Book-Title", "Book-Author", "Year-Of-Publication", "Publisher"],
)

# show some data
# print(rating_data)
# print(rating_data.describe())
# print(user_data)
# print(user_data.describe())
# print(books_data)
# print(books_data.describe())


"""pre-processing of data"""


# merge
# rating_data = rating_data.merge(books_data, how="left", left_on="ISBN", right_index=True)
# print(rating_data)


"""books_data processing"""

# Clean the "Year-Of-Publication" column.
# Anything that is not a number (missing values, stray text) is coerced to NaN,
# and every value that is not a plausible four-digit year is replaced with 0.
# Parsing numerically (instead of counting characters) avoids int() failures on
# values such as "20.5" and keeps working when the column was read as float.
publication_year = pd.to_numeric(books_data["Year-Of-Publication"], errors="coerce")
books_data["Year-Of-Publication"] = (
    publication_year
    .where(publication_year.between(1000, 9999), 0)  # NaN fails the test and becomes 0 too
    .astype("int64")
)


# The first header was read as '"ISBN' (with a leading quote); give it its proper name.
books_data = books_data.rename(columns={'"ISBN': 'ISBN'})

# Every value in that column carries the same extra leading character; drop it.
books_data["ISBN"] = books_data["ISBN"].apply(lambda isbn: str(isbn)[1:])

# Use the row index as the integer id of each book.
books_data["ISBN-Encoded"] = books_data.index.astype("int64")


# Missing authors are replaced by the literal placeholder "None".
books_data["Book-Author"] = books_data["Book-Author"].fillna("None")

# Build both lookup tables for authors; codes follow the order of the unique values.
unique_authors = books_data["Book-Author"].unique()
number_to_author = dict(enumerate(unique_authors))
author_to_number = {author: number for number, author in number_to_author.items()}
del unique_authors

# Integer code of each book's author.
books_data["Book-Author-Encoded"] = books_data["Book-Author"].map(author_to_number).astype("int64")
# print(books_data["Book-Author-Encoded"])


# Missing publishers are replaced by the literal placeholder "None".
books_data["Publisher"] = books_data["Publisher"].fillna("None")

# Build both lookup tables for publishers (unique_publisher is kept, as before).
unique_publisher = books_data["Publisher"].unique()
number_to_publisher = dict(enumerate(unique_publisher))
publisher_to_number = {publisher: number for number, publisher in number_to_publisher.items()}

# Integer code of each book's publisher.
books_data["Publisher-Encoded"] = books_data["Publisher"].map(publisher_to_number).astype("int64")
# print(books_data["Publisher-Encoded"])


# Missing titles are replaced by the literal placeholder "None".
books_data["Book-Title"] = books_data["Book-Title"].fillna("None")

# Character vocabulary: lowercase ASCII letters plus the space, numbered from 1
# so that 0 stays free for padding.
title_alphabet = string.ascii_lowercase + " "
symbol_to_number = {symbol: number for number, symbol in enumerate(title_alphabet, start=1)}
number_to_symbol = {number: symbol for symbol, number in symbol_to_number.items()}


def _title_to_codes(title):
    """Lower-case a title, drop everything except a-z and spaces, and encode it per character."""
    cleaned = re.sub(r"[^a-z ]", "", title.lower()).strip()
    return [symbol_to_number[char] for char in cleaned]


# Encode every title on character level, then pad all sequences with 0 to one length.
title_codes = books_data["Book-Title"].apply(_title_to_codes)
books_data["Book-Title-Encoded"] = list(utils.pad_sequences(sequences=title_codes, value=0))
# print(books_data["Book-Title-Encoded"])


# show all processed books_data in one
# print(books_data[["Year-Of-Publication", "ISBN-Encoded", "Book-Author-Encoded", "Publisher-Encoded", "Book-Title-Encoded"]])
# print(books_data[["Year-Of-Publication", "ISBN-Encoded", "Book-Author-Encoded", "Publisher-Encoded", "Book-Title-Encoded"]].describe())


"""user_data processing"""

# Missing ages are recorded as 0.
user_data["Age"] = user_data["Age"].fillna(0)

# Min-max scale the ages into the range [0, 1]. The scaler works on a 2-D column,
# so the values are reshaped to (n, 1) and flattened back afterwards.
age_column = user_data["Age"].to_numpy().reshape(-1, 1)
user_data["Age-Encoded"] = preprocessing.MinMaxScaler().fit_transform(age_column).ravel()
# print(user_data["Age-Encoded"])


# Missing locations are replaced by the placeholder "n/a".
user_data["Location"] = user_data["Location"].fillna("n/a")

# Lower-case each location string and split it into its ", "-separated parts.
location_parts = user_data["Location"].apply(lambda text: text.lower().strip().split(", "))

# Collect every part of every user into one flat array and keep the sorted unique values.
all_parts = np.asarray([part for parts in location_parts for part in parts], dtype="str")
sorted_unique_parts = np.unique(all_parts)

# Vocabulary order: "blank_value" first (code 0), followed by the unique parts in
# descending order.
user_data_unique_locations = np.concatenate((["blank_value"], sorted_unique_parts[::-1]))

# Lookup tables between location parts and their integer codes.
number_to_location = dict(enumerate(user_data_unique_locations))
location_to_number = {location: number for number, location in number_to_location.items()}
# del user_data_unique_locations

# Encode every user's parts and pad all sequences to one length with the blank code.
encoded_parts = location_parts.apply(lambda parts: [location_to_number[part] for part in parts])
user_data["Location-Encoded"] = list(utils.pad_sequences(sequences=encoded_parts,
                                                         value=location_to_number["blank_value"]))


# show all processed user_data in one
# print(user_data[["Age-Encoded", "Location-Encoded"]])
# print(user_data[["Age-Encoded", "Location-Encoded"]].describe())


"""rating_data processing and merging it all together"""

# Min-max scale the book ratings into the range [0, 1].
rating_scaler = preprocessing.MinMaxScaler(feature_range=(0., 1.))
raw_ratings = np.asarray(rating_data["Book-Rating"]).reshape(-1, 1)
rating_data["Book-Rating-Encoded"] = rating_scaler.fit_transform(raw_ratings).ravel()
# print(rating_data["Book-Rating-Encoded"])


# Pin explicit dtypes on all three frames (this also makes the join keys
# "ISBN" and "User-ID" the same type on both sides of each merge).
rating_data = rating_data.astype({
    "ISBN": "str",
    "User-ID": "int64",
    "Book-Rating": "float64",
    "Book-Rating-Encoded": "float64",
})
user_data = user_data.astype({
    "User-ID": "int64",
    "Location": "str",
    "Age": "float64",
    "Age-Encoded": "float64",
    "Location-Encoded": "object",
})
books_data = books_data.astype({
    "ISBN": "str",
    "Book-Title": "str",
    "Book-Author": "str",
    "Publisher": "str",
})

# Attach user and book features to every rating (left joins keep all ratings).
rating_data = (
    rating_data
    .merge(user_data, how="left", on="User-ID")
    .merge(books_data, how="left", on="ISBN")
)

# The source frames are no longer needed.
del user_data, books_data

# Drop every rating row that has a missing value in any column, e.g. ratings
# whose user or book was not found by the left joins.
rating_data = rating_data.dropna(axis=0, how="any")



In [None]:
"""final data"""

# Tunable thresholds for this cell.
MIN_ENCODED_RATING = 0.1  # lowest scaled rating that is kept
MIN_REVIEWS = 20          # minimum number of ratings per book and per user
SHUFFLE_SEED = 42         # fixed seed so the shuffled row order is reproducible

# take only rows with rating >= MIN_ENCODED_RATING
rating_data = rating_data.loc[rating_data["Book-Rating-Encoded"] >= MIN_ENCODED_RATING]

# Keep only books with at least MIN_REVIEWS ratings. transform("size") broadcasts
# each group's size back onto its rows, which avoids calling a Python function per group.
book_review_counts = rating_data.groupby("ISBN-Encoded")["ISBN-Encoded"].transform("size")
rating_data = rating_data.loc[book_review_counts >= MIN_REVIEWS]

# Keep only users with at least MIN_REVIEWS ratings among the remaining rows.
# The two filters are applied once, in this order, so removing users here can
# leave some books with fewer than MIN_REVIEWS ratings again.
user_review_counts = rating_data.groupby("User-ID")["User-ID"].transform("size")
rating_data = rating_data.loc[user_review_counts >= MIN_REVIEWS]

# # take part of data with only threshold_value users and threshold_value books
# threshold_value = 50000
# rating_data = rating_data.loc[rating_data['ISBN-Encoded'] <= threshold_value]
# rating_data = rating_data.loc[rating_data['User-ID'] <= threshold_value]


# shuffle all rows in dataframe (seeded) and renumber the index
rating_data = rating_data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# show final full dataframe
print(rating_data)
print(rating_data.describe())

# # take and show only useful for work of model data
# useful_data = rating_data[["User-ID", "ISBN-Encoded", "Book-Rating-Encoded", "Age-Encoded", "Location-Encoded",
#                            "Year-Of-Publication", "Book-Author-Encoded", "Publisher-Encoded", "Book-Title-Encoded"]]
# print(useful_data)




In [None]:
"""surprise SVD model"""

# The Surprise names used in this cell are not imported in the notebook's import
# cell, so they are imported here. Surprise's own train_test_split is required
# because a Surprise Dataset cannot be split by scikit-learn's function; it is
# aliased so the scikit-learn train_test_split imported earlier is not shadowed.
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split as surprise_train_test_split

# prepare train and test sets from (user, item, rating) triples on the scaled 0..1 rating
reader = Reader(rating_scale=(0., 1.))
data = Dataset.load_from_df(rating_data[["User-ID", "ISBN-Encoded", "Book-Rating-Encoded"]], reader)
train, test = surprise_train_test_split(data, test_size=0.2)

# init and train the funk mf model
algo = SVD(n_factors=100, n_epochs=20)
algo.fit(train)
pred = algo.test(test)

# evaluate on the test set; the reported value is the RMSE, so lower is better
model_accuracy = accuracy.rmse(pred)
print(f"model accuracy: {model_accuracy}")

# compare real and predicted ratings for the first 100 test rows
# (predictions are paired with the test rows by position)
for prediction, actual in zip(pred[:100], test[:100]):
    print(f"prediction {prediction}")
    print(f"real {actual}")
