In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from scipy import stats
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

import ast
from sqlalchemy import create_engine, text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from fuzzywuzzy import process, fuzz
from IPython.display import clear_output
import pickle
# from data_db import user, mdp
import time

In [3]:
book_data = pd.read_csv("Dataset/books_metadata_Amazon.csv", delimiter=',', on_bad_lines='skip')
book_data = book_data[['Title', 'authors', 'categories']] # keep the Title, authors and categories from the columns

In [4]:
# Function to convert a list into a string
def list_to_string(list):
    return ', '.join(list)

def str_to_list(list_str):
    if isinstance(list_str, str):
        return ast.literal_eval(list_str)
    else:
        return []

# Replace NaN with empty strings
book_data['authors'].fillna('[]', inplace=True)
book_data['categories'].fillna('[]', inplace=True)

# Convert strings that look like lists into actual lists
book_data['authors'] = book_data['authors'].apply(str_to_list)
book_data['categories'] = book_data['categories'].apply(str_to_list)

# Convert the lists into strings
book_data['authors'] = book_data['authors'].apply(list_to_string)
book_data['categories'] = book_data['categories'].apply(list_to_string)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  book_data['authors'].fillna('[]', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  book_data['categories'].fillna('[]', inplace=True)


In [7]:
# consider on user_id, book_id, and ratings
df = pd.read_csv('Dataset/Books_Amazon.csv')
df = df[['Id','User_id','Title','review/score', 'review/time']]
df.rename(columns={'Id':'ProductId','User_id':'UserId','review/time':'Time','Title':'title','review/score':'Score'},inplace=True)

In [10]:
# FUSION DES DONNEES
df['title'] = df['title'].str.strip().str.lower()
book_data['Title'] = book_data['Title'].str.strip().str.lower()

# Merge the DataFrames on the titles
df = df.merge(book_data, how='left', left_on='title', right_on='Title')

# Supprimez la colonne des titres en double
df = df.drop(columns=['Title'])


In [11]:
#lots of fields where user is NaN
df = df.dropna(subset=['UserId'])

In [12]:
print("Size of 'ProductId' column:", len(df['ProductId']))
print("Size of 'UserId' column:", len(df['UserId']))

# Define the threshold values
product_id_threshold = 200 
user_id_threshold = 10

# Count the occurrences of ProductId and UserId
product_id_counts = df['ProductId'].value_counts()
user_id_counts = df['UserId'].value_counts()

# Filter out rows below the threshold
filtered_df = df[(df['ProductId'].isin(product_id_counts[product_id_counts >= product_id_threshold].index)) &
                 (df['UserId'].isin(user_id_counts[user_id_counts >= user_id_threshold].index))]

print("Size of 'ProductId' column:", len(filtered_df['ProductId']))
print("Size of 'UserId' column:", len(filtered_df['UserId']))

Size of 'ProductId' column: 2687258
Size of 'UserId' column: 2687258
Size of 'ProductId' column: 534135
Size of 'UserId' column: 534135


In [13]:
filtered_df.to_csv('Book_Dataset.csv', index=False)

In [14]:
filtered_df

Unnamed: 0,ProductId,UserId,title,Score,Time,authors,categories
1235,B0007H4QBK,AF3X7J0XC391L,economics in one lesson,5.0,1106265600,Henry Hazlitt,Business & Economics
1237,B0007H4QBK,A3FHSO1SKHU378,economics in one lesson,5.0,1255996800,Henry Hazlitt,Business & Economics
1244,B0007H4QBK,A3OWUSU9RG4NMF,economics in one lesson,5.0,1249344000,Henry Hazlitt,Business & Economics
1248,B0007H4QBK,ATMOVG6SV5D2B,economics in one lesson,4.0,1240185600,Henry Hazlitt,Business & Economics
1257,B0007H4QBK,A16QJ649N8PRV,economics in one lesson,5.0,1118188800,Henry Hazlitt,Business & Economics
...,...,...,...,...,...,...,...
3320075,B000P91JYW,A1O4UBZ7G4ID65,jane eyre,2.0,1044921600,Charlotte Brontë,Literary Criticism
3320077,B000P91JYW,A2JPT2LH3AVA1A,jane eyre,5.0,1029283200,Charlotte Brontë,Literary Criticism
3320078,B000P91JYW,A2LWE21EAEHMG4,jane eyre,3.0,992649600,Charlotte Brontë,Literary Criticism
3320081,B000P91JYW,A1H59JRV8YR1EU,jane eyre,1.0,1021852800,Charlotte Brontë,Literary Criticism


In [16]:
import pandas as pd

test = pd.read_csv('Book_Dataset.csv')
test


Unnamed: 0,ProductId,UserId,title,Score,Time,authors,categories
0,B0007H4QBK,AF3X7J0XC391L,economics in one lesson,5.0,1106265600,Henry Hazlitt,Business & Economics
1,B0007H4QBK,A3FHSO1SKHU378,economics in one lesson,5.0,1255996800,Henry Hazlitt,Business & Economics
2,B0007H4QBK,A3OWUSU9RG4NMF,economics in one lesson,5.0,1249344000,Henry Hazlitt,Business & Economics
3,B0007H4QBK,ATMOVG6SV5D2B,economics in one lesson,4.0,1240185600,Henry Hazlitt,Business & Economics
4,B0007H4QBK,A16QJ649N8PRV,economics in one lesson,5.0,1118188800,Henry Hazlitt,Business & Economics
...,...,...,...,...,...,...,...
534130,B000P91JYW,A1O4UBZ7G4ID65,jane eyre,2.0,1044921600,Charlotte Brontë,Literary Criticism
534131,B000P91JYW,A2JPT2LH3AVA1A,jane eyre,5.0,1029283200,Charlotte Brontë,Literary Criticism
534132,B000P91JYW,A2LWE21EAEHMG4,jane eyre,3.0,992649600,Charlotte Brontë,Literary Criticism
534133,B000P91JYW,A1H59JRV8YR1EU,jane eyre,1.0,1021852800,Charlotte Brontë,Literary Criticism
