In [111]:
import os
import re
import time
import string
import polars as pl

<div style="border-radius:10px; border:#DEB887 solid; padding: 15px; background-color: #FFFAF0; font-size:100%; text-align:left">
<h3 align="left"><font color='#DEB887'>Info</font></h3>
    
This notebook concatenates the raw review files, joins them with the book metadata, and keeps only the columns needed to train the Machine Learning model for the Recommendation System.

# List all review parquet files to be loaded

In [112]:
path = '../data/raw/reviews'
# Collect the names of the parquet files sitting directly inside `path`.
# - Entries that are not `.parquet` files (e.g. OS metadata files) are skipped
#   so they are never handed to pl.read_parquet later on.
# - os.scandir yields entries in arbitrary order, so the names are sorted to
#   make the load order the same on every run and every machine.
with os.scandir(path) as entries:
    reviewFiles = sorted(
        entry.name
        for entry in entries
        if entry.is_file() and entry.name.endswith(".parquet")
    )
print(reviewFiles)

['processed_reviews_books_1.parquet', 'processed_reviews_books_10.parquet', 'processed_reviews_books_11.parquet', 'processed_reviews_books_12.parquet', 'processed_reviews_books_13.parquet', 'processed_reviews_books_3.parquet', 'processed_reviews_books_4.parquet', 'processed_reviews_books_5.parquet', 'processed_reviews_books_6.parquet', 'processed_reviews_books_7.parquet', 'processed_reviews_books_8.parquet', 'processed_reviews_books_9.parquet']


# Load and concatenate all parquet files in the directory

In [113]:
# Read every review file listed in `reviewFiles` and stack them into one
# DataFrame. The frames are collected first and concatenated with a single
# pl.concat call, instead of repeatedly vstack-ing onto an initially
# schema-less empty DataFrame.
frames = []
for f in reviewFiles:
    n = f"{path}/{f}"  # reuse `path` rather than repeating the directory literal
    print("reading " + n)
    frames.append(pl.read_parquet(n))

# pl.concat rejects an empty list; fall back to an empty frame so that an
# empty directory still leaves `df` defined, as the original loop did.
df = pl.concat(frames, how="vertical") if frames else pl.DataFrame()


reading ../data/raw/reviews/processed_reviews_books_1.parquet
Adding...../data/raw/reviews/processed_reviews_books_1.parquet
reading ../data/raw/reviews/processed_reviews_books_10.parquet
Adding...../data/raw/reviews/processed_reviews_books_10.parquet
reading ../data/raw/reviews/processed_reviews_books_11.parquet
Adding...../data/raw/reviews/processed_reviews_books_11.parquet
reading ../data/raw/reviews/processed_reviews_books_12.parquet
Adding...../data/raw/reviews/processed_reviews_books_12.parquet
reading ../data/raw/reviews/processed_reviews_books_13.parquet
Adding...../data/raw/reviews/processed_reviews_books_13.parquet
reading ../data/raw/reviews/processed_reviews_books_3.parquet
Adding...../data/raw/reviews/processed_reviews_books_3.parquet
reading ../data/raw/reviews/processed_reviews_books_4.parquet
Adding...../data/raw/reviews/processed_reviews_books_4.parquet
reading ../data/raw/reviews/processed_reviews_books_5.parquet
Adding...../data/raw/reviews/processed_reviews_books_5.

In [114]:
# Peek at one row of the concatenated reviews. A fixed seed makes the
# displayed row the same on every run.
df.sample(1, seed=0)

asin,overall,reviewText,reviewerID,summary,verified,unixReviewTime
str,str,str,str,str,str,datetime[μs]
"""0425252868""","""5.0""","""It is a fabulo…","""A2DCNYQZOC0M1B…","""I loved it and…","""false""",2017-01-19 00:00:00


In [115]:
# (rows, columns) of the concatenated reviews, before any null handling.
df.shape

(2620482, 7)

In [116]:
# Number of null values in each column.
df.null_count()

asin,overall,reviewText,reviewerID,summary,verified,unixReviewTime
u32,u32,u32,u32,u32,u32,u32
0,0,1260,0,796,0,0


In [117]:
# Drop every row that contains a null in any column.
# NOTE(review): the null counts displayed for this frame are non-zero only for
# `reviewText` and `summary`, and neither column is kept in the final dataset
# (only title, reviewerID and the review time are selected further down).
# Rows removed here are therefore lost for columns that are not used —
# confirm this is intended, or restrict the drop to the columns that matter.
df = df.drop_nulls()

In [118]:
# (rows, columns) after dropping rows that contained nulls.
df.shape

(2618468, 7)

Save it as a temporary parquet file

In [119]:
# Checkpoint the cleaned, concatenated reviews to disk.
# The output directory is created first so that a fresh checkout without a
# `data/processed` folder does not fail on the write.
os.makedirs("../data/processed", exist_ok=True)
df.write_parquet("../data/processed/All_reviews.parquet")

In [120]:
# Read the saved reviews checkpoint back from disk under the name `reviews`.
reviews = pl.read_parquet("../data/processed/All_reviews.parquet")

Load the metadata for the books category

In [121]:
# Book metadata, read from the raw data folder; it is joined to the reviews
# on `asin` further down.
metadata = pl.read_parquet("../data/raw/all_metadata_books_in_top25k.parquet")

In [122]:
# (rows, columns) of the reviews loaded from the checkpoint.
reviews.shape

(2618468, 7)

In [123]:
# (rows, columns) of the book metadata.
metadata.shape

(12049, 9)

In [124]:
# Peek at one review row. A fixed seed makes the displayed row the same on
# every run.
reviews.sample(1, seed=0)

asin,overall,reviewText,reviewerID,summary,verified,unixReviewTime
str,str,str,str,str,str,datetime[μs]
"""0670922854""","""4.0""","""It took 3 week…","""A3I8GRDBFX8R25…","""Shipper asleep…","""true""",2014-11-23 00:00:00


In [125]:
# Peek at one metadata row. A fixed seed makes the displayed row the same on
# every run.
metadata.sample(1, seed=0)

asin,brand,price,title,cant_image,cant_category,rank_in_category,also_buy_count,also_view_count
str,str,f64,str,u32,u32,i64,u32,u32
"""1449462308""","""Visit Amazon's…",6.9,"""Big Nate: A Go…",0,3,4235,98,59


Join the two dataframes on the ASIN

In [126]:
# Inner-join every review onto the metadata row that shares its ASIN.
# The join type is spelled out rather than left to the default.
join_keys = ["asin"]
df_joined = metadata.join(other=reviews, on=join_keys, how="inner")

In [127]:
# Sanity check: an inner join can only yield more rows than `reviews` if some
# ASIN appears more than once in `metadata`, which would duplicate reviews.
assert df_joined.shape[0] <= reviews.shape[0], (
    "join produced more rows than there are reviews; "
    "check `metadata` for duplicate ASINs"
)
# (rows, columns) of the joined frame.
df_joined.shape

(2618468, 15)

In [128]:
# Peek at one joined row. A fixed seed makes the displayed row the same on
# every run.
df_joined.sample(1, seed=0)

asin,brand,price,title,cant_image,cant_category,rank_in_category,also_buy_count,also_view_count,overall,reviewText,reviewerID,summary,verified,unixReviewTime
str,str,f64,str,u32,u32,i64,u32,u32,str,str,str,str,str,datetime[μs]
"""1579654355""","""Visit Amazon's…",34.99,"""Bouchon Bakery…",0,3,8292,97,60,"""5.0""","""Bought this fo…","""ATFMFB0PCGM0B""","""Gem""","""true""",2013-11-21 00:00:00


In [129]:
# Keep only the three columns carried into the final dataset:
# the book title, the reviewer id and the review timestamp.
kept_columns = ["title", "reviewerID", "unixReviewTime"]
final = df_joined.select(kept_columns)


In [130]:
# Peek at one row of the reduced frame. A fixed seed makes the displayed row
# the same on every run.
final.sample(1, seed=0)

title,reviewerID,unixReviewTime
str,str,datetime[μs]
"""Dragons of Spr…","""A2EB2V88K2SRU""",2017-04-22 00:00:00


In [131]:
# (rows, columns) of the reduced frame, before de-duplication.
final.shape
# NOTE(review): 2916355 looks like a row count left over from an earlier run;
# it does not match the value displayed for this cell — confirm and remove.

(2618468, 3)

In [132]:
# Remove rows that are exact duplicates across all remaining columns
# (title, reviewerID, unixReviewTime).
final = final.unique()

In [133]:
# (rows, columns) after removing duplicate rows.
final.shape

(2612701, 3)

In [134]:
# Swap the datetime column for its raw integer representation, stored under
# the name `reviewTime`. The source column is datetime[μs], so the integers
# are microseconds since the Unix epoch (not seconds).
review_time_int = pl.col("unixReviewTime").cast(pl.Int64)
final = final.with_columns(reviewTime=review_time_int).drop("unixReviewTime")

In [135]:
# Show a bounded preview rather than asking the notebook to render the whole
# multi-million-row frame.
final.head(10)

title,reviewerID,reviewTime
str,str,i64
"""Boys Adrift: T…","""A245831QTLS0K3…",1196726400000000
"""Boys Adrift: T…","""A3PN9EB0B1QBY4…",1196208000000000
"""Boys Adrift: T…","""AU76601D8U0CD""",1191628800000000
"""Why Are All th…","""A3V0N3ORMPV7BC…",1516320000000000
"""Why Are All th…","""A3B6J2XHJMJ59S…",1515542400000000
"""Why Are All th…","""A10LL2QWVPCH60…",1515110400000000
"""Why Are All th…","""A4N8U31DLAUU8""",1505433600000000
"""Why Are All th…","""A28WYES84Z2PFQ…",1504915200000000
"""Why Are All th…","""A1ECC13YLXYNK4…",1503705600000000
"""Why Are All th…","""A285RQFOK8APPQ…",1495843200000000


Save the final dataset that will be used to train the model

In [136]:
# Persist the final (title, reviewerID, reviewTime) dataset.
# The output directory is created first so the write does not depend on an
# earlier cell or a pre-existing folder.
os.makedirs("../data/processed", exist_ok=True)
final.write_parquet("../data/processed/Final_data_for_ML.parquet")