In [121]:
import polars as pol

In [122]:
with pol.Config() as cfg:
    cfg.set_tbl_cols(5)
    cfg.set_tbl_rows(5)

# Load parquet files

In [123]:
df_metadata = pol.read_parquet("../data/raw/metadata1.parquet", 
                      columns=["asin","brand","category", "date", "main_cat", "price","rank", "title", "image"], 
                      n_rows=50)

# Functions

In [138]:
def save_file(df, t):
    p = str.format("../data/processed/{}.json", t)
    df.write_json(p)

# EDA

In [125]:
print(df_metadata.shape)
print(type(df_metadata))

(50, 9)
<class 'polars.dataframe.frame.DataFrame'>


In [126]:
df_metadata.sample(5)

asin,brand,category,date,main_cat,price,rank,title,image
str,str,list[str],str,str,str,str,str,list[str]
"""7293005946""","""BSK Eyewear""","[""Clothing, Shoes & Jewelry"", ""Women"", … ""Sunglasses""]","""""","""Sports & Outdo…","""""","""4,044,815 in S…","""BSK Eyewear Un…","[""https://images-na.ssl-images-amazon.com/images/I/3176Crl-QuL._SS40_.jpg""]"
"""6319781610""","""Alion""","[""Clothing, Shoes & Jewelry"", ""Women"", … ""US Medium=China X-Large:Length:70.47""(179cm),Bust:38.58""(98cm),Waist:33.86""(86cm),Hip:42.13""(107cm)""]","""<div class=""a-…","""<img src=""http…","""$24.88 - $26.8…","""14,151,913 in …","""Alion Women Cl…","[""https://images-na.ssl-images-amazon.com/images/I/41uCeNnYHeL._SR38,50_.jpg"", ""https://images-na.ssl-images-amazon.com/images/I/61Yf6nfUAyL._SR38,50_.jpg""]"
"""6342506256""","""Gaok""","[""Clothing, Shoes & Jewelry"", ""Men"", … ""For detailed size information from PRODUCT DESCRIPTION before ordering and choose fits size""]","""<div class=""a-…","""<img src=""http…","""$26.99""","""2,877,972 in C…","""Gaok Men's Ret…","[""https://images-na.ssl-images-amazon.com/images/I/61yKRB3EJIL._SR38,50_.jpg"", ""https://images-na.ssl-images-amazon.com/images/I/615X5QZ99ML._SR38,50_.jpg"", … ""https://images-na.ssl-images-amazon.com/images/I/41VCKEGXC3L._SX38_SY50_CR,0,0,38,50_.jpg""]"
"""634252209X""","""OLO""","[""Clothing, Shoes & Jewelry"", ""Women"", … ""Sunglasses""]","""""","""Sports & Outdo…","""$3.29""","""936,295 in Spo…","""Crazy Explosio…","[""https://images-na.ssl-images-amazon.com/images/I/31HV7SFXkhL._SS40_.jpg"", ""https://images-na.ssl-images-amazon.com/images/I/41ot2OsJ8TL._SS40_.jpg"", … ""https://images-na.ssl-images-amazon.com/images/I/41hN2%2BUjIyL._SS36_.jpg""]"
"""7482782788""","""Gifts by Lulee…","[""Clothing, Shoes & Jewelry"", ""Men"", … ""plus another 18 inches extra""]","""Gifts by Lulee…","""<img src=""http…","""$10.99""","""6,425,160 in C…","""Unisex Genuine…","[""https://images-na.ssl-images-amazon.com/images/I/61wjN%2BdpnEL._US40_.jpg""]"


# Preprocessing and feature generation

In [127]:
# Primero se cuentan cuantas imagenes tiene el producto y la cantidad de categorias a las que esta asociado
# luego se eliminan las columnas IMAGE y CATEGORY

df_metadata = df_metadata.with_columns(( pol.col("image").list.lengths()).alias("cant_image")).drop("image")
df_metadata = df_metadata.with_columns(( pol.col("category").list.lengths()).alias("cant_category")).drop("category")
df_metadata.sample(2)

asin,brand,date,main_cat,price,rank,title,cant_image,cant_category
str,str,str,str,str,str,str,u32,u32
"""6342520577""","""Y-BIN""","""<div class=""a-…","""<img src=""http…","""$7.99""","""3,644,131 in C…","""Y-BIN Women's …",3,9
"""7293005946""","""BSK Eyewear""","""""","""Sports & Outdo…","""""","""4,044,815 in S…","""BSK Eyewear Un…",1,5


In [128]:
# de la columna DATE se extrae la unica info que me parece util y se borra todo el HTML existente, la info util
# es basicamente un ranking algo como 5 star 4 star 3 star y si consecutivamente
df_metadata = df_metadata.with_columns(( pol.col("date").str.extract(r"[0-5]", 0))
                                       .alias("cant_star_from_date")) #.drop("date")
df_metadata.sample(3)


asin,brand,date,main_cat,price,rank,title,cant_image,cant_category,cant_star_from_date
str,str,str,str,str,str,str,u32,u32,str
"""6342502315""","""Crazy""","""<div class=""a-…","""<img src=""http…","""$0.50""","""273,519 in Clo…","""Crazy Women's …",11,8,"""2"""
"""7293005946""","""BSK Eyewear""","""""","""Sports & Outdo…","""""","""4,044,815 in S…","""BSK Eyewear Un…",1,5,
"""6342523002""","""FQQ""","""<div class=""a-…","""<img src=""http…","""$4.50""","""1,761,440 in C…","""FQQ Women's Se…",3,12,"""2"""


In [129]:
# en la columna MAIN_CAT se elimina todo el HTML existente ya que como tal no dice nada acerca de la categoria a la que pertenece
df_metadata = df_metadata.with_columns(( pol.col("main_cat").str.replace(r"^<.*$", "") )
                                       .alias("main_category_clean")) #.drop("main_cat")
df_metadata.sample(3)


# si la columna main_cat lo que tiene e HTML basicamente lo cambia por un espacio en blanco ya que como tal no hay categoria 
# en ese caso habria que ver de donde se obtiene esa categoria para ser usada
# o
# si vamos a centrarnos solo en una sola categoria este problema se evitaria por completo ya que 
# todos los records deberian venir con ese campo válido

asin,brand,date,main_cat,price,rank,title,cant_image,cant_category,cant_star_from_date,main_category_clean
str,str,str,str,str,str,str,u32,u32,str,str
"""6319781610""","""Alion""","""<div class=""a-…","""<img src=""http…","""$24.88 - $26.8…","""14,151,913 in …","""Alion Women Cl…",2,9,"""2""",""""""
"""6976380817""","""Suncolor8""","""<div class=""a-…","""<img src=""http…","""$6.36 - $8.66""","""[]""","""Suncolor8 Wome…",4,10,"""2""",""""""
"""6342510598""","""MengK""","""<div class=""a-…","""<img src=""http…","""""","""11,233,196 in …","""MENGK Womens D…",2,4,"""2""",""""""


In [164]:
#extraer el ranking de la columna rank
df_metadata = df_metadata.with_columns(( pol.col("rank").str.extract(r"[0-9,@._-]+", 0))
                                       .alias("clean_rank")) #.drop("main_cat")
df_metadata.sample(3)

asin,brand,date,main_cat,price,rank,title,cant_image,cant_category,cant_star_from_date,main_category_clean,clean_rank
str,str,str,str,str,str,str,u32,u32,str,str,str
"""7304678828""","""Jdress""","""<div class=""a-…","""<img src=""http…","""$89.99""","""5,376,665 in C…","""Jdress Women's…",18,9,"""2""","""""","""5,376,665"""
"""6665559769""","""Honchosfx""","""<div class=""a-…","""<img src=""http…","""$21.74""","""598,919 in Clo…","""Honchosfx Mens…",2,13,"""2""","""""","""598,919"""
"""6338750458""","""F ARMAF""","""""","""Amazon Home""","""$29.74""","""["">#236,120 in…","""Armaf Club De …",0,7,,"""Amazon Home""","""236,120"""


# Save temp file

In [139]:
#save_file(df_metadata, 'tmp_metadata')