# Preprocessing Amazon2023 for Item Context Information

In [9]:
from typing import List

import os
from freerec.data.tags import USER, ITEM, RATING, TIMESTAMP
from freerec.data.preprocessing.amazon2023 import extract_from_amazon2023

In [10]:
# --- Notebook configuration -------------------------------------------------
# Amazon2023 category to preprocess; also used as the output file stem.
dataset: str = "Beauty"
# Which image variant to keep for every item.
image_size: str = 'large' # (thumb, large, vis)
# Directory that holds the raw files and receives the processed ones.
filedir: str = f"../data/Amazon2023/{dataset}"
# Output paths: "<filedir>/<dataset>.inter" and "<filedir>/<dataset>.item".
inter_file, item_file = (
    os.path.join(filedir, f"{dataset}{suffix}") for suffix in (".inter", ".item")
)

In [11]:
# Parse the raw files under `filedir` into an interaction frame and an item frame.
inter_df, item_df = extract_from_amazon2023(filedir)
# Preview the first five rows of each frame (head() defaults to n=5).
(inter_df.head(), item_df.head())

(                           USER        ITEM  RATING      TIMESTAMP parent_asin
 0  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  B00Z03RC80     1.0  1616743454733  B00Z03RC80
 1  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  B085PRT2MP     1.0  1614915977684  B085PRT2MP
 2  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  B08G81QQ9L     5.0  1612052493701  B08G81QQ9L
 3  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  B07YYG76X1     1.0  1609700981786  B07YYG76X1
 4  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  B0761M33BX     3.0  1581313195358  B07X4FKLNK,
          ITEM parent_asin                                              title  \
 0  B00Z03RC80  B00Z03RC80  Terra Tattoos Tropical Hawaiian Metallic Tatto...   
 1  B085PRT2MP  B085PRT2MP  Spray Bottle,Fine Mist Mini Clear 60ml/2oz Spr...   
 2  B08G81QQ9L  B08G81QQ9L  Diamond Painting Animals DIY 5D Number Kits fo...   
 3  B07YYG76X1  B07YYG76X1  ATMOKO Electric Toothbrushes for Adults 3 Soni...   
 4  B0761M33BX  B07X4FKLNK  Teamkio 18pcs Manicure Set Pedicure Nail Clipp...   
 
                               

In [12]:

# text converter that merges texts for each field
def text_converter(texts) -> str:
    """Merge the content of one item field into a single stripped string.

    Parameters
    ----------
    texts
        The raw field value. Supported shapes:

        - ``str``: returned as is (stripped).
        - ``list`` / ``tuple``: entries are numbered from 1 ("1. foo  2. bar").
        - ``dict``: entries are rendered as "key: value".
        - ``None`` / float NaN (missing value): yields an empty string.
        - anything else: falls back to ``str(texts)``.

    Returns
    -------
    str
        The merged text with leading/trailing whitespace removed.
    """
    # Missing values used to fall through every branch and crash with
    # UnboundLocalError on `merged`; treat them as "no text".
    if texts is None or (isinstance(texts, float) and texts != texts):
        return ''

    if isinstance(texts, str):
        merged = texts
    elif isinstance(texts, (list, tuple)):
        # Each entry keeps its trailing space, so joined entries end up
        # separated by two spaces; kept as-is for output compatibility.
        merged = ' '.join([f"{i}. {text} " for i, text in enumerate(texts, start=1)])
    elif isinstance(texts, dict):
        merged = ' '.join([f"{key}: {text} " for key, text in texts.items()])
    else:
        # Unknown scalar (e.g. a number): use its string form rather than fail.
        merged = str(texts)
    return merged.strip()

# image url converter that selects one of image urls
def image_url_converter(image_urls: List, image_size: str = image_size):
    """Pick a single image URL for an item.

    Parameters
    ----------
    image_urls
        Sequence of image entries for one item. Each entry is indexed by a
        size key (presumably a mapping of size name -> URL; confirm against
        the loader).
    image_size
        Size key to look up in the first entry. NOTE: the default is bound to
        the module-level ``image_size`` at the moment this cell is executed,
        so changing that variable later requires re-running this cell.

    Returns
    -------
    The value stored under ``image_size`` in the first entry, or ``''`` when
    the item has no image entries.
    """
    if len(image_urls) > 0:
        # Only the first listed image is used.
        first_image = image_urls[0]
        return first_image[image_size]
    return ''

In [13]:
# convert text fields
# Step 1: flatten every raw text column to a plain string (overwrites the column).
for field in ('title', 'features', 'description'):
    item_df[field] = item_df[field].map(text_converter)

# Step 2: build the final `text` column, one "<Field>: <value>" line per
# source column, joined with newlines.
item_df['text'] = item_df.apply(
    lambda row: "\n".join(
        f"{column.title()}: {row[column]}"
        for column in ('title', 'features', 'description')
    ),
    axis=1,
)

In [14]:
# convert image urls
# Reduce each item's `image_urls` entry to a single value via
# image_url_converter (first image at the configured size, '' when there is
# none) and store it in a new `image_url` column.
item_df['image_url'] = item_df['image_urls'].map(image_url_converter)

In [15]:
# .inter: USER, ITEM, RATING, TIMESTAMP
# .item: ITEM, Text, Image URL
# Keep only the columns that are written to disk.
# NOTE: after this cell `item_df` no longer carries the raw title / features /
# description / image_urls columns, so the conversion cells above cannot be
# re-run without reloading the data first.
inter_df = inter_df[[tag.name for tag in (USER, ITEM, RATING, TIMESTAMP)]]
item_df = item_df[[ITEM.name, 'text', 'image_url']]
# Preview the first five rows of each trimmed frame (head() defaults to n=5).
(inter_df.head(), item_df.head())

(                           USER        ITEM  RATING      TIMESTAMP
 0  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  B00Z03RC80     1.0  1616743454733
 1  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  B085PRT2MP     1.0  1614915977684
 2  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  B08G81QQ9L     5.0  1612052493701
 3  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  B07YYG76X1     1.0  1609700981786
 4  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  B0761M33BX     3.0  1581313195358,
          ITEM                                               text  \
 0  B00Z03RC80  Title: Terra Tattoos Tropical Hawaiian Metalli...   
 1  B085PRT2MP  Title: Spray Bottle,Fine Mist Mini Clear 60ml/...   
 2  B08G81QQ9L  Title: Diamond Painting Animals DIY 5D Number ...   
 3  B07YYG76X1  Title: ATMOKO Electric Toothbrushes for Adults...   
 4  B0761M33BX  Title: Teamkio 18pcs Manicure Set Pedicure Nai...   
 
                                            image_url  
 0  https://m.media-amazon.com/images/I/61PUNdBS5A...  
 1  https://m.media-amazon.com/images/I/41613ggx3Y...  
 2  htt

In [16]:
# Save
# Write both frames as tab-separated files without the pandas index column.
inter_df.to_csv(path_or_buf=inter_file, index=False, sep='\t')
item_df.to_csv(path_or_buf=item_file, index=False, sep='\t')

After preprocessing the dataset, please use the following command to prepare the filtered and split dataset:

```bash
freerec make Amazon2023{dataset} --root ../data --filedir Amazon2023/{dataset} -ku 10 -ki 10 -sp 4 --ratios 8,1,1 --splitting ROU
```