In [7]:
import gzip, json, os
import pandas as pd
from freeplot.utils import export_pickle

In [8]:
path = "../RecSets/Amazon2023/Office/"
inter_file = "Office_Products.jsonl.gz" # file of review data
meta_file = "meta_Office_Products.jsonl.gz" # file of meta data
dataset = "Amazon2023_Office"

inter_keys = ['user_id', 'asin', 'rating', 'timestamp', 'parent_asin']
inter_cols = ['user_id:token', 'item_id:token', 'rating:float', 'timestamp:float', 'parent_asin:token']
assert len(inter_keys) == len(inter_cols)

meta_keys = ['parent_asin', 'title', 'features', 'description', 'price', 'images', 'videos', 'details']
meta_cols = ['item_id:token', 'parent_asin:token', 'title:token', 'features:token', 'description:token', 'price:float', 'images:token', 'videos:token', 'details:token']
assert len(meta_keys) == len(meta_cols) - 1

In [9]:
def open_and_read(path, file_):
    data = []
    for line in gzip.open(os.path.join(path, file_)):
        data.append(json.loads(line.strip()))
    return data

In [10]:
raw_inter = open_and_read(path, inter_file)
raw_inter[0]

{'rating': 5.0,
 'title': 'Pretty & I love it!',
 'text': 'Lovely ink. Writes well. The right amount of wet/dry. Currently using it in both a platinum curidas (f) & a pelican twist (m) & it writes well in both.  Beautiful color & flows nicely. It has not clogged up my nibs.',
 'images': [],
 'asin': 'B01AHHL4X2',
 'parent_asin': 'B01MZ3SD2X',
 'user_id': 'AFKZENTNBQ7A7V7UXW5JJI6UGRYQ',
 'timestamp': 1677939345945,
 'helpful_vote': 0,
 'verified_purchase': True}

In [11]:
raw_meta = open_and_read(path, meta_file)
raw_meta[0]

{'main_category': 'Office Products',
 'title': 'Alliance Rubber 07706 Non-Latex Brites File Bands, Colored Elastic Bands, 1.5 oz Pic Pac Dispenser (Assorted Bright Colors and Sizes)',
 'average_rating': 4.5,
 'rating_number': 665,
 'features': ['REUSABLE: These colored rubber bands are stretchable and reusable meaning they can be repurposed for a wide range of different projects and jobs.',
  'EASY STRETCH BANDS: With their easy stretch design, these non-latex rubber bands are easy to manipulate to secure files and paperwork. This can help in reducing symptoms related to carpal tunnel syndrome from repetitive strain.',
  'THREE BRITE SHADES: This Pic Pac dispenser contains rubber bands in a variety of bright colors and sizes, making identification and filing quick and easy.',
  'FILE ORGANIZATION: File rubber bands are a great tool for businesses and the home office, helping you keep your paperwork, folders, files, and print-outs color-coded and organized.',
  'NON-LATEX RUBBER: These 

In [12]:

# make interaction dataframe 

inters = []

for interaction in raw_inter:
    inters.append(
        [interaction[key] for key in inter_keys]
    )

inter_df = pd.DataFrame(
    inters, columns=inter_cols
)

inter_df

Unnamed: 0,user_id:token,item_id:token,rating:float,timestamp:float,parent_asin:token
0,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,B01AHHL4X2,5.0,1677939345945,B01MZ3SD2X
1,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,B08L6H23JZ,4.0,1677939160682,B08L6H23JZ
2,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,B07JDZ5J46,1.0,1660188831933,B07JDZ5J46
3,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,B004MNX7EW,4.0,1659806066713,B07BR2PBJN
4,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,B019YLRFFS,3.0,1659799390978,B097SFY5ZS
...,...,...,...,...,...
12845707,AFD7M22ZYOV6HZ6GNVZZQUZH4R4A,B0030INLF0,1.0,1447080556000,B0030INLF0
12845708,AGYDA2QY4QVZB5TR6TUO2PVXWORQ,B07HT28KGD,3.0,1617820460972,B07HT28KGD
12845709,AHOKC2PQ4PQ3CPVLLMMIVTM5DJGQ,B0030INLF0,4.0,1339179148000,B0030INLF0
12845710,AFGGT5RI67G5TYXKVFOWBZ4DD2JA,B01GJGC2OK,1.0,1677560507495,B0C4FWH575


In [13]:

# make item dataframe

meta_data = dict()

for item in raw_meta:
    meta_data[item['parent_asin']] = item

items = []

pairs = set(tuple(pair) for pair in inter_df[['item_id:token', 'parent_asin:token']].values.tolist())
for (item_id, parent_id) in pairs:
    items.append(
        [
            item_id, 
            *[meta_data[parent_id][key] for key in meta_keys]
        ]
    )

item_df = pd.DataFrame(
    items,
    columns=meta_cols
)

item_df

Unnamed: 0,item_id:token,parent_asin:token,features:token,description:token,price:float,images:token,videos:token,details:token
0,B008EMXABM,B008EMXABM,"[Chrome plated metal keychain, Double-sided 3D...",[This beautiful keychain is chrome plated and ...,,[{'thumb': 'https://m.media-amazon.com/images/...,[],"{'Manufacturer': 'Nothing Specific, Inc.', 'Br..."
1,B0050BK7E8,B07C3D2CH8,"[Staples 160 - 220 sheets, Made from very stro...",[Rapid's Heavy Duty staples are made using ver...,9.87,[{'thumb': 'https://m.media-amazon.com/images/...,[],"{'Manufacturer': 'Esselte Corporation', 'Brand..."
2,B09225DH2W,B09223GK8M,"[PREMIUM MATERIALS: Creative design, elegant a...",[],10.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'KLIONGO Pen Holder', 'url': 'https...","{'Specific Uses For Product': 'Cloth,Pen Holde..."
3,B07FT76RW5,B07H3LTD3W,[Package includes 24 cards and 24 white envelo...,[Celebrate the season and stay in touch with c...,,[{'thumb': 'https://m.media-amazon.com/images/...,[],{'Product Dimensions': '5.13 x 1.75 x 9.5 inch...
4,B07FFVPDTJ,B07FFVPDTJ,[Included:1 Pack DR730 DR 730 DR-730 Drum Unit...,[],,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Aztech Compatible Drum Unit Replac...,"{'Manufacturer': 'Brother', 'Brand': 'Aztech',..."
...,...,...,...,...,...,...,...,...
906044,B07Z4TH84R,B07Z4TH84R,[★【 Strong Viscosity】- Through new NanoTechnol...,"[Feature of this nano magic tape:, 1. Nano-pu ...",,[{'thumb': 'https://m.media-amazon.com/images/...,[],"{'Manufacturer': 'Ddct', 'Brand': 'Ddct', 'Ite..."
906045,B0937FF515,B0937FF515,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'HASFINE Retractable Badge Holder R...,"{'Brand': 'BHROSE', 'Item Weight': '0.317 ounc..."
906046,B0773HZBGR,B077NDWJ7D,[Set contains 20 x Zebra Sarasa Clip 0.3 Retra...,[Zebra Sarasa Clip 0.3 Retractable Gel Ink Pen...,32.23,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Pentel EnerGel RTX Medium point pe...,"{'Manufacturer': 'Stationery JP', 'Brand': 'St..."
906047,B00D6BC7R4,B00D6BC7R4,"[Soft, black lizard-like cover with twin-wire ...","[Daily Planner, 11"" x 8-1/2"". Soft, black liza...",,[{'thumb': 'https://m.media-amazon.com/images/...,[],"{'Manufacturer': 'REDIFORM OFFICE PRODUCTS', '..."


In [14]:

inter_df.to_csv(os.path.join(path, f"{dataset}.inter"), index=False, sep='\t')
item_df.to_csv(os.path.join(path, f"{dataset}.item"), index=False, sep='\t')