In [2]:
import json
import matplotlib.pyplot as plt
import nltk
from tqdm import tqdm
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/yijingyang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/yijingyang/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [24]:
with open("Baby_Products.jsonl", "r") as f:
    dataset = [json.loads(line) for line in f]

with open("meta_Baby_Products.jsonl", "r") as f:
    meta = [json.loads(line) for line in f]

In [25]:
def user_count(dataset):
    user_counts = {}
    for r in dataset:
        id = r["user_id"]
        if id not in user_counts:
            user_counts[id] = 1
        else:
            user_counts[id] += 1
    return user_counts

user_counts = user_count(dataset)

**Keep users that have the number of reviews between 5-10**

In [26]:
def keep_users_larger_between_5_10(dataset, user_counts):
    data = []
    for r in dataset:
        if user_counts[r["user_id"]] > 5 and user_counts[r["user_id"]] < 10:
            data.append(r)
    return data

data = keep_users_larger_between_5_10(dataset, user_counts)
print("total reviews:", len(data), "unique users:", len(user_count(data)))

total reviews: 614255 unique users: 87464


**Keep reviews that have at least 2 sentences.**

In [27]:
def keep_reviews_at_least_3_sentences(dataset):
    data = []
    for r in dataset:
        n_sent = len(nltk.sent_tokenize(r["text"]))
        if n_sent >= 3:
            data.append(r)
    return data
data = keep_reviews_at_least_3_sentences(data)
print("total reviews:", len(data), "unique users:", len(user_count(data)))

total reviews: 319430 unique users: 75493


- organize the dataset by users : {user_id: , reviews: [review1, review2, ...],}
- order reviews each user by timestamp

In [28]:
def asin_product_pairs(metadata):
    meta = {}
    for p in metadata:
        asin = p["parent_asin"]
        meta[asin] = p["title"]
    return meta

meta = asin_product_pairs(meta)

In [29]:
def gen_user_review_item_data(dataset, metadata):
    data = []
    user_id_table = {}
    user_idx = 0
    for r in tqdm(dataset):
        title = r["title"]
        rating = r["rating"]
        review = r["text"]
        user_id = r["user_id"]
        parent_asin = r["parent_asin"]
        product_name = metadata[parent_asin]
        timestamp = r["timestamp"]
        
        review_info = {"product": product_name, "rating": rating, 
                       "title": title, "review": review}
        if user_id not in user_id_table:
            line = {"user_id": user_id, "reviews": {timestamp: review_info}}
            data.append(line)
            user_id_table[user_id] = user_idx
            user_idx += 1
        else:
            idx = user_id_table[user_id]
            data[idx]["reviews"][timestamp] = review_info
    return data
reviews = gen_user_review_item_data(data, meta)

100%|██████████| 319430/319430 [00:00<00:00, 969663.88it/s]


In [32]:
reviews[55]

{'user_id': 'AF2NJDADH6SYHXVZEIUBNH3KC4DA',
 'reviews': {1662506618860: {'product': 'Graco TurboBooster Backless Booster Car Seat, Dinorama',
   'rating': 4.0,
   'title': 'Cute and Comfy',
   'review': 'I like the look and feel of this. My grandson loves it too. It comes with cup holders on each side that can be pushed in. The arm rest cover on the left keeps coming off though very easily. For traveling, I think it would be too bulky though.'},
  1597897208144: {'product': 'Baby Wipes, Pampers Aqua Pure Sensitive Water Baby Diaper Wipes, Hypoallergenic and Unscented, 56 Count (Pack of 12)',
   'rating': 4.0,
   'title': 'Valuable and Sensitive',
   'review': 'These were a gift for my grandson. He has sensitive skin and he could use these. Great value also.'},
  1597538493761: {'product': 'Smart Design Kids Pop Up Organizer with Animal Print - VentilAir Mesh Netting - for Toddlers, Baby Clothes, Plushies, and Toys - Home Organization - Cube - 10.5 x 11 Inch - Orange Tiger',
   'rating'

In [44]:
data[:20]

[{'rating': 5.0,
  'title': 'Perfect for my car seat stroller.',
  'text': 'I got a carseat stroller since I use lyft/Uber a lot and travel for work with my baby. Even though the car seat stroller is amazing it lacks the extra space compartments. And I suffer from back pain. So wearing a diaper backpack eventually weighs down on you.<br />This bag hooks up and fits perfectly in the back of the car seat stroller.<br /><br />The weight capacity is about 6.6lbs.<br />So it’s perfect for the minimalist moms, that pack the essentials, like diapers, snacks, maybe a change of clothes.<br /><br />Just remember to unhook it before collapsing the stroller into a car seat. It can get stuck and hard to get it back to the stroller position.',
  'images': [{'small_image_url': 'https://images-na.ssl-images-amazon.com/images/I/71IAuqQmjEL._SL256_.jpg',
    'medium_image_url': 'https://images-na.ssl-images-amazon.com/images/I/71IAuqQmjEL._SL800_.jpg',
    'large_image_url': 'https://images-na.ssl-image