In [1]:
import json
import matplotlib.pyplot as plt
import nltk
from tqdm import tqdm
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/yijingyang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/yijingyang/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
with open("Baby_Products.jsonl", "r") as f:
    dataset = [json.loads(line) for line in f]

with open("meta_Baby_Products.jsonl", "r") as f:
    meta = [json.loads(line) for line in f]

**Keep reviews that have at least 3 sentences.**

In [5]:
def keep_reviews_at_least_3_sentences(dataset):
    data = []
    for r in dataset:
        n_sent = len(nltk.sent_tokenize(r["text"]))
        if n_sent >= 3:
            data.append(r)
    return data
data = keep_reviews_at_least_3_sentences(dataset)
print("total reviews:", len(data))

total reviews: 2846294


In [6]:
def user_count(dataset):
    user_counts = {}
    for r in dataset:
        id = r["user_id"]
        if id not in user_counts:
            user_counts[id] = 1
        else:
            user_counts[id] += 1
    return user_counts

user_counts = user_count(dataset)

**Keep users that have the number of reviews between 5-10**

In [7]:
def keep_users_larger_between_5_10(dataset, user_counts):
    data = []
    for r in dataset:
        if user_counts[r["user_id"]] > 5 and user_counts[r["user_id"]] < 10:
            data.append(r)
    return data

data = keep_users_larger_between_5_10(dataset, user_counts)
print("total reviews:", len(data), "unique users:", len(user_count(data)))

total reviews: 614255 unique users: 87464


- organize the dataset by users : {user_id: , reviews: [review1, review2, ...],}
- order reviews each user by timestamp

In [8]:
def asin_product_pairs(metadata):
    meta = {}
    for p in metadata:
        asin = p["parent_asin"]
        meta[asin] = p["title"]
    return meta

meta = asin_product_pairs(meta)

In [9]:
def gen_user_review_item_data(dataset, metadata):
    data = []
    user_id_table = {}
    user_idx = 0
    for r in tqdm(dataset):
        title = r["title"]
        rating = r["rating"]
        review = r["text"]
        user_id = r["user_id"]
        parent_asin = r["parent_asin"]
        product_name = metadata[parent_asin]
        timestamp = r["timestamp"]
        
        review_info = {"product": product_name, "rating": rating, 
                       "title": title, "review": review}
        if user_id not in user_id_table:
            line = {"user_id": user_id, "reviews": {timestamp: review_info}}
            data.append(line)
            user_id_table[user_id] = user_idx
            user_idx += 1
        else:
            idx = user_id_table[user_id]
            data[idx]["reviews"][timestamp] = review_info
    return data
reviews = gen_user_review_item_data(data, meta)

100%|██████████| 614255/614255 [00:00<00:00, 1120272.88it/s]


**Order the reviews by timestamps**

In [10]:
def sort_reviews_by_timestamp(reviews):
    for r in reviews:
        r['reviews'] = [r['reviews'][time] for time in sorted(r['reviews'])]
    return reviews

reviews = sort_reviews_by_timestamp(reviews)        

In [11]:
reviews[5]

{'user_id': 'AFFFV4Y35FUSSYUMRHYV4P34X2MA',
 'reviews': [{'product': 'Cozy Cover Infant Car Seat Cover (Pink) - The Industry Leading Infant Carrier Cover Trusted by Over 6 Million Moms Worldwide for Keeping Your Baby Cozy & Warm',
   'rating': 5.0,
   'title': 'Easy to use',
   'review': 'Keeps my baby warm and out of the crap weather'},
  {'product': 'Munchkin Fresh Food Feeder, 2 Count (Pack of 1), Blue/Mint',
   'rating': 5.0,
   'title': 'Love it',
   'review': 'My son loves it can hold onto the feeder easily I can put whatever food into the mesh eater and he can safe chew! greet purchase!'},
  {'product': 'Humble Crew Toy Storage Organizer, Grey/Blue/Green/Yellow',
   'rating': 3.0,
   'title': 'Damaged',
   'review': 'I would give 5 stars if it wasn’t for wood having scratches all over'},
  {'product': 'Child Proof Cabinet Locks Child Safety Drawer Locks,Baby Proofing Child Locks for Cabinets Latch Oven Stove Toilet Cupboard with 3M Adhesive,12 Pack',
   'rating': 5.0,
   'title'

In [12]:
filename = "reviews.jsonl"
with open(filename, "w") as f:
    for r in reviews:
        json.dump(r, f)
        f.write("\n")

In [13]:
from datasets import load_dataset
r = load_dataset("json", data_files="reviews.jsonl")

Generating train split: 0 examples [00:00, ? examples/s]

In [14]:
r["train"][0]

{'user_id': 'AGVVUU3QRQBHNASSGI5YQLPYOI2Q',
 'reviews': [{'product': 'OXO Tot Sit Right Potty Seat, Green',
   'rating': 4.0,
   'title': 'takes up no room when storing',
   'review': 'seems ok and fits on toilet seat ok'},
  {'product': 'EZOWare 4 pc Cube Storage Basket Bins with Lid, Bamboo Fabric Linen Lidded Fabric Folding Boxes Cubes Containers - Beige, 9.8 x 9.8 x 9.8 inches',
   'rating': 5.0,
   'title': 'Very good looking',
   'review': 'These storage cubes are great. They have a more expensive look than most others.'},
  {'product': 'KidSwitch Light Switch Extender for Kids, Children, Toddlers - 3 Pack, Original Style - Award-Winning Child Safety Tool',
   'rating': 5.0,
   'title': 'Would reorder again and again as needed',
   'review': 'Installed upside down to reach light switch hidden was washer in utility room. works great'},
  {'product': 'Safety 1st Easy Install 28" Walk Thru Gate, Fits Between 29" and 38"',
   'rating': 5.0,
   'title': 'Best gate ever',
   'review': 