# **Mounting Google Drive to Collab**

In [2]:
from google.colab import drive
drive.mount('/content/drive')
csv_folder_path = '/content/drive/My Drive/Data_Mining_CSV'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Part 1**

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (1

Requirements

In [4]:
from itertools import islice
import sys
import pandas as pd
import json
import re
from datasets import load_dataset

print('Python version ' + sys.version)
print('Pandas version ' + pd.__version__)

Python version 3.11.12 (main, Apr  9 2025, 08:55:54) [GCC 11.4.0]
Pandas version 2.2.2


In [5]:
# ------------------- Cleaning Functions -------------------

def normalize_price(price):
    if isinstance(price, str):
        match = re.search(r'[\d,.]+', price)
        if match:
            return float(match.group(0).replace(',', ''))
    return None

def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-z0-9\s]', '', text)
        return text.strip()
    return ''

def flatten_image_struct(example):
    images_raw = example.get('images', '[]')
    try:
        images = json.loads(images_raw) if isinstance(images_raw, str) else images_raw
    except json.JSONDecodeError:
        images = []
    example['image_urls'] = [img.get('hi_res', '') for img in images if isinstance(img, dict)]
    return example

# Home and Kitchen

In [6]:
# ------------------- Load and Sample Data -------------------

Home_and_Kitchen_review_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Home_and_Kitchen", trust_remote_code=True, split="full", streaming=True)
Home_and_Kitchen_meta_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Home_and_Kitchen", trust_remote_code=True, split="full", streaming=True)

# Get set amount of rows
hak_rows_to_get = 20000
hak_review_sample = list(islice(Home_and_Kitchen_review_ds, hak_rows_to_get))
Home_and_Kitchen_meta_ds = Home_and_Kitchen_meta_ds.map(flatten_image_struct)
hak_meta_sample = list(islice(Home_and_Kitchen_meta_ds, hak_rows_to_get))
pd.set_option('display.max_rows', hak_rows_to_get)
pd.set_option('display.max_colwidth', None)

# # Get all rows
# hak_review_sample = list(Home_and_Kitchen_review_ds)
# Home_and_Kitchen_meta_ds = Home_and_Kitchen_meta_ds.map(flatten_image_struct)
# hak_meta_sample = list(Home_and_Kitchen_meta_ds)

hak_reviews_df = pd.DataFrame(hak_review_sample)
hak_meta_df = pd.DataFrame(hak_meta_sample)

# ------------------- Clean Meta Dataset -------------------

clean_hak_meta_df = hak_meta_df.copy()

meta_cols_to_keep = [
    'parent_asin', 'main_category', 'product_title', 'average_rating', 'rating_number',
    'description', 'price', 'store', 'details'
]
clean_hak_meta_df = clean_hak_meta_df.rename(columns={'title': 'product_title'})
clean_hak_meta_df = clean_hak_meta_df.loc[:, meta_cols_to_keep]

# Drop incomplete entries
clean_hak_meta_df = clean_hak_meta_df.dropna(subset=['product_title', 'main_category', 'price']).copy()

# Clean 'description'
clean_hak_meta_df.loc[:, 'description'] = clean_hak_meta_df['description'].apply(
    lambda desc: clean_text(' '.join(desc)) if isinstance(desc, list)
    else clean_text(desc) if isinstance(desc, str)
    else 'No description available'
)

# Clean other fields
clean_hak_meta_df.loc[:, 'details'] = clean_hak_meta_df['details'].fillna('').astype(str).apply(clean_text)
clean_hak_meta_df.loc[:, 'average_rating'] = clean_hak_meta_df['average_rating'].fillna(0).astype(float)
clean_hak_meta_df.loc[:, 'rating_number'] = clean_hak_meta_df['rating_number'].fillna(0).astype(int)
clean_hak_meta_df.loc[:, 'price'] = clean_hak_meta_df['price'].apply(normalize_price)
clean_hak_meta_df = clean_hak_meta_df.dropna(subset=['price']).copy()
clean_hak_meta_df.loc[:, 'store'] = clean_hak_meta_df['store'].fillna('Unknown')
clean_hak_meta_df.loc[:, 'product_title'] = clean_hak_meta_df['product_title'].apply(clean_text)
clean_hak_meta_df.loc[:, 'parent_asin'] = clean_hak_meta_df['parent_asin'].astype(str)

clean_hak_meta_df.reset_index(drop=True, inplace=True)

# ------------------- Clean Reviews Dataset -------------------

clean_hak_reviews_df = hak_reviews_df.copy()

if 'images' in clean_hak_reviews_df.columns:
    clean_hak_reviews_df = clean_hak_reviews_df.drop(columns=['images'])

clean_hak_reviews_df = clean_hak_reviews_df.dropna(subset=['rating', 'text', 'asin', 'parent_asin']).copy()
clean_hak_reviews_df = clean_hak_reviews_df[clean_hak_reviews_df['verified_purchase'] == True].copy()

clean_hak_reviews_df.loc[:, 'title'] = clean_hak_reviews_df['title'].fillna('').apply(clean_text)
clean_hak_reviews_df.loc[:, 'text'] = clean_hak_reviews_df['text'].apply(clean_text)
clean_hak_reviews_df.loc[:, 'helpful_vote'] = clean_hak_reviews_df['helpful_vote'].fillna(0).astype(int)

# Convert timestamp from milliseconds to seconds (remove milliseconds)
clean_hak_reviews_df.loc[:, 'timestamp'] = (clean_hak_reviews_df['timestamp'].astype('int64') // 1000)
converted_timestamps = pd.to_datetime(clean_hak_reviews_df['timestamp'], unit='s', errors='coerce')
clean_hak_reviews_df = clean_hak_reviews_df.drop(columns=['timestamp'])
clean_hak_reviews_df['timestamp'] = converted_timestamps

# Split timestamp into date and time
clean_hak_reviews_df.loc[:, 'date'] = clean_hak_reviews_df['timestamp'].dt.date
clean_hak_reviews_df.loc[:, 'time'] = clean_hak_reviews_df['timestamp'].dt.strftime('%H:%M:%S')

# Drop original timestamp column
clean_hak_reviews_df.drop(columns=['timestamp'], inplace=True)

clean_hak_reviews_df.loc[:, 'parent_asin'] = clean_hak_reviews_df['parent_asin'].astype(str)
clean_hak_reviews_df.rename(columns={'title': 'review_title'}, inplace=True)
clean_hak_reviews_df.reset_index(drop=True, inplace=True)

# ------------------- Merge Datasets -------------------

clean_hak_merged_df = pd.merge(clean_hak_reviews_df, clean_hak_meta_df, on='parent_asin', how='left')
clean_hak_merged_df = clean_hak_merged_df.dropna().reset_index(drop=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

Amazon-Reviews-2023.py:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Save to CSV

In [7]:
# Create the CSV file
clean_hak_merged_df.to_csv(f"{csv_folder_path}/Home_and_Kitchen.csv", index=False)

Load from CSV

In [8]:
# Load Gift Cards CSV file
home_and_kitchen = pd.read_csv(f"{csv_folder_path}/Home_and_Kitchen.csv")

# Show file
home_and_kitchen.head()

Unnamed: 0,rating,review_title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,date,time,main_category,product_title,average_rating,rating_number,description,price,store,details
0,5.0,five stars,bought this for entertaining its a fun piece its smaller than i expected though,B01M5AH9PQ,B078NWLG1P,AGXVBIUFLFGMVLATYXHJYL4A5Q7Q,0,True,2018-07-02,18:38:00,Amazon Home,coffeezone small candy nuts decoration ideas small porcelain dishes in letter shape wedding decoration letter t,4.3,1298.0,,8.99,Coffeezone,brand coffeezone color white special feature durability microwave safe occasion wedding seasons all seasons product dimensions 571 x 433 x 079 inches number of pieces 1 size letter t item weight 53 ounces is microwaveable yes item firmness description hard is dishwasher safe yes pattern letter shape manufacturer coffeezone best sellers rank kitchen dining 2160 candy servers 1 is discontinued by manufacturer no date first available december 27 2017
1,5.0,adorable,these are so sweet i do wish the stopper part was a little longer in length but they work great,B01HBWGU80,B01DR2ACA0,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,2019-07-23,04:29:16,Amazon Home,little bird wine bottle stopper silicone stoppers reusable leak proof cute fun decorative multipack assorted color set of 6,4.8,1187.0,,9.49,LouisChoice,color assorted color material silicone brand louischoice shape bird item dimensions lxwxh 5 x 35 x 2 inches is dishwasher safe no product dimensions 5 x 35 x 2 inches item weight 352 ounces manufacturer louischoice item model number lclb6 best sellers rank kitchen dining 64236 wine stoppers 176 is discontinued by manufacturer no date first available april 2 2016
2,4.0,bed is very comfortable for a marshmallow,when i ordered this mattress i thought it was a medium firm turns out its a medium plush be careful reading the description of the mattress we needed a medium firm bed due to health issues easier to get out of a firm bed than a marshmallow even sitting on the edge of the bed was a nightmare unless you planted your feet firmly on the floor you were going to slide off the mattress plush is nice for snuggly sleepers we are not this was bed 2 in a list of 4 beds we tried out before we finally got it right shopping online is nothing new but you cant sit on the bed and feel the firmness or plush featured on your mattress in the same breath getting out right now under the virus threat is very scary key words for mattress purchase soft plush medium plush or medium firm trial and error even online,B07BV2PTJQ,B098DXB21S,AFE337D2J37YRU5U6MVTVKNDKWDA,0,True,2020-04-03,18:23:56,Amazon Home,classic brands cool gel chill memory foam 14inch mattress with pillow certipurus certified bedinabox twin,4.5,29979.0,experience cloudlike luxury with our classic brands cool gel ultimate gel memory foam 14 inch mattress this mattress offers a plush and incredibly comfortable sleeping surface starting with a whopping 35inch top layer which consists of breathable ventilated cool gel memory foam that conforms to your shape and has a faster recovery factor than conventional memory foam a second layer of 2inch memory foam works with the top layer for comfort and support providing you with the ultimate sleep experience the 85inch base layer maintains the strength and effectiveness of the layers know the size and height of your mattress ahead of purchase to make sure you are ordering the correct size keep in mind that you should combine the height of your bed frame foundation or adjustable bed base with the height of your mattress to get an approximate idea our advanced packaging technology means we can rollpack our mattresses and ship them at onethird of the normal size a mattress that is rollpacked still retains its initial quality as each will quickly recover and return to its original shape when opened look for more innovative specialty sleep products from classic brands our mattresses are certipurus certified certified flexible polyurethane foams have been independently laboratory tested more resilient foam to prevent tossing and turning,472.77,Classic Brands,product dimensions 75l x 39w x 14th item weight 59 pounds manufacturer classic brands country of origin china item model number 4101688010 best sellers rank home kitchen 19827 mattresses 96 thickness 14 inches refill gel memory foam assembly required no number of pieces 1 batteries required no included components new 2018 cool gel ultimate gel memory foam 14inch mattress with bonus pillow twin size twin item firmness description plush fill material gel memory foam brand classic brands top style tight top color white age range description adult model name cool gel 20 chill construction type gel foam
3,5.0,great sheets,i have these sheets in two colors they are great very soft hold up well after washing this set is a very dark navy almost black i wash them on their own in part because they are so dark and in part because king size sheets are big im really glad i got two colors,B07XVD3CK5,B082P23BMP,AGBFYI2DDIKXC5Y4FARTYDTQBMFQ,0,True,2020-01-08,23:57:31,Amazon Home,400 thread count cotton cal king fitted sheets pastel pink 1pc 100 long staple pure cotton smooth sateen fitted sheet fit upto 15 inch deep pockets black 100 cotton fitted sheet cal king,4.5,14104.0,enjoy a luxurious experience with the pizuna 400 thread count fitted sheet you get a pleasant feel and your skin breathes naturally sleeping on the deep pocket queen sheets keeping you cool during summers and warm in winters it is breathable and sweatabsorbent the 100 long staple cotton is cozy to let you enjoy a goodnights sleep feel your skin breathe naturally with the smooth and gentle feel of the sateen weave the fitted sheet queen has 360 elastic to stay firm to the mattress even though you turn and toss a lot furthermore the patentedstitching and full deep pockets effortlessly fit up to 15inch deep mattresses there is a smart side tag so you put on the fitted bed sheets without any inconvenience with an excellent range of wide variety of colors from pizuna you can effortlessly stylize your bedroom in a way you have always wanted give your bedroom dcor the plush feel and look with the special sateen weave the 4inch hem gives an elegant touch to the queen sheets deep pocket to enhance your sleeping space our fitted bed sheets are meticulously made and designed sustainably and responsibly to give you an indulgent sleeping experience every day the 400 thread count fitted sheet queen is individually tailored measured and quality checked piece by piece each piece is inspected before packaged from our bsci compliant factory,39.99,Pizuna,best sellers rank home kitchen 2990 flat bed sheets 6 sheet pillowcase sets 52 date first available february 28 2023 color black size california king pattern solid brand pizuna
4,5.0,five stars,love the darl gray color fit is great on a tall mattress well made,B019J3R8E4,B019J27MFU,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,0,True,2018-06-15,00:50:44,Amazon Home,fitted bottom sheet full size premium 1800 microfiber ultrasoft hotel luxury deep pocket durable and long lasting full fitted sheet full sand,4.6,46121.0,,14.99,Bare Home,color 07 sand material microfiber size full fabric type 100 hydrobrushed microfiber yarns pattern solid product dimensions 115 x 8 x 25 inches brand bare home thread count 1800 care instructions machine wash cold tumble dry low number of pieces 1 item weight 13 pounds manufacturer bare home item model number 643665961745 best sellers rank home kitchen 672 fitted bed sheets 3 is discontinued by manufacturer no batteries required no included components 1 fitted sheet


# Automotive

In [9]:
# ------------------- Load and Sample Data -------------------

Automotive_review_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Automotive", trust_remote_code=True, split="full", streaming=True)
Automotive_meta_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Automotive", trust_remote_code=True, split="full", streaming=True)

# Get set amount of rows
a_rows_to_get = 20000
a_review_sample = list(islice(Automotive_review_ds, a_rows_to_get))
Automotive_meta_ds = Automotive_meta_ds.map(flatten_image_struct)
a_meta_sample = list(islice(Automotive_meta_ds, a_rows_to_get))
pd.set_option('display.max_rows', a_rows_to_get)
pd.set_option('display.max_colwidth', None)

# # Get all rows
# a_review_sample = list(Automotive_review_ds)
# Automotive_meta_ds = Automotive_meta_ds.map(flatten_image_struct)
# a_meta_sample = list(Automotive_meta_ds)

a_reviews_df = pd.DataFrame(a_review_sample)
a_meta_df = pd.DataFrame(a_meta_sample)

# ------------------- Clean Meta Dataset -------------------

clean_a_meta_df = a_meta_df.copy()

meta_cols_to_keep = [
    'parent_asin', 'main_category', 'product_title', 'average_rating', 'rating_number',
    'description', 'price', 'store', 'details'
]
clean_a_meta_df = clean_a_meta_df.rename(columns={'title': 'product_title'})
clean_a_meta_df = clean_a_meta_df.loc[:, meta_cols_to_keep]

# Drop incomplete entries
clean_a_meta_df = clean_a_meta_df.dropna(subset=['product_title', 'main_category', 'price']).copy()

# Clean 'description'
clean_a_meta_df.loc[:, 'description'] = clean_a_meta_df['description'].apply(
    lambda desc: clean_text(' '.join(desc)) if isinstance(desc, list)
    else clean_text(desc) if isinstance(desc, str)
    else 'No description available'
)

# Clean other fields
clean_a_meta_df.loc[:, 'details'] = clean_a_meta_df['details'].fillna('').astype(str).apply(clean_text)
clean_a_meta_df.loc[:, 'average_rating'] = clean_a_meta_df['average_rating'].fillna(0).astype(float)
clean_a_meta_df.loc[:, 'rating_number'] = clean_a_meta_df['rating_number'].fillna(0).astype(int)
clean_a_meta_df.loc[:, 'price'] = clean_a_meta_df['price'].apply(normalize_price)
clean_a_meta_df = clean_a_meta_df.dropna(subset=['price']).copy()
clean_a_meta_df.loc[:, 'store'] = clean_a_meta_df['store'].fillna('Unknown')
clean_a_meta_df.loc[:, 'product_title'] = clean_a_meta_df['product_title'].apply(clean_text)
clean_a_meta_df.loc[:, 'parent_asin'] = clean_a_meta_df['parent_asin'].astype(str)

clean_a_meta_df.reset_index(drop=True, inplace=True)

# ------------------- Clean Reviews Dataset -------------------

clean_a_reviews_df = a_reviews_df.copy()

if 'images' in clean_a_reviews_df.columns:
    clean_a_reviews_df = clean_a_reviews_df.drop(columns=['images'])

clean_a_reviews_df = clean_a_reviews_df.dropna(subset=['rating', 'text', 'asin', 'parent_asin']).copy()
clean_a_reviews_df = clean_a_reviews_df[clean_a_reviews_df['verified_purchase'] == True].copy()

clean_a_reviews_df.loc[:, 'title'] = clean_a_reviews_df['title'].fillna('').apply(clean_text)
clean_a_reviews_df.loc[:, 'text'] = clean_a_reviews_df['text'].apply(clean_text)
clean_a_reviews_df.loc[:, 'helpful_vote'] = clean_a_reviews_df['helpful_vote'].fillna(0).astype(int)

# Convert timestamp from milliseconds to seconds (remove milliseconds)
clean_a_reviews_df.loc[:, 'timestamp'] = (clean_a_reviews_df['timestamp'].astype('int64') // 1000)
converted_timestamps = pd.to_datetime(clean_a_reviews_df['timestamp'], unit='s', errors='coerce')
clean_a_reviews_df = clean_a_reviews_df.drop(columns=['timestamp'])
clean_a_reviews_df['timestamp'] = converted_timestamps

# Split timestamp into date and time
clean_a_reviews_df.loc[:, 'date'] = clean_a_reviews_df['timestamp'].dt.date
clean_a_reviews_df.loc[:, 'time'] = clean_a_reviews_df['timestamp'].dt.strftime('%H:%M:%S')

# Drop original timestamp column
clean_a_reviews_df.drop(columns=['timestamp'], inplace=True)

clean_a_reviews_df.loc[:, 'parent_asin'] = clean_a_reviews_df['parent_asin'].astype(str)
clean_a_reviews_df.rename(columns={'title': 'review_title'}, inplace=True)
clean_a_reviews_df.reset_index(drop=True, inplace=True)

# ------------------- Merge Datasets -------------------

clean_a_merged_df = pd.merge(clean_a_reviews_df, clean_a_meta_df, on='parent_asin', how='left')
clean_a_merged_df = clean_a_merged_df.dropna().reset_index(drop=True)

Save to CSV

In [10]:
# Create the CSV file
clean_a_merged_df.to_csv(f"{csv_folder_path}/Automotive.csv", index=False)

Load from CSV

In [11]:
# Load Gift Cards CSV file
automotive_df = pd.read_csv(f"{csv_folder_path}/Automotive.csv")

# Show file
automotive_df.head()

Unnamed: 0,rating,review_title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,date,time,main_category,product_title,average_rating,rating_number,description,price,store,details
0,5.0,perfect,we love them,B08GFQNZNF,B0C2YXN878,AHX4XWVVQUKT3FCNWCVASDF4Q56Q,0,True,2022-02-27,23:19:54,Automotive,sukemichi roll bar paracord grab handles grip handles for jeep wrangler jk tj jl cj yj 19872023 jt utv atv flaggreen,4.7,590.0,,19.99,Sukemichi,material oxford cloth brand sukemichi color green style grab handles handle type pull handle specific uses for product door shape oblong included components umbrella rope handles and grab handles finish type polished package type standard packaging unit count 2 count manufacturer maite part number bzlsflag item weight 88 ounces product dimensions 1063 x 512 x 236 inches item model number for jeep 19812020 tj jk jl jt atv utv size 2 piece finish polished item package quantity 2 number of handles 2 batteries included no batteries required no best sellers rank automotive 31279 automotive grab handles 59 date first available june 19 2020
1,5.0,soft utility brush,i purchased this to do a soft scrub outside project and it worked perfectly the bristles are soft and held the soap and water well the bristles are bright green as shown and the handle is plastic it feels like it will last a long time i never hesitate to update my reviews should new info seem useful,B00MI59W90,B00MI59W90,AFZUK3MTBIBEDQOPAK3OATUOUKLA,1,True,2021-08-05,17:27:24,Amazon Home,brush soft utility scrub green short,4.6,879.0,brush soft utility scrub green short soft utility scrub brush green short bristles are stapleset into a sturdy block flaggedtip bristles are resistant to acids and detergents polystyrene bristles designed to hold water and cleaning solution while safely washing away dirt and grime from any surface on your car warning this product can expose you to chemicals which is are known to the state of california to cause cancer and birth defects or other reproductive harm,11.49,S.M. Arnold,brand sm arnold color green handle material wood product dimensions 56l x 27w x 89h number of items 1 item weight 8 ounces manufacturer sm arnold item model number 85608 best sellers rank automotive 5761 cleaning brushes dusters 75 is discontinued by manufacturer no batteries required no
2,5.0,five stars,used the last two years and they work great,B002KQ86EY,B01MT7ZO0S,AEDRFOIY6IFY7DWSPRRNGEFIFE2Q,0,True,2016-04-15,12:44:41,Automotive,security chain traction chain qg20030,4.4,2197.0,security chain traction chain qg20030,16.23,Security Chain,manufacturer security chain brand security chain item weight 03 pounds product dimensions 098 x 098 x 098 inches item model number qg20030 is discontinued by manufacturer no material type rubber number of items 1 size small manufacturer part number qg20030 best sellers rank automotive 36290 commercial truck snow chains 1 passenger car snow chains 13 date first available august 5 2009 material rubber vehicle service type forklift
3,5.0,very nice and excellent quality,great way to organize your sockets,B01MEHAGC2,B01G31X5H4,AGEP2CC2QIJU5IHQZXCC7BE5JSDQ,0,True,2018-01-03,01:53:51,Automotive,olsa tools 12inch drive aluminum socket organizer premium quality socket holder orange,4.7,8481.0,,13.87,Olsa Tools,brand olsa tools item dimensions lxwxh 17 x 1 x 081 inches size 12inch drive material aluminum drive system square socket finish type polished number of pieces 1 item length 17 inches manufacturer olsa tools model olsa tools aluminum socket holder rail item weight 58 ounces product dimensions 17 x 1 x 081 inches country of origin taiwan item model number 1109 is discontinued by manufacturer no manufacturer part number alsr oem part number 1109 best sellers rank automotive 1837 tool trays 3 date first available march 19 2018
4,5.0,maxtray cargo liner,max liner maxtray cargo liner fits perfectly and i like the quality,B0051UGQ1G,B0051UGQ1G,AHLQJVWRY6DXKJNTAP4UFXK5WLAQ,0,True,2016-04-04,14:54:30,Automotive,maxliner all weather cargo liner floor mat black for 20072014 ford edge 20112015 lincoln mkx,4.2,55.0,the maxliner cargo liner prevents stains or mess of any kind from harming your cargo area with this cargo liner you can keep the carpet of your cargo area immaculately spic and span while retaining the level of its functionality made with lowdensity material this heavyduty liner also includes a raised lip and molded outer edge to shield your carpet from dirt and grime oil spills paint supplies and other corrosive messes that wear out carpet its nonslip textured surface characterized by intricate ridged patterns is especially designed to keep the liner from slipping the maxliner cargo liner comes in maximum size for extra protection and offers not just mere protection but also attractiveness to your cargo areareasons to choose maxliner far more affordable compared with other brands all weather protection against snow mud sand salt water and more spill saver lip prevents spills from reaching carpet easy to clean and maintain detachable from vehicleall maxlinerusa products are meticulously engineered to fit each car exactly right,92.69,MAX LINER,manufacturer maxliner usa brand max liner model d0047 item weight 598 pounds product dimensions 9 x 48 x 9 inches item model number d0047 exterior smooth manufacturer part number d0047 oem part number d0047 position rear best sellers rank automotive 1092610 cargo liners 3411 date first available may 20 2011 color black vehicle service type truck auto part position rear fit type vehicle specific fit


# Electronics

In [12]:
# ------------------- Load and Sample Data -------------------

Electronics_review_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Electronics", trust_remote_code=True, split="full", streaming=True)
Electronics_meta_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Electronics", trust_remote_code=True, split="full", streaming=True)

# Get set amount of rows
e_rows_to_get = 20000
e_review_sample = list(islice(Electronics_review_ds, e_rows_to_get))
Electronics_meta_ds = Electronics_meta_ds.map(flatten_image_struct)
e_meta_sample = list(islice(Electronics_meta_ds, e_rows_to_get))
pd.set_option('display.max_rows', e_rows_to_get)
pd.set_option('display.max_colwidth', None)

# # Get all rows
# e_review_sample = list(Electronics_review_ds)
# Electronics_meta_ds = Electronics_meta_ds.map(flatten_image_struct)
# e_meta_sample = list(Electronics_meta_ds)

e_reviews_df = pd.DataFrame(e_review_sample)
e_meta_df = pd.DataFrame(e_meta_sample)

# ------------------- Clean Meta Dataset -------------------

clean_e_meta_df = e_meta_df.copy()

meta_cols_to_keep = [
    'parent_asin', 'main_category', 'product_title', 'average_rating', 'rating_number',
    'description', 'price', 'store', 'details'
]
clean_e_meta_df = clean_e_meta_df.rename(columns={'title': 'product_title'})
clean_e_meta_df = clean_e_meta_df.loc[:, meta_cols_to_keep]

# Drop incomplete entries
clean_e_meta_df = clean_e_meta_df.dropna(subset=['product_title', 'main_category', 'price']).copy()

# Clean 'description'
clean_e_meta_df.loc[:, 'description'] = clean_e_meta_df['description'].apply(
    lambda desc: clean_text(' '.join(desc)) if isinstance(desc, list)
    else clean_text(desc) if isinstance(desc, str)
    else 'No description available'
)

# Clean other fields
clean_e_meta_df.loc[:, 'details'] = clean_e_meta_df['details'].fillna('').astype(str).apply(clean_text)
clean_e_meta_df.loc[:, 'average_rating'] = clean_e_meta_df['average_rating'].fillna(0).astype(float)
clean_e_meta_df.loc[:, 'rating_number'] = clean_e_meta_df['rating_number'].fillna(0).astype(int)
clean_e_meta_df.loc[:, 'price'] = clean_e_meta_df['price'].apply(normalize_price)
clean_e_meta_df = clean_e_meta_df.dropna(subset=['price']).copy()
clean_e_meta_df.loc[:, 'store'] = clean_e_meta_df['store'].fillna('Unknown')
clean_e_meta_df.loc[:, 'product_title'] = clean_e_meta_df['product_title'].apply(clean_text)
clean_e_meta_df.loc[:, 'parent_asin'] = clean_e_meta_df['parent_asin'].astype(str)

clean_e_meta_df.reset_index(drop=True, inplace=True)

# ------------------- Clean Reviews Dataset -------------------

clean_e_reviews_df = e_reviews_df.copy()

if 'images' in clean_e_reviews_df.columns:
    clean_e_reviews_df = clean_e_reviews_df.drop(columns=['images'])

clean_e_reviews_df = clean_e_reviews_df.dropna(subset=['rating', 'text', 'asin', 'parent_asin']).copy()
clean_e_reviews_df = clean_e_reviews_df[clean_e_reviews_df['verified_purchase'] == True].copy()

clean_e_reviews_df.loc[:, 'title'] = clean_e_reviews_df['title'].fillna('').apply(clean_text)
clean_e_reviews_df.loc[:, 'text'] = clean_e_reviews_df['text'].apply(clean_text)
clean_e_reviews_df.loc[:, 'helpful_vote'] = clean_e_reviews_df['helpful_vote'].fillna(0).astype(int)

# Convert timestamp from milliseconds to seconds (remove milliseconds)
clean_e_reviews_df.loc[:, 'timestamp'] = (clean_e_reviews_df['timestamp'].astype('int64') // 1000)
converted_timestamps = pd.to_datetime(clean_e_reviews_df['timestamp'], unit='s', errors='coerce')
clean_e_reviews_df = clean_e_reviews_df.drop(columns=['timestamp'])
clean_e_reviews_df['timestamp'] = converted_timestamps

# Split timestamp into date and time
clean_e_reviews_df.loc[:, 'date'] = clean_e_reviews_df['timestamp'].dt.date
clean_e_reviews_df.loc[:, 'time'] = clean_e_reviews_df['timestamp'].dt.strftime('%H:%M:%S')

# Drop original timestamp column
clean_e_reviews_df.drop(columns=['timestamp'], inplace=True)

clean_e_reviews_df.loc[:, 'parent_asin'] = clean_e_reviews_df['parent_asin'].astype(str)
clean_e_reviews_df.rename(columns={'title': 'review_title'}, inplace=True)
clean_e_reviews_df.reset_index(drop=True, inplace=True)

# ------------------- Merge Datasets -------------------

clean_e_merged_df = pd.merge(clean_e_reviews_df, clean_e_meta_df, on='parent_asin', how='left')
clean_e_merged_df = clean_e_merged_df.dropna().reset_index(drop=True)

Save to CSV

In [13]:
# Create the CSV file
clean_e_merged_df.to_csv(f"{csv_folder_path}/Electronics.csv", index=False)

Load from CSV

In [14]:
# Load Gift Cards CSV file
electronics_df = pd.read_csv(f"{csv_folder_path}/Electronics.csv")

# Show file
electronics_df.head()

Unnamed: 0,rating,review_title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,date,time,main_category,product_title,average_rating,rating_number,description,price,store,details
0,1.0,junk,to me this was a waste of money not useful on a plane or really anywhere too short once bent leaving no surface space so phone wants to fall off dont buy it save your money,B072MBPWMV,B07VV6TT69,AFEAJQIVVIZCTKQNNHD73H7VQSPQ,0,True,2020-04-29,14:46:45,Cell Phones & Accessories,flight flap phone tablet holder designed for air travel flying traveling inflight stand compatible with iphone compatible with android and compatible with kindle mobile devices xl,4.3,822.0,,22.99,Flight Flap,package dimensions 119 x 7 x 04 inches item weight 32 ounces item model number 8541528899 best sellers rank unique finds 2443 unique electronics 113 tablet stands 143 is discontinued by manufacturer no date first available april 25 2018 manufacturer flight flap brand flight flap color xl form factor bag compatible devices laptops compatible phone models iphone
1,5.0,love it it fits my chromebook perfectly,love it it fits my chromebook perfectly,B00J39BYSG,B07BJ8KD6X,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,0,True,2015-06-30,21:27:51,Computers,meffort inc 116 inch neoprene laptopultrabookchromebook bag carrying sleeve with hidden handle and adjustable shoulder strap klimt tree of life,4.5,513.0,meffort inc is a world registered trademark our product is unique trendy and functional they are made from durable neoprene that is naturally tear and damage resistant the interior is also neoprene so it naturally offers a level of nonscratch padding the reinforced dual zippers will keep your device secure and safe slim lightweight and portable so itll fit into your bag backpack messenger bag or use it as a carrying bag by itself,17.95,Meffort Inc,product dimensions 122 x 059 x 925 inches item weight 10 ounces best sellers rank laptop messenger shoulder bags 7041 date first available march 18 2014 department unisexadult manufacturer meffort inc
2,1.0,customer service is not okay,set up is very easy the video is pretty good i paid for a year of cam plus and i am still only getting 12 second videos the 12 second videos tend to be the person walking away not helpful i did a chat with wyze support their suggestion did not help i called and talked with support their suggestions did not help he told me he would send a request to tier 2 and they will contact me by email they never did its been a full week customer service is a huge deal i dont want this camera if i cant get more than 12 seconds of video,B08FFJWQY1,B09V28P31X,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,0,True,2021-10-08,18:07:53,Camera & Photo,wyze cam outdoor starter bundle base station and 1 cam 1080p hd indooroutdoor wirefree smart home camera with night vision 2way audio works with alexa google assistant white with camo skin,4.3,36899.0,our customers have inspired us over the last two years with videos of meteors racing across the sky sunsets over mountain ranges we thought only existed in movies and visits from mother nature in the form of wildlife and spectacular lightning storms theyve stretched the limits of our indoor camera with use cases we never anticipated and conditions we thought impossible they made makeshift outdoor housings out of milk jugs 3d printings and bird houses theyre tenacity to go beyond our prescribed instruction has made it clear we need to meet them with a camera that can keep up with their curiosity capturing the world cant happen when youre limited to your living room so we created a camera that goes beyond that we spent two years on phone calls that carried into the next morning on international flights where sleep became a luxury and on user interviews listening to what you wanted to see in an outdoor camera all leading up to a camera worthy of looking after your home and those within it we cut wires removed boundaries and lab tested under every extreme condition imaginable weve built you a battlehardened camera capable of going places your indoor camera wouldnt even dream aboutthriving in the rains of seattle all the way to the heatwaves of miami and everything in between performancepacked hardware and software found in cameras 3x the price werent added to win an award they were added to earn your respect sensor 127,66.5,WYZE,brand wyze connectivity technology wireless video capture resolution 1080p special feature night vision motion sensor number of channels 4 color white power source battery powered item dimensions lxwxh 64 x 32 x 29 inches recommended uses for product home security compatible devices cameras signal format analog other camera features front low light technology night color product dimensions 64 x 32 x 29 inches item weight 12 pounds item model number wvod1b1 batteries 1 lithium polymer batteries required included best sellers rank home security systems 41 bullet surveillance cameras 47 date first available august 8 2020 manufacturer wyze labs inc country of origin china
3,5.0,right size for kids kindle,after i received the butterfly bag i was not sure if my great nieces kindle with the big rubber protector would fit but it fits perfectly in this bag,B00S7IBV82,B07HHY1MWS,AHLC5H2CEWSAOXOA53VIIROLPZKA,2,True,2017-10-12,11:23:16,Computers,aupet cute wish cat universal 6 7 inch tablet portable neoprene zipper carrying sleeve case bag,4.6,2208.0,,9.88,AUPET,standing screen display size 8 inches brand aupet item model number sleeves item weight 319 ounces package dimensions 921 x 677 x 079 inches color cute wish cat manufacturer professionalbags is discontinued by manufacturer no date first available december 8 2016 best sellers rank tablet sleeves 1964 compatible devices tablets form factor case shell type soft
4,5.0,best quality great price,well made bag which will only get better the older it gets love it my grandson was thrilled with this bag when i gave it to him people who saw it believed it cost way more than it did great purchase,B078BX25RR,B09DTP4L96,AH6PLOGWYIVIWLJTY756BHNFD4YA,0,True,2020-09-05,01:03:40,Computers,cuero dhk 14 inch vintage handmade leather messenger bag for laptop briefcase best computer satchel distressed bag,4.4,14337.0,,59.99,cuero,standing screen display size 14 inches brand cuero item model number 8541573052 item weight 154 pounds product dimensions 14 x 4 x 11 inches item dimensions lxwxh 14 x 4 x 11 inches color brown department unisexadult manufacturer cuero country of origin india is discontinued by manufacturer no date first available march 28 2018 best sellers rank laptop messenger shoulder bags 121


# Health and Household

In [15]:
# ------------------- Load and Sample Data -------------------

Health_and_Household_review_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Health_and_Household", trust_remote_code=True, split="full", streaming=True)
Health_and_Household_meta_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Health_and_Household", trust_remote_code=True, split="full", streaming=True)

# Get set amount of rows
hah_rows_to_get = 20000
hah_review_sample = list(islice(Health_and_Household_review_ds, hah_rows_to_get))
Health_and_Household_meta_ds = Health_and_Household_meta_ds.map(flatten_image_struct)
hah_meta_sample = list(islice(Health_and_Household_meta_ds, hah_rows_to_get))
pd.set_option('display.max_rows', hah_rows_to_get)
pd.set_option('display.max_colwidth', None)

# # Get all rows
# hah_review_sample = list(Health_and_Household_review_ds)
# Health_and_Household_meta_ds = Health_and_Household_meta_ds.map(flatten_image_struct)
# hah_meta_sample = list(Health_and_Household_meta_ds)

hah_reviews_df = pd.DataFrame(hah_review_sample)
hah_meta_df = pd.DataFrame(hah_meta_sample)

# ------------------- Clean Meta Dataset -------------------

clean_hah_meta_df = hah_meta_df.copy()

meta_cols_to_keep = [
    'parent_asin', 'main_category', 'product_title', 'average_rating', 'rating_number',
    'description', 'price', 'store', 'details'
]
clean_hah_meta_df = clean_hah_meta_df.rename(columns={'title': 'product_title'})
clean_hah_meta_df = clean_hah_meta_df.loc[:, meta_cols_to_keep]

# Drop incomplete entries
clean_hah_meta_df = clean_hah_meta_df.dropna(subset=['product_title', 'main_category', 'price']).copy()

# Clean 'description'
clean_hah_meta_df.loc[:, 'description'] = clean_hah_meta_df['description'].apply(
    lambda desc: clean_text(' '.join(desc)) if isinstance(desc, list)
    else clean_text(desc) if isinstance(desc, str)
    else 'No description available'
)

# Clean other fields
clean_hah_meta_df.loc[:, 'details'] = clean_hah_meta_df['details'].fillna('').astype(str).apply(clean_text)
clean_hah_meta_df.loc[:, 'average_rating'] = clean_hah_meta_df['average_rating'].fillna(0).astype(float)
clean_hah_meta_df.loc[:, 'rating_number'] = clean_hah_meta_df['rating_number'].fillna(0).astype(int)
clean_hah_meta_df.loc[:, 'price'] = clean_hah_meta_df['price'].apply(normalize_price)
clean_hah_meta_df = clean_hah_meta_df.dropna(subset=['price']).copy()
clean_hah_meta_df.loc[:, 'store'] = clean_hah_meta_df['store'].fillna('Unknown')
clean_hah_meta_df.loc[:, 'product_title'] = clean_hah_meta_df['product_title'].apply(clean_text)
clean_hah_meta_df.loc[:, 'parent_asin'] = clean_hah_meta_df['parent_asin'].astype(str)

clean_hah_meta_df.reset_index(drop=True, inplace=True)

# ------------------- Clean Reviews Dataset -------------------

clean_hah_reviews_df = hah_reviews_df.copy()

if 'images' in clean_hah_reviews_df.columns:
    clean_hah_reviews_df = clean_hah_reviews_df.drop(columns=['images'])

clean_hah_reviews_df = clean_hah_reviews_df.dropna(subset=['rating', 'text', 'asin', 'parent_asin']).copy()
clean_hah_reviews_df = clean_hah_reviews_df[clean_hah_reviews_df['verified_purchase'] == True].copy()

clean_hah_reviews_df.loc[:, 'title'] = clean_hah_reviews_df['title'].fillna('').apply(clean_text)
clean_hah_reviews_df.loc[:, 'text'] = clean_hah_reviews_df['text'].apply(clean_text)
clean_hah_reviews_df.loc[:, 'helpful_vote'] = clean_hah_reviews_df['helpful_vote'].fillna(0).astype(int)

# Convert timestamp from milliseconds to seconds (remove milliseconds)
clean_hah_reviews_df.loc[:, 'timestamp'] = (clean_hah_reviews_df['timestamp'].astype('int64') // 1000)
converted_timestamps = pd.to_datetime(clean_hah_reviews_df['timestamp'], unit='s', errors='coerce')
clean_hah_reviews_df = clean_hah_reviews_df.drop(columns=['timestamp'])
clean_hah_reviews_df['timestamp'] = converted_timestamps

# Split timestamp into date and time
clean_hah_reviews_df.loc[:, 'date'] = clean_hah_reviews_df['timestamp'].dt.date
clean_hah_reviews_df.loc[:, 'time'] = clean_hah_reviews_df['timestamp'].dt.strftime('%H:%M:%S')

# Drop original timestamp column
clean_hah_reviews_df.drop(columns=['timestamp'], inplace=True)

clean_hah_reviews_df.loc[:, 'parent_asin'] = clean_hah_reviews_df['parent_asin'].astype(str)
clean_hah_reviews_df.rename(columns={'title': 'review_title'}, inplace=True)
clean_hah_reviews_df.reset_index(drop=True, inplace=True)

# ------------------- Merge Datasets -------------------

clean_hah_merged_df = pd.merge(clean_hah_reviews_df, clean_hah_meta_df, on='parent_asin', how='left')
clean_hah_merged_df = clean_hah_merged_df.dropna().reset_index(drop=True)

Save to CSV

In [16]:
# Create the CSV file
clean_hah_merged_df.to_csv(f"{csv_folder_path}/Health_and_Household.csv", index=False)

Load from CSV

In [17]:
# Load Gift Cards CSV file
health_and_household_df = pd.read_csv(f"{csv_folder_path}/Health_and_Household.csv")

# Show file
health_and_household_df.head()

Unnamed: 0,rating,review_title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,date,time,main_category,product_title,average_rating,rating_number,description,price,store,details
0,5.0,works great for people sensitive to perfumed,my kids i are very sensitive to scents perfumes this detergent works well to clean out clothes has no scents been using it for years,B07FVX9Z9H,B088CHH37R,AGXVBIUFLFGMVLATYXHJYL4A5Q7Q,1,True,2019-12-30,14:20:28,Health & Personal Care,seventh generation concentrated laundry detergent liquid free clear fragrance free 40 oz,4.6,9493.0,clothes should feel cuddly and clean against your skin and be good to it too seventh generation free clear concentrated laundry detergent features a tripleenzyme formula that fights tough stains without the use of fragrances dyes and artificial brighteners plantderived enzyme stain removers like protease and amylase power away stubborn stains grass tomato sauce or chocolate you name it weve got you covered this hypoallergenic and fragrancefree laundry detergent liquid is dermatologist tested to be gentle on skin our ultraconcentrated laundry soap uses less added water allowing us to use a smaller bottle to deliver impressive cleaning power 53 loads in a 40 oz container good things really do come in small packages for more than 25 years seventh generation has been thoughtfully formulating safe and effective plantbased products that work really well as a leading green laundry detergent seventh generation manufacturer we are proud to make biodegradable and epa safer choice certified detergents you may also notice a cute leaping bunny seal on our products that means we never test our products on animals and our products do not contain animalbased ingredients we design our products to be safe for people their homes and the environment based in vermont seventh generation is proud to be a certified b corporation b corps are certified to be better for workers better for communities and better for the environment by choosing seventh generation products youre joining us in nurturing the health of the next seven generations,14.99,Seventh Generation,brand seventh generation item form liquid scent free clear unit count 400 ounce material type free fragrance freefragrancefree product dimensions 375 x 575 x 11 inches 325 pounds date first available august 22 2018 manufacturer unilever country of origin usa domestic shipping currently item can be shipped only within the us and to apofpo addresses for apofpo shipments please check with the manufacturer regarding warranty and support issues international shipping this item is not eligible for international shipping learn more
1,4.0,good but overly sweet,4 stars on flavor only because i feel its too sweet i have to add more water to dilute the sweetnessbr 4 stars on the ingredients because i would like to see more protein from other plant sources besides peabr 5 stars on blendability many plant based powders have a chalky effect and this one is one of the better blendsbr all in all i will finish the tub that i purchased but really wish this came with a lot less sweetness or even sugar free,B0767Z2Y25,B0C533RGW5,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,2019-09-06,15:24:50,Health & Personal Care,kos vegan protein powder chocolate usda organic low carb pea protein blend plant based superfood with vitamins minerals keto soy gluten free meal replacement for women men 10 servings,4.1,7795.0,,19.99,KOS,brand kos flavor chocolate item weight 1375 ounces item form powder diet type plant based product dimensions 475 x 475 x 65 inches 1375 ounces item model number vppch10 date first available december 23 2020 manufacturer kos country of origin usa
2,5.0,five stars,great energy,B000SHOBQE,B00TGDU994,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,2018-05-26,21:52:30,Health & Personal Care,dr schulzes superfood plus vitamin and mineral herbal concentrate daily nutrition glutenfree and nongmo vegan 14 ounce powder packaging may vary,4.6,1701.0,dr schulzes superfood plus blasts your body with a rich supply of nutrients that are formulated to increase energy and overall vitality superfood plus is also protein rich for your daily workout needs with over 40 protein by volumeherbal ingredientswild harvested spirulina algae protein antioxidants bvitamins and ironorganic bluegreen algae dietary protein bvitamins and ironchlorella brokencell algae used to improve digestion and to colonrelated diseasesorganic barley grass vitamins minerals fiber protein enzymes and amino acidsorganic alfalfa grass vitamins minerals and protein and chlorophyll used by body buildersorganic wheat grass thought to be an effective healer especially for muscular and cellular repair provides minerals and vitamins protein and chlorophyll organic purple dulse seaweed contains lots of minerals thought to lower blood pressure and contribute to bone mineral densityorganic acerola cherry high in vitamins especially vitamin corganic rose hips also high in vitamin cpalm fruit for antioxidants vitamins and mineralsorganic lemon peel vitamins minerals and fiberorganic orange peel vitamins minerals and fiber thought to benefit the immune systemorganic beet root vitamins minerals and fiberorganic spinach leaf rich in antioxidants and vitamins a b2 c and k also contains magnesium manganese folate iron calcium and potassiumnonfermentable saccharomyces protein and b vitamins contains all nine essential amino acidscerevisiae good source of chromium for blood sugar levels and b vitamins considered a probiotic which may aid digestion,54.0,Dr. Schulze's,is discontinued by manufacturer no product dimensions 68 x 63 x 92 inches 14 ounces item model number dsmrf department health and personal care date first available june 23 2007 manufacturer dr schulzes
3,1.0,no safety seal,the pills did not have a safety seal,B004U3Y8NI,B004U3Y8NI,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,8,True,2021-07-08,14:45:44,Health & Personal Care,nature made vitamin d3 2000 iu 50 mcg dietary supplement for bone teeth muscle and immune health support 220 tablets 220 day supply,4.7,19904.0,nature made vitamin d3 2000 iu tablets 50 mcg offer an easy way to add vitamin d supplements to your daily routine to support bone teeth and muscle health sourced from high quality ingredients these nature made vitamin d tablets are gluten free dietary supplements with no color added and no artificial flavors each vitamin d3 2000iu tablet supports a healthy immune response and aids in calcium absorption vitamin d is a common nutrient shortfall d3 vitamin is the bodys preferred form of vitamin d to help maintain vit d levels these nature made vitamin d3 supplements contain 50mcg 2000 iu of vit d3 per serving adults take one vitamin d3 nature made tablet daily with water and a meal this nature made vitamin d supplement is quality you can trust usp has tested and verified ingredients potency and manufacturing process usp sets official standards for dietary supplements visit the usp verified website for more information based on a survey of pharmacists who recommend branded vitamins and supplements,15.36,Nature Made,brand nature made item form tablet primary supplement type vitamin d3vitamin d diet type gluten free flavor no artificial flavors is discontinued by manufacturer no product dimensions 244 x 244 x 438 inches 288 ounces item model number 2674 date first available march 25 2011 manufacturer nature made country of origin usa domestic shipping currently item can be shipped only within the us and to apofpo addresses for apofpo shipments please check with the manufacturer regarding warranty and support issues international shipping this item can be shipped to select countries outside of the us learn more
4,5.0,great product,best probiotic i have taken tend to have a sensitive stomach no issues since starting to take this i have tried many other probiotics,B00I3TFPCI,B019QLZOZY,AHITBJSS7KYUBVZPX7M2WJCOIVKQ,0,True,2015-05-23,00:55:10,Health & Personal Care,probiotics 30 billion cfu guaranteed 5 strains target digestive and immune health for women and men individually foil wrapped shelf stable gluten and dairy free nongmo 30 vegan capsules,4.4,1489.0,,21.97,Naturenetics,brand naturenetics item form capsule product benefits digestive health support age range description adult package information box unit count 30 count number of items 30 dosage form capsule material feature gmo free cruelty free vegan is discontinued by manufacturer no package dimensions 53 x 32 x 22 inches 16 ounces date first available november 1 2015 manufacturer naturenetics


# Software

In [18]:
# ------------------- Load and Sample Data -------------------

Software_review_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Software", trust_remote_code=True, split="full", streaming=True)
Software_meta_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Software", trust_remote_code=True, split="full", streaming=True)

# Get set amount of rows
s_rows_to_get = 20000
s_review_sample = list(islice(Software_review_ds, s_rows_to_get))
Software_meta_ds = Software_meta_ds.map(flatten_image_struct)
s_meta_sample = list(islice(Software_meta_ds, s_rows_to_get))
pd.set_option('display.max_rows', s_rows_to_get)
pd.set_option('display.max_colwidth', None)

# # Get all rows
# s_review_sample = list(Software_review_ds)
# Software_meta_ds = Software_meta_ds.map(flatten_image_struct)
# s_meta_sample = list(Software_meta_ds)

s_reviews_df = pd.DataFrame(s_review_sample)
s_meta_df = pd.DataFrame(s_meta_sample)

# ------------------- Clean Meta Dataset -------------------

clean_s_meta_df = s_meta_df.copy()

meta_cols_to_keep = [
    'parent_asin', 'main_category', 'product_title', 'average_rating', 'rating_number',
    'description', 'price', 'store', 'details'
]
clean_s_meta_df = clean_s_meta_df.rename(columns={'title': 'product_title'})
clean_s_meta_df = clean_s_meta_df.loc[:, meta_cols_to_keep]

# Drop incomplete entries
clean_s_meta_df = clean_s_meta_df.dropna(subset=['product_title', 'main_category', 'price']).copy()

# Clean 'description'
clean_s_meta_df.loc[:, 'description'] = clean_s_meta_df['description'].apply(
    lambda desc: clean_text(' '.join(desc)) if isinstance(desc, list)
    else clean_text(desc) if isinstance(desc, str)
    else 'No description available'
)

# Clean other fields
clean_s_meta_df.loc[:, 'details'] = clean_s_meta_df['details'].fillna('').astype(str).apply(clean_text)
clean_s_meta_df.loc[:, 'average_rating'] = clean_s_meta_df['average_rating'].fillna(0).astype(float)
clean_s_meta_df.loc[:, 'rating_number'] = clean_s_meta_df['rating_number'].fillna(0).astype(int)
clean_s_meta_df.loc[:, 'price'] = clean_s_meta_df['price'].apply(normalize_price)
clean_s_meta_df = clean_s_meta_df.dropna(subset=['price']).copy()
clean_s_meta_df.loc[:, 'store'] = clean_s_meta_df['store'].fillna('Unknown')
clean_s_meta_df.loc[:, 'product_title'] = clean_s_meta_df['product_title'].apply(clean_text)
clean_s_meta_df.loc[:, 'parent_asin'] = clean_s_meta_df['parent_asin'].astype(str)

clean_s_meta_df.reset_index(drop=True, inplace=True)

# ------------------- Clean Reviews Dataset -------------------

clean_s_reviews_df = s_reviews_df.copy()

if 'images' in clean_s_reviews_df.columns:
    clean_s_reviews_df = clean_s_reviews_df.drop(columns=['images'])

clean_s_reviews_df = clean_s_reviews_df.dropna(subset=['rating', 'text', 'asin', 'parent_asin']).copy()
clean_s_reviews_df = clean_s_reviews_df[clean_s_reviews_df['verified_purchase'] == True].copy()

clean_s_reviews_df.loc[:, 'title'] = clean_s_reviews_df['title'].fillna('').apply(clean_text)
clean_s_reviews_df.loc[:, 'text'] = clean_s_reviews_df['text'].apply(clean_text)
clean_s_reviews_df.loc[:, 'helpful_vote'] = clean_s_reviews_df['helpful_vote'].fillna(0).astype(int)

# Convert timestamp from milliseconds to seconds (remove milliseconds)
clean_s_reviews_df.loc[:, 'timestamp'] = (clean_s_reviews_df['timestamp'].astype('int64') // 1000)
converted_timestamps = pd.to_datetime(clean_s_reviews_df['timestamp'], unit='s', errors='coerce')
clean_s_reviews_df = clean_s_reviews_df.drop(columns=['timestamp'])
clean_s_reviews_df['timestamp'] = converted_timestamps

# Split timestamp into date and time
clean_s_reviews_df.loc[:, 'date'] = clean_s_reviews_df['timestamp'].dt.date
clean_s_reviews_df.loc[:, 'time'] = clean_s_reviews_df['timestamp'].dt.strftime('%H:%M:%S')

# Drop original timestamp column
clean_s_reviews_df.drop(columns=['timestamp'], inplace=True)

clean_s_reviews_df.loc[:, 'parent_asin'] = clean_s_reviews_df['parent_asin'].astype(str)
clean_s_reviews_df.rename(columns={'title': 'review_title'}, inplace=True)
clean_s_reviews_df.reset_index(drop=True, inplace=True)

# ------------------- Merge Datasets -------------------

clean_s_merged_df = pd.merge(clean_s_reviews_df, clean_s_meta_df, on='parent_asin', how='left')
clean_s_merged_df = clean_s_merged_df.dropna().reset_index(drop=True)

Save to CSV

In [19]:
# Create the CSV file
clean_s_merged_df.to_csv(f"{csv_folder_path}/Software.csv", index=False)

Load from CSV

In [20]:
# Load Gift Cards CSV file
software_df = pd.read_csv(f"{csv_folder_path}/Software.csv")

# Show file
software_df.head()

Unnamed: 0,rating,review_title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,date,time,main_category,product_title,average_rating,rating_number,description,price,store,details
0,4.0,fun game,one of my favorite games,B00KCYMAWK,B00KCYMAWK,AH6CATODIVPVUOJEWHRSRCSKAOHA,0,True,2019-06-20,20:10:28,Appstore for Android,gold fish casino slots free online slot machines,3.7,14717.0,experience the best wms bally slot games with gold fish free casino slots join your favorite goldfish goldie to get spinning winning on over 200 classic 777 vegas slot machines with more slots games added with every update plus with lots of free coins for new players and huge free daily casino bonusrelax win repeat with an ocean of the coolest free slots online download and play nowsplash into this underwater casino and discover amazing casino games with free las vegas slot machines from the best slots brands then move up the levels to unlock new slotstake a break from your daily routine and swim into the reels of classic 777 vegas casino games for free with over 200 free vegas slots adventures real 777 slot machine games free casino games and jackpot slots excitement is only a spin away unleash the lightning in the free to play slot machine kronos unleashed head into the wild with jungle wild ii with money burst free slots game hit hearts hearts hearts with casino floor favorite lock it link diamonds sail under the black for golden treasures in the high seas slots game cash falls pirates trove the great zeus slot will bestow big fortunes in this classic las vegas strip slot you can bet theres party mania at the bier haus free casino game double down on cash wizard slots and put some magic in your spins and coin in your bag wow lucky you grab your chips and win a fortune in diamond eternity have a yabbadabbadoo time and spin the flintstones slot a fan favorite take slots fun to another planet with invaders from the planet moolahwin massive daily free gifts coins and enjoy extra benefitsthere are so many ways to win with gold fish free slots for android you just have to experience them all for yourselfdownload today and enjoy the best casino slots payouts with these stunning features millions of free coins on first install over 200 incredible mobile slots to choose from daily bonus 500000 free coins every day you play plus multipliers receive free coins every 2 hours collect 1000000 mega bonus coins multilevel progressive jackpots win up to 100 billion coins send and receive gifts from your friends get millions in gift bonuses daily play hot 2019 android casino games with wild respins free connect to facebook for fun 777 casino bonusmake new friends get social with gold fish casino slotsjoin our community of half a million players who live eat and breathe gold fish casino on our community page wwwfacebookcomgoldfishcasinoslots and on our instagram page wwwinstagramcomgoldfishcasinoslotsgold fish casino slots is a playforfun casino that is intended for amusement onlyall ingame sales are finalthe games are intended for an adult audience ie intended for use by those 21 or olderthe games do not offer real money gambling or an opportunity to win real money or prizespractice or success at social casino gaming does not imply future success at real money gambling,0.0,SG Interactive,release date 2014 date first listed on amazon may 22 2014 developed by sg interactive size 933mb version 3900 application permissions access information about networks access information about wifi networks foregroundservice get information about the currently or recently running tasks a thumbnail representation of the tasks what activities are running in it etc open network sockets postnotifications readappbadge get notified that the operating system has finished booting powermanager wakelocks to keep processor from sleeping or screen from dimming allows sending inapp billing requests and managing inapp billing transactions allows an application to receive messages via google cloud messaging receive message via amazon device comgoogleandroidgmspermissionadid minimum operating system android 44 approximate download time more than 5 minutes
1,5.0,my favorite showvoice my favorite show,see the voice anytime my my favorite show,B018IOV40E,B018IOV40E,AEJDETWITK2KGACH7FUBMY33PPSQ,0,True,2018-03-07,20:56:00,Appstore for Android,nbc,3.9,244054.0,watch brandnew episodes of current nbc tv shows and classic hits and live stream national and local news anytime anywhere catch your favorite nbc shows including this is us law order svu and saturday night live the day after they air on tv the nbc app keeps you up to date with the hottest shows and latest newswatch series from across the nbcuniversal family of networks including bravo e oxygen syfy usa and more stream hit shows from beginning to end all in one placecatch up on episodes favorite and save new and classic tv shows and continue watching the latest series across all your devices when you log in with your nbcuniversal profilestream tv episodes movies and clips watch the latest episodes of brandnew nbc series the day after they air and classic tv shows you know and love for free log in with a tv provider to unlock even more stream movies and tv shows including hundreds of new episodes by linking to your tv providernbc live stream watch tv shows via the nbc live stream now available in most markets nationwide by linking to your tv service provider stream live news local or nationalstream episodes of classic tv shows stream timeless episodes of throwback tv shows for free rewatch your favorite throwback tv series like heroes saved by the bell will grace and morenbcuniversal profile create an nbcuniversal profile to save and continue watching tv shows across all your devices receive credits to unlock three episodes for freefor any questions or concerns about the app please see our help pages or contact nbc support at amazonfirenbcdigitalzendeskcomend user license agreement eula by using or downloading nbcuniversal media llcs andor its affiliated companies applications content games interactive tv other products andor online services nbcuniversal services you agree to be bound by the nbcuniversal privacy policy at httpswwwnbcuniversalcomprivacyintakenbcentertainment and terms of use tou at wwwnbccomgeneralpagesterms as updated from time to time including but not limited to your agreement to arbitrate any disputes with us and to waive your rights to jury trial and class actions as more fully set forth at wwwnbccomgeneralpagesterms do not download or use the services if you do not agree to the tou or privacy policyprivacy policy httpswwwnbcuniversalcomprivacyintakenbcentertainmentyour privacy choices httpswwwnbcuniversalcomprivacynotrtoointakenbcentertainmentca notice httpswwwnbcuniversalcomprivacycaliforniaconsumerprivacyactintakenbcentertainment,0.0,"NBCUniversal Media, LLC",release date 2015 date first listed on amazon december 14 2015 developed by nbcuniversal media llc size 398mb version 920 application permissions access coarse eg cellid wifi location access location information from the device access information about networks access information about wifi networks foregroundservice open network sockets postnotifications read from external storage get notified that the operating system has finished booting powermanager wakelocks to keep processor from sleeping or screen from dimming write to external storage allows an application to receive messages via google cloud messaging receive message via amazon device comgoogleandroidgmspermissionadid minimum operating system android 50 approximate download time less than 5 minutes
2,5.0,abs,yes planning my 30 day tomorrow,B00W7C12F6,B00W7C12F6,AEOKAIB6OJVB5FRZGLEB3YAK5UBQ,5,True,2018-08-08,03:04:54,Appstore for Android,30 day abs workout challenge,3.6,376.0,start the 30 day abs workout challenge to burn fat and get abs the app contains ten common abs exercises everyone can do doing the workout will make you feel stronger healthier improve energy and control weight perform 10 exercises everyday to get 6 pack abs exercise benefits every part of the body you dont need any equipment for the workout complete daily ab workout with log there are three difficulty levels you can choose fromfeatures free and simple 10 abs exercises workout log workout difficulty levels set reminder animations 30 days challengethe daily workout routine contains basic crunches alternate heel touches legup crunches arm reaching crunches flutter kicks leg raises v crunches bicycle crunches vertical leg crunches and plank with leg liftif you dont train this is the perfect app for you get abdominal muscles with this fitness app do it everyday to get fast results,0.0,PlaySimple,release date 2015 date first listed on amazon april 16 2015 developed by playsimple size 38mb version 10 application permissions access information about networks access information about wifi networks open network sockets read from external storage write to external storage minimum operating system android 233 approximate download time less than 30 seconds
3,5.0,five stars,my favorite music app,B00KLBR6IC,B00KLBR6IC,AEOKAIB6OJVB5FRZGLEB3YAK5UBQ,0,True,2018-05-12,10:15:47,Appstore for Android,spotify music,4.5,139620.0,spotify is now free on mobile and tablet listen to the right music wherever you are with spotify you have access to a world of music you can listen to artists and albums or create your own playlist of your favorite songs want to discover new music choose a readymade playlist that suits your mood or get personalized recommendationslisten for free on mobile play any artist album or playlist on shuffle modelisten for free on tablet play any song any timespotify premium features play any song any time on any devicemobile tablet or your computer download music for offline listening enjoy amazing sound quality no ads just uninterrupted music no commitment cancel any time you likelove spotify like us on facebook httpwwwfacebookcomspotify follow us on twitter httptwittercomspotify,0.0,Spotify AB,release date 2014 date first listed on amazon may 28 2014 developed by spotify ab size 517mb version 8828409 application permissions access information about networks access information about wifi networks connect to paired bluetooth devices discover and pair bluetooth devices bluetoothadvertise bluetoothconnect bluetoothscan broadcast sticky intents required to be able to access the camera device enter wifi multicast mode foregroundservice foregroundserviceconnecteddevice foregroundservicemediaplayback access the list of accounts in the accounts service open network sockets modify global audio settings directly communicate over nfc postnotifications read from external storage readmediaaudio readmediaimages get notified that the operating system has finished booting record audio requestcompanionruninbackground request authtokens from the accountmanager access the vibration feature powermanager wakelocks to keep processor from sleeping or screen from dimming write to external storage allows installation of home screen shortcuts allows sending inapp billing requests and managing inapp billing transactions comgoogleandroidappsmeetingspermissionmeetlivesharing allows an application to receive messages via google cloud messaging receive message via amazon device comgoogleandroidgmspermissionadid minimum operating system android 50 approximate download time more than 5 minutes
4,3.0,discovery plus,the app works but in general i just dont think there is enough content to justify the additional streaming costs i havent found a single show that would make me think i needed to pay more plus i dont really appreciate that they make you pay to see some shows new andor reruns eh i already pay way enough for my satellite subscription so once my trial use is over ill cancel,B08F8XQLQZ,B08F8XQLQZ,AFZUK3MTBIBEDQOPAK3OATUOUKLA,0,True,2021-05-06,05:01:43,Appstore for Android,discovery stream tv shows originals and more,4.4,251677.0,discovery is the streaming home of food home relationships true crime paranormal and so much more watch the best reallife shows from your favorite tv brands including hgtv food network tlc id magnolia network animal planet discovery channel and many more plus catch exciting cantmiss originals and exclusives you wont see anywhere else theres something for everyone subscribe nowwith discovery you get your favorite shows and personalities from the best tv brands including hgtv food network tlc id magnolia network animal planet discovery channel and more mustsee 90 day fianc moments with 200 hours of exclusive new series and more exclusive discovery originals cantmiss exciting new series you wont see anywhere else 70000 episodes and 2500 shows more added all the time an adfree plan option so you can enjoy your favorites without commercial interruption awardwinning collection of the best natural history series from discovery and bbc including planet earth blue planet and frozen planet fresh new episodes and series added all the time from lifestyle and true crime to home improvement food adventure and more your favorite personalities like the irwins jonathan and drew scott chip and joanna gaines bobby flay ree drummond guy fieri giada de laurentiis lieutenant joe kenda and so many more full access to watch anytime anywhere on your mobile device tablet computer game console and connected tvsubscribe nowpayment automatically renews unless you cancel your account before the end of the current subscription period you can manage your subscription or cancel anytime by accessing your account discovery is available to customers in the us and us territories onlyvisitor agreement httpsdiscoverypluscomtermsprivacy notice httpscorporatediscoverycomprivacypolicycalifornia privacy notice httpscorporatediscoverycomprivacypolicycappicalifornia do not sell my personal info httpscorporatediscoverycomcaliforniadnscustomer help httpshelpdiscoverypluscomwe may work with thirdparty advertising companies and other partners that help us deliver tailored advertisements to you to optout of behavioral tracking on mobile devices you may use the daa appchoices tool available at httpwwwaboutadsinfoappchoicesthis app features nielsens proprietary measurement software which will allow you to contribute to market research such a nielsen tv ratings to opt out of nielsen measurement on mobile devices visit nielsens mobile optout page at httpssitesnielsencomlegalprivacystatementnielsenappoutput,0.0,Discovery Communications,release date 2020 date first listed on amazon september 14 2020 developed by discovery communications size 284mb version 1764 application permissions access information about networks access information about wifi networks foregroundservice open network sockets read from external storage read only access to device state get notified that the operating system has finished booting record audio powermanager wakelocks to keep processor from sleeping or screen from dimming write to external storage comandroidproviderstvpermissionreadepgdata comandroidproviderstvpermissionwriteepgdata allows sending inapp billing requests and managing inapp billing transactions comgoogleandroidgmspermissionadid minimum operating system android 51 approximate download time less than 4 minutes
