# Dataset Collection
The original dataset is available at https://nijianmo.github.io/amazon/index.html

We use every product category listed on that page and download the small `_5.json.gz` review file for each one.


In [None]:
# Display names of the product categories to download. generate_file_names()
# turns each entry into an archive file name by replacing spaces with
# underscores and removing commas.
categories = [
    "AMAZON FASHION",
    "All Beauty",
    "Appliances",
    "Arts, Crafts and Sewing",
    "Automotive",
    "Books",
    "CDs and Vinyl",
    "Cell Phones and Accessories",
    "Clothing, Shoes and Jewelry",
    "Digital Music",
    "Electronics",
    "Gift Cards",
    "Grocery and Gourmet Food",
    "Home and Kitchen",
    "Industrial and Scientific",
    "Kindle Store",
    "Luxury Beauty",
    "Magazine Subscriptions",
    "Movies and TV",
    "Musical Instruments",
    "Office Products",
    "Patio, Lawn and Garden",
    "Pet Supplies",
    "Prime Pantry",
    "Software",
    "Sports and Outdoors",
    "Tools and Home Improvement",
    "Toys and Games",
    "Video Games"
]

In [None]:
def generate_file_names(categories):
  """Return the archive file name for each category display name.

  Every space in a name is replaced by an underscore and every comma is
  removed before the ``_5.json.gz`` suffix is appended, e.g.
  ``"Arts, Crafts and Sewing"`` -> ``"Arts_Crafts_and_Sewing_5.json.gz"``.

  Args:
    categories: iterable of category display names (strings).

  Returns:
    A list of file names, in the same order as ``categories``.
  """
  cleaned = (category.replace(' ', '_').replace(',', '') for category in categories)
  return [f"{category_name}_5.json.gz" for category_name in cleaned]

In [None]:
def generate_urls(file_names):
    """Return the download URL for each archive file name.

    Each name is appended unchanged to the fixed ``categoryFilesSmall``
    base URL.

    Args:
        file_names: iterable of archive file names (strings).

    Returns:
        A list of URLs, in the same order as ``file_names``.
    """
    base_url = "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/"
    return [f"{base_url}{file_name}" for file_name in file_names]

In [None]:
# Derive the archive file name for every category and display the list.
file_names = generate_file_names(categories)
file_names

['AMAZON_FASHION_5.json.gz',
 'All_Beauty_5.json.gz',
 'Appliances_5.json.gz',
 'Arts_Crafts_and_Sewing_5.json.gz',
 'Automotive_5.json.gz',
 'Books_5.json.gz',
 'CDs_and_Vinyl_5.json.gz',
 'Cell_Phones_and_Accessories_5.json.gz',
 'Clothing_Shoes_and_Jewelry_5.json.gz',
 'Digital_Music_5.json.gz',
 'Electronics_5.json.gz',
 'Gift_Cards_5.json.gz',
 'Grocery_and_Gourmet_Food_5.json.gz',
 'Home_and_Kitchen_5.json.gz',
 'Industrial_and_Scientific_5.json.gz',
 'Kindle_Store_5.json.gz',
 'Luxury_Beauty_5.json.gz',
 'Magazine_Subscriptions_5.json.gz',
 'Movies_and_TV_5.json.gz',
 'Musical_Instruments_5.json.gz',
 'Office_Products_5.json.gz',
 'Patio_Lawn_and_Garden_5.json.gz',
 'Pet_Supplies_5.json.gz',
 'Prime_Pantry_5.json.gz',
 'Software_5.json.gz',
 'Sports_and_Outdoors_5.json.gz',
 'Tools_and_Home_Improvement_5.json.gz',
 'Toys_and_Games_5.json.gz',
 'Video_Games_5.json.gz']

In [None]:
# Build the download URL for every archive file name and display the list.
download_urls = generate_urls(file_names)
download_urls

['https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/AMAZON_FASHION_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/All_Beauty_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Appliances_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Arts_Crafts_and_Sewing_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Automotive_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/CDs_and_Vinyl_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Cell_Phones_and_Accessories_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Clothing_Shoes_and_Jewelry_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Digital_Music_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Electronics_5.json.gz',
 'https://jmca

In [None]:
for url in download_urls:
  !wget --no-check-certificate $url

--2023-12-19 21:09:00--  https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/AMAZON_FASHION_5.json.gz
Resolving jmcauley.ucsd.edu (jmcauley.ucsd.edu)... 137.110.160.73
Connecting to jmcauley.ucsd.edu (jmcauley.ucsd.edu)|137.110.160.73|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 287013 (280K) [application/x-gzip]
Saving to: ‘AMAZON_FASHION_5.json.gz’


2023-12-19 21:09:01 (1.04 MB/s) - ‘AMAZON_FASHION_5.json.gz’ saved [287013/287013]

--2023-12-19 21:09:01--  https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/All_Beauty_5.json.gz
Resolving jmcauley.ucsd.edu (jmcauley.ucsd.edu)... 137.110.160.73
Connecting to jmcauley.ucsd.edu (jmcauley.ucsd.edu)|137.110.160.73|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 633507 (619K) [application/x-gzip]
Saving to: ‘All_Beauty_5.json.gz’


2023-12-19 21:09:02 (1.83 MB/s) - ‘All

In [None]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

In [None]:
def construct_df(file_name, lines_to_read=50000):
  """Load the first records of a gzip-compressed JSON-lines file into a DataFrame.

  Args:
    file_name: path to a ``.json.gz`` file holding one JSON object per line.
    lines_to_read: maximum number of records to load. Defaults to 50,000.
      Pass ``None`` to read the whole file; zero or a negative value yields
      an empty DataFrame.

  Returns:
    A ``pandas.DataFrame`` with one row per parsed record.

  Raises:
    OSError: if the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: if a non-blank line is not valid JSON.
  """
  data = []
  with gzip.open(file_name) as f:
    for raw_line in f:
      # Check the budget before parsing so no more than lines_to_read records are kept.
      if lines_to_read is not None and len(data) >= lines_to_read:
        break
      record = raw_line.strip()
      # Blank lines (e.g. a trailing newline) carry no record; skip them
      # instead of letting json.loads raise on an empty payload.
      if not record:
        continue
      data.append(json.loads(record))
  return pd.DataFrame(data)

In [None]:
# Load one DataFrame per downloaded archive, in the same order as file_names.
dataframes = list(map(construct_df, file_names))

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Sanity check: there should be one DataFrame per entry in file_names.
len(dataframes)

29

In [None]:
# Per-category train / dev / test frames, filled by the loop in the next cell
# and concatenated afterwards.
train_dfs = []
dev_dfs = []
test_dfs = []

In [None]:
# Start from empty lists so that re-running this cell does not append every
# category a second time on top of the previous run's results.
train_dfs, dev_dfs, test_dfs = [], [], []

for df in dataframes:
    # drop vote, image, and style since they have a lot of null values;
    # errors='ignore' keeps a category that lacks one of these columns from
    # raising KeyError
    df = df.drop(columns=['vote', 'image', 'style'], errors='ignore')
    df = df.dropna()
    # keep only the first review for each distinct summary text
    df = df.drop_duplicates(['summary'])
    # cap each category at 500 reviews; smaller categories are kept whole
    if len(df) >= 500:
      df = df.sample(n=500, random_state=42).reset_index(drop=True)
    # 80% train, then the remaining 20% is halved into dev and test
    train, temp = train_test_split(df, test_size=0.2, random_state=42)
    dev, test = train_test_split(temp, test_size=0.5, random_state=42)
    train_dfs.append(train)
    dev_dfs.append(dev)
    test_dfs.append(test)

In [None]:
# Stack the per-category pieces into a single frame for each split.
train_data, dev_data, test_data = (
    pd.concat(parts) for parts in (train_dfs, dev_dfs, test_dfs)
)

In [None]:
# Replace the index labels carried over from the per-category frames with a
# fresh 0..n-1 range, then summarise columns, dtypes and non-null counts.
train_data = train_data.reset_index(drop=True)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11095 entries, 0 to 11094
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall         11095 non-null  float64
 1   verified        11095 non-null  bool   
 2   reviewTime      11095 non-null  object 
 3   reviewerID      11095 non-null  object 
 4   asin            11095 non-null  object 
 5   reviewerName    11095 non-null  object 
 6   reviewText      11095 non-null  object 
 7   summary         11095 non-null  object 
 8   unixReviewTime  11095 non-null  int64  
dtypes: bool(1), float64(1), int64(1), object(6)
memory usage: 704.4+ KB


In [None]:
# Replace the index labels carried over from the per-category frames with a
# fresh 0..n-1 range, then summarise columns, dtypes and non-null counts.
dev_data = dev_data.reset_index(drop=True)
dev_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1387 entries, 0 to 1386
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall         1387 non-null   float64
 1   verified        1387 non-null   bool   
 2   reviewTime      1387 non-null   object 
 3   reviewerID      1387 non-null   object 
 4   asin            1387 non-null   object 
 5   reviewerName    1387 non-null   object 
 6   reviewText      1387 non-null   object 
 7   summary         1387 non-null   object 
 8   unixReviewTime  1387 non-null   int64  
dtypes: bool(1), float64(1), int64(1), object(6)
memory usage: 88.2+ KB


In [None]:
# Replace the index labels carried over from the per-category frames with a
# fresh 0..n-1 range, then summarise columns, dtypes and non-null counts.
test_data = test_data.reset_index(drop=True)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1388 entries, 0 to 1387
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall         1388 non-null   float64
 1   verified        1388 non-null   bool   
 2   reviewTime      1388 non-null   object 
 3   reviewerID      1388 non-null   object 
 4   asin            1388 non-null   object 
 5   reviewerName    1388 non-null   object 
 6   reviewText      1388 non-null   object 
 7   summary         1388 non-null   object 
 8   unixReviewTime  1388 non-null   int64  
dtypes: bool(1), float64(1), int64(1), object(6)
memory usage: 88.2+ KB


In [None]:
# Write each split to its own CSV file in the working directory. to_csv is
# called with its defaults, so the row index is written as an unnamed first
# column.
for split_frame, csv_path in (
    (train_data, 'train.csv'),
    (dev_data, 'dev.csv'),
    (test_data, 'test.csv'),
):
    split_frame.to_csv(csv_path)