# Dataset Collection
The original dataset is available at https://nijianmo.github.io/amazon/index.html

We select every product category that has at least 25,000 reviews.


In [1]:
# Product categories to download (23 entries). Each display name is turned
# into a "<Category_Name>_5.json.gz" archive name by generate_file_names, so
# the spelling and punctuation here determine the file that gets requested.
categories = [
    "Arts, Crafts and Sewing",
    "Automotive",
    "Books",
    "CDs and Vinyl",
    "Cell Phones and Accessories",
    "Clothing, Shoes and Jewelry",
    "Digital Music",
    "Electronics",
    "Grocery and Gourmet Food",
    "Home and Kitchen",
    "Industrial and Scientific",
    "Kindle Store",
    "Luxury Beauty",
    "Movies and TV",
    "Musical Instruments",
    "Office Products",
    "Patio, Lawn and Garden",
    "Pet Supplies",
    "Prime Pantry",
    "Sports and Outdoors",
    "Tools and Home Improvement",
    "Toys and Games",
    "Video Games"
]

In [2]:
def generate_file_names(categories):
  """Map human-readable category names to dataset archive file names.

  Spaces become underscores and commas are removed, then the
  ``_5.json.gz`` suffix is appended, e.g.
  ``"Arts, Crafts and Sewing"`` -> ``"Arts_Crafts_and_Sewing_5.json.gz"``.

  Args:
    categories: iterable of category name strings.

  Returns:
    A list of file names, in the same order as ``categories``.
  """
  return [
    f"{name.replace(' ', '_').replace(',', '')}_5.json.gz"
    for name in categories
  ]

In [3]:
def generate_urls(file_names):
    """Build the download URL for each archive file name.

    Args:
        file_names: iterable of archive names such as ``"Books_5.json.gz"``.

    Returns:
        A list of URLs, in the same order as ``file_names``, each formed by
        appending the file name to the ``categoryFilesSmall`` base URL.
    """
    prefix = "https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/"
    return [f"{prefix}{name}" for name in file_names]

In [4]:
# Derive one archive file name per category; the bare name on the last line
# displays the resulting list as the cell output.
file_names = generate_file_names(categories)
file_names

['Arts_Crafts_and_Sewing_5.json.gz',
 'Automotive_5.json.gz',
 'Books_5.json.gz',
 'CDs_and_Vinyl_5.json.gz',
 'Cell_Phones_and_Accessories_5.json.gz',
 'Clothing_Shoes_and_Jewelry_5.json.gz',
 'Digital_Music_5.json.gz',
 'Electronics_5.json.gz',
 'Grocery_and_Gourmet_Food_5.json.gz',
 'Home_and_Kitchen_5.json.gz',
 'Industrial_and_Scientific_5.json.gz',
 'Kindle_Store_5.json.gz',
 'Luxury_Beauty_5.json.gz',
 'Movies_and_TV_5.json.gz',
 'Musical_Instruments_5.json.gz',
 'Office_Products_5.json.gz',
 'Patio_Lawn_and_Garden_5.json.gz',
 'Pet_Supplies_5.json.gz',
 'Prime_Pantry_5.json.gz',
 'Sports_and_Outdoors_5.json.gz',
 'Tools_and_Home_Improvement_5.json.gz',
 'Toys_and_Games_5.json.gz',
 'Video_Games_5.json.gz']

In [5]:
# Turn every archive file name into its full download URL and display the
# list so the targets can be checked before downloading.
download_urls = generate_urls(file_names)
download_urls

['https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Arts_Crafts_and_Sewing_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Automotive_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Books_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/CDs_and_Vinyl_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Cell_Phones_and_Accessories_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Clothing_Shoes_and_Jewelry_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Digital_Music_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Electronics_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Grocery_and_Gourmet_Food_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Home_and_Kitchen_5.json.gz',
 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Industrial_and_Scient

In [6]:
for url in download_urls:
  !wget --no-check-certificate $url

--2023-11-30 02:19:05--  https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Arts_Crafts_and_Sewing_5.json.gz
Resolving jmcauley.ucsd.edu (jmcauley.ucsd.edu)... 137.110.160.73
Connecting to jmcauley.ucsd.edu (jmcauley.ucsd.edu)|137.110.160.73|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 53965563 (51M) [application/x-gzip]
Saving to: ‘Arts_Crafts_and_Sewing_5.json.gz’


2023-11-30 02:19:05 (101 MB/s) - ‘Arts_Crafts_and_Sewing_5.json.gz’ saved [53965563/53965563]

--2023-11-30 02:19:06--  https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Automotive_5.json.gz
Resolving jmcauley.ucsd.edu (jmcauley.ucsd.edu)... 137.110.160.73
Connecting to jmcauley.ucsd.edu (jmcauley.ucsd.edu)|137.110.160.73|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 202890818 (193M) [application/x-gzip]
Saving to: ‘Automotive_5.json.gz’


2023-11

In [7]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

In [8]:
def construct_df(file_name, lines_to_read=50000):
  """Load the first records of a gzipped JSON-lines file into a DataFrame.

  Args:
    file_name: path to a gzip-compressed file holding one JSON object per
      line.
    lines_to_read: maximum number of records to load. Defaults to 50,000;
      pass ``None`` to load the whole file. Zero or a negative value loads
      nothing.

  Returns:
    A ``pandas.DataFrame`` with one row per parsed record (empty if no
    record was read).
  """
  records = []
  with gzip.open(file_name) as f:
    for raw_line in f:
      # Stop as soon as the cap is reached so the rest of a large archive is
      # never decompressed.
      if lines_to_read is not None and len(records) >= lines_to_read:
        break
      stripped = raw_line.strip()
      if not stripped:
        # Tolerate blank lines (e.g. a trailing newline) instead of letting
        # json.loads fail on an empty document.
        continue
      records.append(json.loads(stripped))
  return pd.DataFrame(records)

In [9]:
# Load one DataFrame per downloaded archive, in the same order as file_names.
dataframes = list(map(construct_df, file_names))

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [21]:
# Sanity check: there should be one DataFrame per entry in file_names.
len(dataframes)

23

In [22]:
# Accumulators for the per-category train / dev / test splits; three distinct
# empty lists that are filled by the splitting loop and concatenated later.
train_dfs, dev_dfs, test_dfs = [], [], []

In [23]:
# Start from empty accumulators so that re-running this cell does not append
# a second copy of every split to the lists.
train_dfs, dev_dfs, test_dfs = [], [], []

# Number of reviews kept per category so every category contributes equally.
rows_per_category = 25000

for position, df in enumerate(dataframes):
    # drop votes, images, and style since they have a lot of null values;
    # errors='ignore' tolerates a category whose loaded rows never had them
    df = df.drop(['vote', 'image', 'style'], axis=1, errors='ignore')
    # drop any rows that have null values
    df = df.dropna()
    # drop any duplicate reviews
    df = df.drop_duplicates(['asin','reviewerID', 'summary'])
    # fail with an explicit message instead of pandas' generic sampling error
    # when cleaning leaves too few rows for an even per-category sample
    if len(df) < rows_per_category:
        raise ValueError(
            f"dataframes[{position}] has only {len(df)} rows after cleaning; "
            f"{rows_per_category} are required"
        )
    # only sample 25,000 rows from each dataframe to keep an even distribution
    df = df.sample(n=rows_per_category, random_state=42).reset_index(drop=True)
    # 80% train; the held-out 20% is then halved into dev and test (10% each)
    train, temp = train_test_split(df, test_size=0.2, random_state=42)
    dev, test = train_test_split(temp, test_size=0.5, random_state=42)
    train_dfs.append(train)
    dev_dfs.append(dev)
    test_dfs.append(test)

In [24]:
# Stack the per-category splits into one frame per split. ignore_index=True
# gives each result a fresh 0..n-1 index right away; without it the row
# labels inherited from the individual splits repeat across categories.
train_data = pd.concat(train_dfs, ignore_index=True)
dev_data = pd.concat(dev_dfs, ignore_index=True)
test_data = pd.concat(test_dfs, ignore_index=True)

In [42]:
# Replace the index carried over from the per-category splits with a fresh
# RangeIndex (the old labels are discarded), then print a summary of the
# columns, dtypes and non-null counts of the training split.
train_data = train_data.reset_index(drop=True)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460000 entries, 0 to 459999
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   overall         460000 non-null  float64
 1   verified        460000 non-null  bool   
 2   reviewTime      460000 non-null  object 
 3   reviewerID      460000 non-null  object 
 4   asin            460000 non-null  object 
 5   reviewerName    460000 non-null  object 
 6   reviewText      460000 non-null  object 
 7   summary         460000 non-null  object 
 8   unixReviewTime  460000 non-null  int64  
dtypes: bool(1), float64(1), int64(1), object(6)
memory usage: 28.5+ MB


In [43]:
# Replace the index carried over from the per-category splits with a fresh
# RangeIndex (the old labels are discarded), then print a summary of the
# columns, dtypes and non-null counts of the dev split.
dev_data = dev_data.reset_index(drop=True)
dev_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57500 entries, 0 to 57499
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall         57500 non-null  float64
 1   verified        57500 non-null  bool   
 2   reviewTime      57500 non-null  object 
 3   reviewerID      57500 non-null  object 
 4   asin            57500 non-null  object 
 5   reviewerName    57500 non-null  object 
 6   reviewText      57500 non-null  object 
 7   summary         57500 non-null  object 
 8   unixReviewTime  57500 non-null  int64  
dtypes: bool(1), float64(1), int64(1), object(6)
memory usage: 3.6+ MB


In [44]:
# Replace the index carried over from the per-category splits with a fresh
# RangeIndex (the old labels are discarded), then print a summary of the
# columns, dtypes and non-null counts of the test split.
test_data = test_data.reset_index(drop=True)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57500 entries, 0 to 57499
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall         57500 non-null  float64
 1   verified        57500 non-null  bool   
 2   reviewTime      57500 non-null  object 
 3   reviewerID      57500 non-null  object 
 4   asin            57500 non-null  object 
 5   reviewerName    57500 non-null  object 
 6   reviewText      57500 non-null  object 
 7   summary         57500 non-null  object 
 8   unixReviewTime  57500 non-null  int64  
dtypes: bool(1), float64(1), int64(1), object(6)
memory usage: 3.6+ MB


In [46]:
# Persist the three splits to the current working directory.
# NOTE: to_csv is called with its default index=True, so each file starts
# with an unnamed column holding the row index; readers should account for
# it (e.g. index_col=0) when loading these files.
train_data.to_csv('train.csv')
dev_data.to_csv('dev.csv')
test_data.to_csv('test.csv')

In [47]:
# Draw a small 10-row preview of the training split. The fixed random_state
# (the same seed used for the splits) makes sample.csv reproducible across
# runs instead of changing on every execution.
sample_data = train_data.sample(n=10, random_state=42).reset_index(drop=True)

In [48]:
# Save the 10-row preview next to the full splits (the index column is
# written too, since to_csv defaults to index=True).
sample_data.to_csv('sample.csv')