# QA Dataset

In [None]:
# Load QA dataset subset (Electronics)

!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon/qa/qa_Electronics.json.gz

--2024-10-22 15:25:19--  https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon/qa/qa_Electronics.json.gz
Resolving datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)... 132.239.8.30
Connecting to datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)|132.239.8.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34511692 (33M) [application/x-gzip]
Saving to: ‘qa_Electronics.json.gz’


2024-10-22 15:25:20 (65.0 MB/s) - ‘qa_Electronics.json.gz’ saved [34511692/34511692]



In [None]:
# This data loading code is provided with the dataset information:
# https://cseweb.ucsd.edu/~jmcauley/datasets/amazon/qa/

import pandas as pd
import gzip

def parse(path):
  """Yield one parsed record per line of a gzip-compressed text file.

  Every line is expected to hold a single Python literal (presumably one
  dict per question/answer pair -- the upstream loader used eval(), which
  suggests Python-style literals rather than strict JSON).

  Args:
    path: Location of the gzip archive to read.

  Yields:
    The Python object described by each line, in file order.

  Raises:
    ValueError: If a line is not valid UTF-8 or is not a plain literal.
    SyntaxError: If a line is not syntactically valid Python.
  """
  # Function-scope import keeps this cell runnable on its own.
  import ast

  # The context manager closes the archive once the generator is exhausted
  # or discarded; the original handle was never closed.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # literal_eval only accepts literals (dicts, strings, numbers, ...), so a
      # tampered download cannot execute arbitrary code the way eval() would.
      yield ast.literal_eval(l.decode('utf-8'))

def getDF(path):
  """Collect every record produced by ``parse(path)`` into a DataFrame.

  Each record becomes one row, keyed by its 0-based position in the file.
  """
  # Key the records by position so from_dict(orient='index') turns each one
  # into a row.
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

# Relative path: the download cell saves the archive into the current working
# directory, so this no longer depends on that directory being /content
# (presumably the Colab default -- the original hard-coded it).
QA_electronics = getDF('qa_Electronics.json.gz')

In [None]:
# Preview the first five question/answer records
QA_electronics.head(n=5)

Unnamed: 0,questionType,asin,answerTime,unixTime,question,answerType,answer
0,yes/no,594033926,"Dec 27, 2013",1388131000.0,Is this cover the one that fits the old nook c...,Y,Yes this fits both the nook color and the same...
1,yes/no,594033926,"Jan 5, 2015",1420445000.0,Does it fit Nook GlowLight?,N,No. The nook color or color tablet
2,open-ended,594033926,2 days ago,,Would it fit Nook 1st Edition? 4.9in x 7.7in ?,,I don't think so. The nook color is 5 x 8 so n...
3,yes/no,594033926,17 days ago,,Will this fit a Nook Color that's 5 x 8?,Y,yes
4,yes/no,594033926,"Feb 10, 2015",1423555000.0,will this fit the Samsung Galaxy Tab 4 Nook 10.1,N,"No, the tab is smaller than the 'color'"


In [None]:
# Number of distinct products (ASINs) that appear in the QA data.
# dropna=False keeps the count identical to len(Series.unique()).

QA_electronics['asin'].nunique(dropna=False)

39371

In [None]:
# Questions per product: mean, minimum and maximum

counts = QA_electronics['asin'].value_counts()
mean_questions, min_questions, max_questions = counts.mean(), counts.min(), counts.max()
print(mean_questions, min_questions, max_questions)

7.982093419013995 1 40


In [None]:
# How many questions fall into each questionType category

QA_electronics.loc[:, 'questionType'].value_counts()

Unnamed: 0_level_0,count
questionType,Unnamed: 1_level_1
yes/no,165598
open-ended,148665


In [None]:
# Example questions of each type -- 'yes/no' questions here.
# random_state pins the shuffle so the ten rows shown are the same on every run.

(
  QA_electronics[QA_electronics['questionType'] == 'yes/no']
  .sample(frac=1, random_state=42)
  .reset_index(drop=True)
  .head(10)
)

Unnamed: 0,questionType,asin,answerTime,unixTime,question,answerType,answer
0,yes/no,B009GMO3FQ,"Sep 14, 2014",1410678000.0,would this fit a chromebook acer c720?,Y,"If it is 11"" it should fit. The case doesn't l..."
1,yes/no,B006U0Z0O0,"Jul 20, 2013",1374304000.0,Do they come with a neck strap/Can I attach a ...,Y,"Yes, strap is included."
2,yes/no,B0031ERPQY,"Mar 13, 2014",1394694000.0,"will this work off laptop, to display bingo ga...",?,you will need a converter that i picked up for...
3,yes/no,B004EFLMAE,"Jun 27, 2014",1403852000.0,"My ISP is Comcast, does this work with Comcast?",?,This product does not have an integrated modem...
4,yes/no,B004RG4IIS,"Jan 27, 2014",1390810000.0,When using the velcro attachment on the swivel...,?,No it only fits when the swivel LCD is out.
5,yes/no,B004Q79BIA,"Jul 10, 2013",1373440000.0,Do you have to buy a battery charger?,N,"No, there is one in the package, and also you ..."
6,yes/no,B004L6MLPQ,"Sep 15, 2013",1379228000.0,"I have a 15"" MacBook Pro w/ retina display fro...",Y,Yes it will work with all mid-2009 MacBooks
7,yes/no,B00DSUTVK4,14 days ago,,Is this Hard drive compatible with Dell studio...,Y,yes it is
8,yes/no,B009SJR2B4,"Feb 4, 2015",1423037000.0,Will this work with the Xbox One?,?,As long as the Xbox has a 3.5mm input jack for...
9,yes/no,B004J3Y9U6,"Dec 2, 2012",1354435000.0,does this camera take decent action shots?,?,I have two children under five-years-old and t...


In [None]:
# Example 'open-ended' questions.
# random_state pins the shuffle so the ten rows shown are the same on every run.
(
  QA_electronics[QA_electronics['questionType'] == 'open-ended']
  .sample(frac=1, random_state=42)
  .reset_index(drop=True)
  .head(10)
)

Unnamed: 0,questionType,asin,answerTime,unixTime,question,answerType,answer
0,open-ended,B00153EYTO,"Mar 29, 2015",1427612000.0,Does the transfer switch has switched neutral ...,,Neutral is common with house. Its basic groind...
1,open-ended,B0013A12H6,"May 19, 2009",1242716000.0,What are the display speeds available on the S...,,FWIW the times listed to change pictures are: ...
2,open-ended,B001FA1NDC,"Nov 4, 2008",1225786000.0,Any way to turn cover flow off? Can you disabl...,,I just spoke with an Apple rep as I am also fr...
3,open-ended,B004X69BFW,"Feb 22, 2013",1361520000.0,What are the weather display features?,,There are none. This essentially is a standard...
4,open-ended,B00IG4JB8Y,"Mar 16, 2015",1426489000.0,"what is the warranty on these, mine stopped wo...",,"The warranty came in the box, you can read it ..."
5,open-ended,B00133RCCC,"Apr 4, 2015",1428131000.0,Is this what I need for a 2005 Toyota Camry th...,,I don't think you need this kit for your 2005 ...
6,open-ended,B003ZTIANC,"Feb 18, 2014",1392710000.0,Will this Hard Drive work with my Acer Aspire ...,,"its 2.5, it will fit most laptops is all I rea..."
7,open-ended,B002UNFVK4,"Aug 7, 2013",1375859000.0,how do you correct keys that don't respond (wo...,,I freak out and cuss! Even though it shouldn't...
8,open-ended,B004T163SY,"Jan 17, 2014",1389946000.0,overheat,,mine is inside my fairing on a harley street g...
9,open-ended,B00009WBYL,"Nov 25, 2013",1385366000.0,"Dv-62clrs as fronts for old receiver, music only",,I don't see why not. The main difference betwe...


In [None]:
QA_electronics = Dataset.from_pandas(QA_electronics)

In [None]:
# Display the QA data after its conversion with Dataset.from_pandas
QA_electronics

# Reviews Dataset

In [None]:
!pip install datasets
from datasets import load_dataset

reviews_electronics = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Electronics", trust_remote_code=True)



Electronics.jsonl:   0%|          | 0.00/22.6G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/34 [00:00<?, ?it/s]

In [None]:
# Inspect the 'full' split of the loaded reviews (feature names and row count)
reviews_electronics['full']

Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
    num_rows: 43886944
})

In [None]:
# Text of the first 10 reviews: the rows are sliced first, then 'text' is read
reviews_electronics['full'][:10]['text']

['First & most offensive: they reek of gasoline so if you are sensitive/allergic to petroleum products like I am you will want to pass on these.  Second: the phone adapter is useless as-is. Mine was not drilled far enough to be able to tighten it into place for my iPhone 12 max. It just slipped & slid all over. Stupid me putting the adapter together first without picking up the binoculars to smell them bc I wasted 15 minutes trying to figure out how to put the adapter together bc it does not come with instructions!  I had to come back here to the website which was a total pain. Third: the tripod is also useless. I would not trust the iOS to hold my $1600 phone nor even a Mattel Barbie for that matter. It’s just inefficient for the job imo.  Third: in order to try to give an honest review I did don gloves & eyewear to check the binoculars out.  They seemed average except for mine seemed to be missing about 10% of the film costing in the lower edge of one of the lenses which would have r

# Check the number of products shared by both datasets

In [None]:
# ASINs that occur in the QA data as well as in the reviews data

ids_QA = set(QA_electronics['asin'])
ids_reviews = set(reviews_electronics['full']['asin'])

# Set intersection via the & operator
common_ids = ids_QA & ids_reviews
len(common_ids)

29420

In [None]:
from collections import Counter

# Occurrences of each ASIN: questions per product and reviews per product
counter_QA, counter_reviews = (
  Counter(QA_electronics['asin']),
  Counter(reviews_electronics['full']['asin']),
)

In [None]:
# For every shared product, look up its number of questions and of reviews.
# Both lists are built from the same iteration order over common_ids.
QA_num_per_product = [counter_QA[product_id] for product_id in common_ids]
reviews_num_per_product = [counter_reviews[product_id] for product_id in common_ids]

In [None]:
# Mean number of questions and mean number of reviews for the same product

import numpy as np

# (mean questions, mean reviews) over the products present in both datasets
tuple(np.mean(per_product) for per_product in (QA_num_per_product, reviews_num_per_product))

(8.04887831407206, 194.1732154996601)