# Notebook Purpose
This notebook confirms that the downloaded datasets are consistent with the baseline paper and explores their contents.

In [1]:
# imports
import pandas as pd

# Amazon 1996-2014 Toys and Games Review Data

This dataset contains a list of reviews, each with the following metadata:
- reviewerID: the ID of the user
- asin: the ID of the product reviewed
- reviewerName: the username of the user
- helpful: the helpfulness votes, expressed as [helpful votes, total votes] (e.g. [2, 3] means 2 of 3 voters found the review helpful)
- reviewText: body text of the review
- overall: the star rating given to the product in the review (out of 5)
- summary: the header (summary text) of the review
- unixReviewTime: time of the review as a Unix timestamp
- reviewTime: time of the review as a raw string

In [2]:
# Path to the Amazon Toys and Games review file.
amazon_5core_path = '../data/datasets/reviews_Toys_and_Games_5.json'

# lines=True makes pandas parse each line of the file as its own JSON record.
amazon_df = pd.read_json(
    amazon_5core_path,
    lines=True,
)

In [3]:
# Print a summary of the review frame: columns, non-null counts, dtypes and memory usage.
amazon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167597 entries, 0 to 167596
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   reviewerID      167597 non-null  object
 1   asin            167597 non-null  object
 2   reviewerName    166759 non-null  object
 3   helpful         167597 non-null  object
 4   reviewText      167597 non-null  object
 5   overall         167597 non-null  int64 
 6   summary         167597 non-null  object
 7   unixReviewTime  167597 non-null  int64 
 8   reviewTime      167597 non-null  object
dtypes: int64(2), object(7)
memory usage: 11.5+ MB


# Yelp 2018 Las Vegas

This dataset contains information from the 2018 Yelp challenge. It consists of 5 JSON files totalling over 20 GB of data.

In [4]:
# Every Yelp table is read with lines=True, i.e. one JSON record per line.
# Each path is kept next to the load that consumes it.

# Businesses
yelp2018_business_path = '../data/datasets/yelp2018/yelp_academic_dataset_business.json'
yelp_business_df = pd.read_json(yelp2018_business_path, lines=True)

# Check-ins
yelp2018_checkin_path = '../data/datasets/yelp2018/yelp_academic_dataset_checkin.json'
yelp_checkin_df = pd.read_json(yelp2018_checkin_path, lines=True)

# Reviews
yelp2018_review_path = '../data/datasets/yelp2018/yelp_academic_dataset_review.json'
yelp_review_df = pd.read_json(yelp2018_review_path, lines=True)

# Tips
yelp2018_tip_path = '../data/datasets/yelp2018/yelp_academic_dataset_tip.json'
yelp_tip_df = pd.read_json(yelp2018_tip_path, lines=True)

# Users
yelp2018_user_path = '../data/datasets/yelp2018/yelp_academic_dataset_user.json'
yelp_user_df = pd.read_json(yelp2018_user_path, lines=True)

In [6]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# display() only renders a stray "None" after each table. Call it directly and
# label each summary so the five outputs can be told apart.
yelp_frames = (
    ("business", yelp_business_df),
    ("checkin", yelp_checkin_df),
    ("review", yelp_review_df),
    ("tip", yelp_tip_df),
    ("user", yelp_user_df),
)
for frame_label, frame in yelp_frames:
    print(f"--- {frame_label} ---")
    frame.info()
    print()  # blank line between consecutive summaries


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   name          150346 non-null  object 
 2   address       150346 non-null  object 
 3   city          150346 non-null  object 
 4   state         150346 non-null  object 
 5   postal_code   150346 non-null  object 
 6   latitude      150346 non-null  float64
 7   longitude     150346 non-null  float64
 8   stars         150346 non-null  float64
 9   review_count  150346 non-null  int64  
 10  is_open       150346 non-null  int64  
 11  attributes    136602 non-null  object 
 12  categories    150243 non-null  object 
 13  hours         127123 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 16.1+ MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131930 entries, 0 to 131929
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  131930 non-null  object
 1   date         131930 non-null  object
dtypes: object(2)
memory usage: 2.0+ MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   stars        int64         
 4   useful       int64         
 5   funny        int64         
 6   cool         int64         
 7   text         object        
 8   date         datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 480.0+ MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 908915 entries, 0 to 908914
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   user_id           908915 non-null  object        
 1   business_id       908915 non-null  object        
 2   text              908915 non-null  object        
 3   date              908915 non-null  datetime64[ns]
 4   compliment_count  908915 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 34.7+ MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987897 entries, 0 to 1987896
Data columns (total 22 columns):
 #   Column              Dtype  
---  ------              -----  
 0   user_id             object 
 1   name                object 
 2   review_count        int64  
 3   yelping_since       object 
 4   useful              int64  
 5   funny               int64  
 6   cool                int64  
 7   elite               object 
 8   friends             object 
 9   fans                int64  
 10  average_stars       float64
 11  compliment_hot      int64  
 12  compliment_more     int64  
 13  compliment_profile  int64  
 14  compliment_cute     int64  
 15  compliment_list     int64  
 16  compliment_note     int64  
 17  compliment_plain    int64  
 18  compliment_cool     int64  
 19  compliment_funny    int64  
 20  compliment_writer   int64  
 21  compliment_photos   int64  
dtypes: float64(1), int64(16), object(5)
memory usage: 333.7+ MB


None