## Downloading raw data

In [None]:
import tarfile

# Path to the .tar file
# this is obtained from yelp directly
tar_file_path = 'yelp_dataset.tar'

# Open the tar file
with tarfile.open(tar_file_path, 'r') as tar:
    # Extract all files in the current directory (or specify a path)
    tar.extractall()  # Extract to 'extracted_files' folder

print("Extraction Complete!")

Extraction Complete!


## EDA

In [24]:
import pandas as pd
import json

### business data

In [33]:
data_file = open("yelp_academic_dataset_business.json")
data = []
for line in data_file:
    data.append(json.loads(line))
business_df = pd.DataFrame(data)
data_file.close()

In [34]:
business_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   name          150346 non-null  object 
 2   address       150346 non-null  object 
 3   city          150346 non-null  object 
 4   state         150346 non-null  object 
 5   postal_code   150346 non-null  object 
 6   latitude      150346 non-null  float64
 7   longitude     150346 non-null  float64
 8   stars         150346 non-null  float64
 9   review_count  150346 non-null  int64  
 10  is_open       150346 non-null  int64  
 11  attributes    136602 non-null  object 
 12  categories    150243 non-null  object 
 13  hours         127123 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 16.1+ MB


In [35]:
business_df.head(2)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."


In [47]:
# create a copy
business_cleaned_df = business_df.copy()

# Standardize the city column using regex (remove extra spaces and capitalizing words)
business_cleaned_df["city"] = business_cleaned_df["city"].str.replace(r"\s+", " ", regex=True).str.strip().str.title()

# drop unnecessary columns
business_cleaned_df.drop(["stars", "latitude", "longitude", "review_count", "attributes"],
                         axis=1,
                         inplace=True)

# drop business with state that does not belong to USA
abbreviations = [
    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#States.
    "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "IA",
    "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO",
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK",
    "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI",
    "WV", "WY",
    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Federal_district.
    "DC",
    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Inhabited_territories.
    "AS", "GU", "MP", "PR", "VI",
]
business_cleaned_df = business_cleaned_df[business_cleaned_df['state'].isin(abbreviations)]

# drop any null row
business_cleaned_df.dropna(axis=0, inplace=True)

# Drop and reset index
business_cleaned_df.reset_index(drop=True, inplace=True)

In [48]:
# final schema
business_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122577 entries, 0 to 122576
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  122577 non-null  object
 1   name         122577 non-null  object
 2   address      122577 non-null  object
 3   city         122577 non-null  object
 4   state        122577 non-null  object
 5   postal_code  122577 non-null  object
 6   is_open      122577 non-null  int64 
 7   categories   122577 non-null  object
 8   hours        122577 non-null  object
dtypes: int64(1), object(8)
memory usage: 8.4+ MB


In [52]:
business_cleaned_df.head(2)

Unnamed: 0,business_id,name,address,city,state,postal_code,is_open,categories,hours
0,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,1,"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
1,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,0,"Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."


### checkin data

In [49]:
data_file = open("yelp_academic_dataset_checkin.json")
data = []
for line in data_file:
    data.append(json.loads(line))
checkin_df = pd.DataFrame(data)
data_file.close()

In [50]:
checkin_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131930 entries, 0 to 131929
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  131930 non-null  object
 1   date         131930 non-null  object
dtypes: object(2)
memory usage: 2.0+ MB


In [51]:
checkin_df.head(2)

Unnamed: 0,business_id,date
0,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020..."
1,--0iUa4sNDFiZFrAdIWhZQ,"2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011..."


In [62]:
# Create a copy of checkin_df called  checkin_cleaned_df
checkin_cleaned_df = checkin_df.copy()

# impose the foreign key constraint on business_id
checkin_cleaned_df = checkin_cleaned_df[checkin_cleaned_df["business_id"].isin(business_cleaned_df["business_id"].to_list())]

# explode date so that each date of a business takes a single row
checkin_cleaned_df["date"] = checkin_cleaned_df["date"].str.split(', ')
checkin_cleaned_df = checkin_cleaned_df.explode("date")

# sample 200k rows without replacement
max_size = 200000
if len(checkin_cleaned_df) > max_size:
    checkin_cleaned_df = checkin_cleaned_df.sample(n=max_size, replace=False)

# reset index
checkin_cleaned_df.reset_index(drop=True, inplace=True)

In [63]:
# final schema
checkin_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  200000 non-null  object
 1   date         200000 non-null  object
dtypes: object(2)
memory usage: 3.1+ MB


In [64]:
checkin_cleaned_df.head(2)

Unnamed: 0,business_id,date
0,_7849WzN9IkhmO-CRjWctg,2011-10-18 21:14:22
1,e-i6jvgVv4TqjsYT99-ajg,2011-07-31 15:22:58


### user data

In [67]:
data_file = open("yelp_academic_dataset_user.json")
data = []
for line in data_file:
    data.append(json.loads(line))
user_df = pd.DataFrame(data)
data_file.close()

In [68]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987897 entries, 0 to 1987896
Data columns (total 22 columns):
 #   Column              Dtype  
---  ------              -----  
 0   user_id             object 
 1   name                object 
 2   review_count        int64  
 3   yelping_since       object 
 4   useful              int64  
 5   funny               int64  
 6   cool                int64  
 7   elite               object 
 8   friends             object 
 9   fans                int64  
 10  average_stars       float64
 11  compliment_hot      int64  
 12  compliment_more     int64  
 13  compliment_profile  int64  
 14  compliment_cute     int64  
 15  compliment_list     int64  
 16  compliment_note     int64  
 17  compliment_plain    int64  
 18  compliment_cool     int64  
 19  compliment_funny    int64  
 20  compliment_writer   int64  
 21  compliment_photos   int64  
dtypes: float64(1), int64(16), object(5)
memory usage: 333.7+ MB


In [70]:
user_df.head(2)

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946


In [None]:
# Create a copy of user_df called user_cleaned_df
user_cleaned_df = user_df.copy()

# keep only necessary columns
user_cleaned_df = user_cleaned_df[["user_id", "name", "yelping_since", "elite"]]

# drop nulls
user_cleaned_df.dropna(axis=0, inplace=True)

# sample 400k rows without replacement
max_size = 400000
if len(user_cleaned_df) > max_size:
    user_cleaned_df = user_cleaned_df.sample(n=max_size, replace=False)

# reset index
user_cleaned_df.reset_index(drop=True, inplace=True)

In [72]:
# final schema
user_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   user_id        400000 non-null  object
 1   name           400000 non-null  object
 2   yelping_since  400000 non-null  object
 3   elite          400000 non-null  object
dtypes: object(4)
memory usage: 12.2+ MB


In [76]:
user_cleaned_df.head(2)

Unnamed: 0,user_id,name,yelping_since,elite
0,_jlnfcSbOUk_BWE3fSxGTA,Jamie,2014-12-08 21:11:18,
1,j04hg8W43ljo21yEMtsJBw,Ashley,2016-05-22 18:57:10,


### review data

In [85]:
data_file = open("yelp_academic_dataset_review.json")
data = []
for line in data_file:
    data.append(json.loads(line))
review_df = pd.DataFrame(data)

In [86]:
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype  
---  ------       -----  
 0   review_id    object 
 1   user_id      object 
 2   business_id  object 
 3   stars        float64
 4   useful       int64  
 5   funny        int64  
 6   cool         int64  
 7   text         object 
 8   date         object 
dtypes: float64(1), int64(3), object(5)
memory usage: 480.0+ MB


In [87]:
review_df.head(2)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18


In [88]:
# keep only desired columns
review_cleaned_df = review_df[["review_id", "user_id", "business_id", "stars", "text", "date"]].copy()

#impose foreign key constraints
review_cleaned_df = review_cleaned_df[review_cleaned_df["user_id"].isin(user_cleaned_df["user_id"].to_list()) &
                                      review_cleaned_df["business_id"].isin(business_cleaned_df["business_id"].to_list())]

# drop nulls
review_cleaned_df = review_cleaned_df.dropna(axis=0)

# sample 400k rows without replacement
max_size = 400000
if len(review_cleaned_df) > max_size:
    review_cleaned_df = review_cleaned_df.sample(n=max_size, replace=False)

# reset index
review_cleaned_df.reset_index(drop=True, inplace=True)

In [89]:
review_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   review_id    400000 non-null  object 
 1   user_id      400000 non-null  object 
 2   business_id  400000 non-null  object 
 3   stars        400000 non-null  float64
 4   text         400000 non-null  object 
 5   date         400000 non-null  object 
dtypes: float64(1), object(5)
memory usage: 18.3+ MB


In [90]:
review_cleaned_df.head(2)

Unnamed: 0,review_id,user_id,business_id,stars,text,date
0,pf8MtIUz7HYzHRZWvzyKVQ,UM0lTNFr_sRHYDdoF0IZEw,auItliN-PnrS6m-_oJ0g6Q,1.0,So I was first introduced to Golden Corral a f...,2012-06-03 17:54:44
1,_SZ42t1gevRBDBt-ZvKfqQ,OhsASyV6SnNnbZKiiX0Feg,66uw4X0YfHk6jU8x3ECjsQ,1.0,"Yesterday, Wayne & I had a lot of errands. The...",2019-10-25 15:04:52


### tip data

In [92]:
data_file = open("yelp_academic_dataset_tip.json")
data = []
for line in data_file:
    data.append(json.loads(line))
tip_df = pd.DataFrame(data)

In [93]:
tip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 908915 entries, 0 to 908914
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_id           908915 non-null  object
 1   business_id       908915 non-null  object
 2   text              908915 non-null  object
 3   date              908915 non-null  object
 4   compliment_count  908915 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 34.7+ MB


In [94]:
tip_df.head(2)

Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban...,2013-02-05 18:35:10,0


In [None]:
# Create a copy of tip_df called tip_cleaned_df
tip_cleaned_df = tip_df.copy()

# drop unused column
tip_cleaned_df.drop('compliment_count', axis=1, inplace=True)

# trim data to satisfy the foreign key requirement to cleaned_user_df
tip_cleaned_df = tip_cleaned_df[tip_cleaned_df["user_id"].isin(user_cleaned_df['user_id']) &
                                tip_cleaned_df["business_id"].isin(business_cleaned_df['business_id'])]

# sample 200k rows without replacement
max_size = 200000
if len(tip_cleaned_df) > max_size:
    tip_cleaned_df = tip_cleaned_df.sample(n=max_size, replace=False)

# reset index
tip_cleaned_df.reset_index(drop=True, inplace=True)

In [97]:
# final schema
tip_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169777 entries, 0 to 169776
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      169777 non-null  object
 1   business_id  169777 non-null  object
 2   text         169777 non-null  object
 3   date         169777 non-null  object
dtypes: object(4)
memory usage: 5.2+ MB


In [98]:
tip_cleaned_df.head(2)

Unnamed: 0,user_id,business_id,text,date
0,MlnuJ7T14CE0JDK2ZIOx5g,MDr7KLYSPkEonvGojNEMBw,Let's go Yankees!,2011-07-20 21:52:57
1,tA1U-XSh9woo73eQmWGyAQ,xHwvbm1SJwtaZtOZzFQcmQ,If you haven't been here in a good while or i...,2016-06-11 23:18:23


### export data and examine

In [None]:
# export to a new csv
business_cleaned_df.to_csv("cleaned_business.csv", index=False)
checkin_cleaned_df.to_csv("cleaned_checkin.csv", index=False)
user_cleaned_df.to_csv("cleaned_user.csv", index=False)
review_cleaned_df.to_csv("cleaned_review.csv", index=False)
tip_cleaned_df.to_csv("cleaned_tip.csv", index=False)

In [101]:
pd.read_csv("cleaned_business.csv").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122577 entries, 0 to 122576
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   business_id  122577 non-null  object 
 1   name         122577 non-null  object 
 2   address      118593 non-null  object 
 3   city         122577 non-null  object 
 4   state        122577 non-null  object 
 5   postal_code  122558 non-null  float64
 6   is_open      122577 non-null  int64  
 7   categories   122577 non-null  object 
 8   hours        122577 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 8.4+ MB


In [103]:
pd.read_csv("cleaned_checkin.csv").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  200000 non-null  object
 1   date         200000 non-null  object
dtypes: object(2)
memory usage: 3.1+ MB


In [100]:
pd.read_csv("cleaned_user.csv").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   user_id        400000 non-null  object
 1   name           399992 non-null  object
 2   yelping_since  400000 non-null  object
 3   elite          18403 non-null   object
dtypes: object(4)
memory usage: 12.2+ MB


In [104]:
pd.read_csv("cleaned_review.csv").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   review_id    400000 non-null  object 
 1   user_id      400000 non-null  object 
 2   business_id  400000 non-null  object 
 3   stars        400000 non-null  float64
 4   text         400000 non-null  object 
 5   date         400000 non-null  object 
dtypes: float64(1), object(5)
memory usage: 18.3+ MB


In [105]:
pd.read_csv("cleaned_tip.csv").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169777 entries, 0 to 169776
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      169777 non-null  object
 1   business_id  169777 non-null  object
 2   text         169774 non-null  object
 3   date         169777 non-null  object
dtypes: object(4)
memory usage: 5.2+ MB
