## Downloading raw data

In [4]:
import os
import requests

# URLs of the raw Yelp CSV files to fetch.
urls = [
    "https://storage.googleapis.com/penn-cis5450/yelp_business.csv",
    "https://storage.googleapis.com/penn-cis5450/yelp_checkin.csv",
    "https://storage.googleapis.com/penn-cis5450/yelp_tip.csv",
    "https://storage.googleapis.com/penn-cis5450/yelp_user.csv"
]

# (connect, read) timeouts in seconds, so a stalled connection cannot hang
# the notebook forever. The read timeout applies between received chunks,
# not to the whole transfer, so large files are still fine.
DOWNLOAD_TIMEOUT = (10, 60)

# Bytes written per chunk; streaming keeps memory use flat instead of
# buffering an entire CSV in memory before writing it.
DOWNLOAD_CHUNK_SIZE = 1 << 20

# Download each file that is not already in the working directory.
for url in urls:
    file_name = url.split('/')[-1]
    file_path = os.path.join(os.getcwd(), file_name)

    # Skip files fetched by an earlier run.
    if os.path.exists(file_path):
        continue

    # Write to a temporary ".part" file and rename it only once the whole
    # body is on disk. An interrupted download therefore never leaves a
    # truncated CSV that the existence check above would accept on re-run.
    partial_path = file_path + ".part"

    with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT) as response:
        if response.status_code != 200:
            print(f"Failed to download {file_name}")
            continue

        try:
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                    f.write(chunk)
        except BaseException:
            # Also covers a kernel interrupt: drop the partial file, then
            # let the error propagate exactly as it would have before.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    # Atomic within one directory: file_path either does not exist or is complete.
    os.replace(partial_path, file_path)
    print(f"Downloaded: {file_name}")


## EDA

In [5]:
import pandas as pd

In [6]:
# Load the four raw Yelp tables from the current working directory.
# The order of the file names matches the order of the target variables.
raw_dir = os.getcwd()
business_df, checkin_df, tip_df, user_df = (
    pd.read_csv(os.path.join(raw_dir, csv_name))
    for csv_name in (
        "yelp_business.csv",
        "yelp_checkin.csv",
        "yelp_tip.csv",
        "yelp_user.csv",
    )
)

### business data

In [7]:
# Column names, dtypes, non-null counts and memory footprint of the raw business table.
business_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174567 entries, 0 to 174566
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   174567 non-null  object 
 1   name          174567 non-null  object 
 2   neighborhood  68015 non-null   object 
 3   address       174567 non-null  object 
 4   city          174566 non-null  object 
 5   state         174566 non-null  object 
 6   postal_code   173944 non-null  object 
 7   latitude      174566 non-null  float64
 8   longitude     174566 non-null  float64
 9   stars         174567 non-null  float64
 10  review_count  174567 non-null  int64  
 11  is_open       174567 non-null  int64  
 12  categories    174567 non-null  object 
dtypes: float64(3), int64(2), object(8)
memory usage: 17.3+ MB


In [8]:
# Preview the first two rows of the raw business table.
business_df.head(2)

Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",,"""4855 E Warner Rd, Ste B9""",Ahwatukee,AZ,85044,33.33069,-111.978599,4.0,22,1,Dentists;General Dentistry;Health & Medical;Or...
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",,"""3101 Washington Rd""",McMurray,PA,15317,40.291685,-80.1049,3.0,11,1,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...


In [9]:
# Build the cleaned business table as a single pipeline; business_df is left untouched.
business_cleaned_df = (
    business_df
    # neighborhood is null for most rows, so it is dropped
    .drop(columns="neighborhood")
    .assign(
        # name and address arrive wrapped in literal double quotes
        name=lambda frame: frame["name"].str.strip('"'),
        address=lambda frame: frame["address"].str.strip('"'),
        # "A;B;C" -> ["A", "B", "C"]
        categories=lambda frame: frame["categories"].str.split(';'),
        # collapse whitespace runs, trim both ends, then title-case each word
        # NOTE: .str.title() also lowercases interior capitals ("McMurray" -> "Mcmurray")
        city=lambda frame: frame["city"].str.replace(r"\s+", " ", regex=True).str.strip().str.title(),
    )
    .assign(
        # 1 when 'Restaurants' is one of the split category labels, else 0
        is_restaurant=lambda frame: frame["categories"].apply(
            lambda labels: int("Restaurants" in labels)
        )
    )
    .reset_index(drop=True)
)

# Export the cleaned table (the index is written as the first CSV column).
business_cleaned_df.to_csv("cleaned_business.csv")

In [10]:
# Final schema of the cleaned business table: neighborhood removed, is_restaurant added.
business_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174567 entries, 0 to 174566
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   business_id    174567 non-null  object 
 1   name           174567 non-null  object 
 2   address        174567 non-null  object 
 3   city           174566 non-null  object 
 4   state          174566 non-null  object 
 5   postal_code    173944 non-null  object 
 6   latitude       174566 non-null  float64
 7   longitude      174566 non-null  float64
 8   stars          174567 non-null  float64
 9   review_count   174567 non-null  int64  
 10  is_open        174567 non-null  int64  
 11  categories     174567 non-null  object 
 12  is_restaurant  174567 non-null  int64  
dtypes: float64(3), int64(3), object(7)
memory usage: 17.3+ MB


### checkin data

In [11]:
# Schema of the raw check-in table. show_counts=True prints the non-null
# counts, which info() leaves out by default on a frame this large.
checkin_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3911218 entries, 0 to 3911217
Data columns (total 4 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   business_id  3911218 non-null  object
 1   weekday      3911218 non-null  object
 2   hour         3911218 non-null  object
 3   checkins     3911218 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 119.4+ MB


In [12]:
# Preview the first two rows of the raw check-in table.
checkin_df.head(2)

Unnamed: 0,business_id,weekday,hour,checkins
0,3Mc-LxcqeguOXOVT_2ZtCg,Tue,0:00,12
1,SVFx6_epO22bZTZnKwlX7g,Wed,0:00,4


In [23]:
# Create a copy of checkin_df called checkin_cleaned_df
checkin_cleaned_df = checkin_df.copy()

# Export the copy unchanged: the check-in table needs no cleaning steps
checkin_cleaned_df.to_csv("cleaned_checkin.csv")

In [24]:
# Final schema of the cleaned check-in table.
# show_counts=True keeps the non-null counts in the report, matching the other
# final-schema cells; info() omits them by default on a frame this large.
checkin_cleaned_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3911218 entries, 0 to 3911217
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   business_id  object
 1   weekday      object
 2   hour         object
 3   checkins     int64 
dtypes: int64(1), object(3)
memory usage: 119.4+ MB


### user data

In [15]:
# Column names, dtypes, non-null counts and memory footprint of the raw user table.
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1326100 entries, 0 to 1326099
Data columns (total 22 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   user_id             1326100 non-null  object 
 1   name                1325585 non-null  object 
 2   review_count        1326100 non-null  int64  
 3   yelping_since       1326100 non-null  object 
 4   friends             760007 non-null   object 
 5   useful              1326100 non-null  int64  
 6   funny               1326100 non-null  int64  
 7   cool                1326100 non-null  int64  
 8   fans                1326100 non-null  int64  
 9   elite               60818 non-null    object 
 10  average_stars       1326100 non-null  float64
 11  compliment_hot      1326100 non-null  int64  
 12  compliment_more     1326100 non-null  int64  
 13  compliment_profile  1326100 non-null  int64  
 14  compliment_cute     1326100 non-null  int64  
 15  compliment_list

In [16]:
# Preview the first two rows of the raw user table.
user_df.head(2)

Unnamed: 0,user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,elite,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,JJ-aSuM4pCFPdkfoZ34q0Q,Chris,10,2013-09-24,"0njfJmB-7n84DlIgUByCNw, rFn3Xe3RqHxRSxWOU19Gpg...",0,0,0,0,,...,0,0,0,0,0,0,0,0,0,0
1,uUzsFQn_6cXDh6rPNGbIFA,Tiffy,1,2017-03-02,,0,0,0,0,,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Columns kept in the cleaned user table; every other column of user_df
# (friends, the vote counters, the compliment_* columns, ...) is discarded.
USER_COLUMNS = ["user_id", "name", "review_count", "yelping_since", "elite", "average_stars"]

# Arbitrary cutoff used only to trim the data set. The comparison below is a
# plain string comparison; it orders dates correctly assuming every
# yelping_since value is a zero-padded YYYY-MM-DD string.
YELPING_SINCE_CUTOFF = "2013-09-24"

# Select rows and columns in one step, then sort by user_id and renumber.
# Each step returns a new frame, so user_df is never modified and nothing is
# mutated in place on a filtered slice.
user_cleaned_df = (
    user_df
    .loc[user_df["yelping_since"] > YELPING_SINCE_CUTOFF, USER_COLUMNS]
    .sort_values("user_id")
    .reset_index(drop=True)
)

# Export the cleaned table (the index is written as the first CSV column).
user_cleaned_df.to_csv("cleaned_user.csv")

In [27]:
# Final schema of the cleaned user table: six columns, rows after the yelping_since cutoff only.
user_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 663111 entries, 0 to 663110
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   user_id        663111 non-null  object 
 1   name           662745 non-null  object 
 2   review_count   663111 non-null  int64  
 3   yelping_since  663111 non-null  object 
 4   elite          11675 non-null   object 
 5   average_stars  663111 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 30.4+ MB


### tip data

In [28]:
# Column names, dtypes, non-null counts and memory footprint of the raw tip table.
tip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098324 entries, 0 to 1098323
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   text         1098319 non-null  object
 1   date         1098324 non-null  object
 2   likes        1098324 non-null  int64 
 3   business_id  1098324 non-null  object
 4   user_id      1098324 non-null  object
dtypes: int64(1), object(4)
memory usage: 41.9+ MB


In [29]:
# Preview the first two rows of the raw tip table.
tip_df.head(2)

Unnamed: 0,text,date,likes,business_id,user_id
0,Great breakfast large portions and friendly wa...,2015-08-12,0,jH19V2I9fIslnNhDzPmdkA,ZcLKXikTHYOnYt5VYRO5sg
1,Nice place. Great staff. A fixture in the tow...,2014-06-20,0,dAa0hB2yrnHzVmsCkN4YvQ,oaYhjqBbh18ZhU0bpyzSuw


In [30]:
# Build tip_cleaned_df in one chain; tip_df is left untouched.
tip_cleaned_df = (
    tip_df
    # parse the date strings into datetime64[ns]
    .assign(date=lambda frame: pd.to_datetime(frame["date"], format='%Y-%m-%d'))
    # keep only tips whose user_id appears in user_cleaned_df (foreign-key requirement)
    .loc[lambda frame: frame["user_id"].isin(user_cleaned_df['user_id'])]
    # renumber the surviving rows from 0
    .reset_index(drop=True)
)

# Export the cleaned table (the index is written as the first CSV column).
tip_cleaned_df.to_csv("cleaned_tip.csv")

In [22]:
# Final schema of the cleaned tip table: date parsed to datetime, rows limited to users in user_cleaned_df.
tip_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247599 entries, 0 to 247598
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   text         247599 non-null  object        
 1   date         247599 non-null  datetime64[ns]
 2   likes        247599 non-null  int64         
 3   business_id  247599 non-null  object        
 4   user_id      247599 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 9.4+ MB
