<a href="https://colab.research.google.com/github/MattBaudoin/SCMT610_GroupProject/blob/main/Group_Project_Part_A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Group 1

Matthew Baudoin
Kevin Brown
Chelsea Jacobo
Grace Morris
Stephie Noel
Kal Zapotocky

In [1]:
# Environment setup: Mount Google Drive and load dependencies

from google.colab import drive
drive.mount('/content/drive/')

import os
import json
import time
import pandas as pd


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
#Dataset configuration: define path and verify accessibility

BASE = '/content/drive/Shared drives/Team1Share/GroupProject/Yelp_Dataset'
FILE = 'yelp_academic_dataset_review.json'
DATA_PATH = os.path.join(BASE, FILE)

assert os.path.exists(DATA_PATH), f"Dataset not found: {DATA_PATH}"
print("✅ Dataset path verified")

✅ Dataset path verified


In [3]:
# Load dataset from JSON Lines file
# NOTE: Attempting to load the full dataset (~5GB) will crash the colab runtime environment.
# The following code is retained for reference, but is not exectuted.

# df_raw = pd.read_json(DATA_PATH, lines=True)

# Instead, Load a small sample of the JSON Lines file for exploratory analysis
rows = []
with open(DATA_PATH, "r") as f:
    for i, line in enumerate(f):
        rows.append(json.loads(line))
        if i >= 10_000:   # sample size for exploratory inspection
            break

df_sample = pd.DataFrame(rows)

In [4]:
# Preview and inspect dataset
df_sample.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [5]:
#Dataset dimensions and schema

print("Shape:", df_sample.shape)
print("Columns:", df_sample.columns)
df_sample.info()



Shape: (10001, 9)
Columns: Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   review_id    10001 non-null  object 
 1   user_id      10001 non-null  object 
 2   business_id  10001 non-null  object 
 3   stars        10001 non-null  float64
 4   useful       10001 non-null  int64  
 5   funny        10001 non-null  int64  
 6   cool         10001 non-null  int64  
 7   text         10001 non-null  object 
 8   date         10001 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 703.3+ KB


In [6]:
# Dataset Construction using only the columns required for analysis.
# Useful columns are: review_id, business_id, stars, text, date
#
# NOTE: Tried using CSV for the slimmed dataset, however it is still too large to load into Colabs memory.
# After research, parquet is the better file format, especially for large datasets for ML/Analytics pipelines

# Declare path where reduced dataset files will be written
OUT_DIR = (
    '/content/drive/Shared drives/Team1Share/GroupProject/'
    'Yelp_Dataset/reviews_slim_parquet'
)

# Ensure output directory exists
os.makedirs(OUT_DIR, exist_ok=True)

# Record start time to measure end-to-end write performance
start_time = time.time()

# Counter used to name sequential Parquet part files
n_parts = 0

# Stream the large JSON Lines file in blocks to avoid memory issues
# Each iteration writes one self-contained Parquet partition;
# the full dataset is represented by the collection of part files
for df_block in pd.read_json(DATA_PATH, lines=True, chunksize=100_000):
    df_block = df_block[['review_id', 'business_id', 'stars', 'text', 'date']]

    # Construct a unique filename for this block to create a partitioned dataset
    part_path = os.path.join(OUT_DIR, f'part_{n_parts:05d}.parquet')
    # Write the current block to disk as a compressed Parquet file
    df_block.to_parquet(part_path, compression='snappy', index=False)

    # Increment part counter for the next block
    n_parts += 1

# Convert elapsed time into minutes and seconds for readability
elapsed_time = time.time() - start_time
minutes = int(elapsed_time // 60)
seconds = elapsed_time % 60

# Report output location, number of files written, and runtime
print(f"Reduced dataset written to directory: {OUT_DIR}")
print(f"Parquet files written: {n_parts}")
print(f"Time elapsed: {minutes} minutes {seconds:.1f} seconds")


Reduced dataset written to directory: /content/drive/Shared drives/Team1Share/GroupProject/Yelp_Dataset/reviews_slim_parquet
Parquet files written: 70
Time elapsed: 3 minutes 25.4 seconds


In [None]:
# Load reduced dataset from Parquet directory
# This cell is expensive, run it by itself

df_reduced = pd.read_parquet(OUT_DIR)



In [8]:
# Preview Data

df_reduced.head()


Unnamed: 0,review_id,business_id,stars,text,date
0,KU_O5udG6zpxOg-VcAEodg,XQfwVwDr-v0ZS3_CbbE5Xw,3,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,7ATYjTIgM3jUlt4UM3IypQ,5,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,YjUWPpI6HXG530lwP-fb2A,3,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,kxX2SOes4o-D3ZQBkiMRfA,5,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,e4Vwtrqf-wpJfwesgvdgxQ,4,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [9]:
# Examine dataset structure

df_reduced.shape
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   business_id  object        
 2   stars        int64         
 3   text         object        
 4   date         datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 266.7+ MB
