<a href="https://colab.research.google.com/github/MattBaudoin/SCMT610_GroupProject/blob/main/Group_Project_Part_A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Group 1

Matthew Baudoin
Kevin Brown
Chelsea Jacobo
Grace Morris
Stephie Noel
Kal Zapotocky

In [None]:
# Environment setup: load dependencies, then mount Google Drive

import json
import os
import time

import pandas as pd
from google.colab import drive

# Make Drive contents available under /content/drive for the cells below
drive.mount('/content/drive/')


In [2]:
# Dataset configuration: build the path to the review file and confirm it is reachable

BASE = (
    '/content/drive/Shared drives/Team1Share/GroupProject/'
    'Yelp_Dataset'
)
FILE = 'yelp_academic_dataset_review.json'
DATA_PATH = os.path.join(BASE, FILE)

# Stop here with a clear message if the file cannot be found at DATA_PATH
dataset_found = os.path.exists(DATA_PATH)
assert dataset_found, f"Dataset not found: {DATA_PATH}"
print("✅ Dataset path verified")

✅ Dataset path verified


In [3]:
# Load a small sample of the JSON Lines file for exploratory analysis.
#
# NOTE: Loading the full dataset (~5GB) in a single call, e.g.
#     pd.read_json(DATA_PATH, lines=True)
# crashes the Colab runtime, so that call is intentionally not executed.
# Only the first SAMPLE_SIZE records are read instead.

SAMPLE_SIZE = 10_000  # number of records to keep for exploratory inspection

rows = []
# JSON text is UTF-8 by specification; state it explicitly so decoding does
# not depend on the runtime's locale default.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        # Check the limit BEFORE appending so exactly SAMPLE_SIZE rows are
        # kept (checking after the append would load SAMPLE_SIZE + 1 rows).
        if i >= SAMPLE_SIZE:
            break
        rows.append(json.loads(line))

# One DataFrame row per parsed JSON record
df_sample = pd.DataFrame(rows)

In [None]:
# Show the first five sampled records
df_sample.head(n=5)

In [None]:
# Dataset dimensions and schema of the exploratory sample

print(f"Shape: {df_sample.shape}")
print(f"Columns: {df_sample.columns}")
df_sample.info()



In [None]:
# Dataset construction using only the columns required for analysis.
#
# NOTE: A CSV export of the slimmed dataset was tried first, but it is still
# too large to load into Colab's memory. Parquet is used instead, written as
# a directory of part files.

# Columns kept in the reduced dataset
KEEP_COLUMNS = ['review_id', 'business_id', 'stars', 'text', 'date']

# Number of JSON records read (and written) per Parquet part file
CHUNK_SIZE = 100_000

# Declare path where reduced dataset files will be written
OUT_DIR = (
    '/content/drive/Shared drives/Team1Share/GroupProject/'
    'Yelp_Dataset/reviews_slim_parquet'
)

# Ensure output directory exists
os.makedirs(OUT_DIR, exist_ok=True)

# Remove part files left behind by an earlier (possibly partial, or
# differently chunked) run. Without this, a re-run that produces fewer parts
# leaves stale files in OUT_DIR, and reading the directory back would mix
# old and new rows.
for stale_name in os.listdir(OUT_DIR):
    if stale_name.startswith('part_') and stale_name.endswith('.parquet'):
        os.remove(os.path.join(OUT_DIR, stale_name))

# Record start time to measure end-to-end write performance.
# perf_counter() is monotonic, so the measurement is unaffected by
# system clock adjustments.
start_time = time.perf_counter()

# Counter used to name sequential Parquet part files
n_parts = 0

# Stream the large JSON Lines file in blocks to avoid memory issues.
# Each iteration writes one self-contained Parquet part file; the full
# dataset is represented by the collection of part files.
# The reader is used as a context manager so the underlying file handle is
# closed even if a block fails to convert or write.
with pd.read_json(DATA_PATH, lines=True, chunksize=CHUNK_SIZE) as reader:
    for df_block in reader:
        df_block = df_block[KEEP_COLUMNS]

        # Construct a unique, zero-padded filename for this block
        part_path = os.path.join(OUT_DIR, f'part_{n_parts:05d}.parquet')
        # Write the current block to disk as a compressed Parquet file
        df_block.to_parquet(part_path, compression='snappy', index=False)

        # Increment part counter for the next block
        n_parts += 1

# Convert elapsed time into minutes and seconds for readability
elapsed_time = time.perf_counter() - start_time
minutes = int(elapsed_time // 60)
seconds = elapsed_time % 60

# Report output location, number of files written, and runtime
print(f"Reduced dataset written to directory: {OUT_DIR}")
print(f"Parquet files written: {n_parts}")
print(f"Time elapsed: {minutes} minutes {seconds:.1f} seconds")


In [None]:
# Read the reduced dataset back from the Parquet directory into one DataFrame.
# This cell is expensive — run it by itself.

df_reduced = pd.read_parquet(path=OUT_DIR)



In [None]:
# Preview the first five rows of the reduced dataset

df_reduced.head(n=5)


In [None]:
# Examine dataset structure

# A notebook cell only displays its LAST bare expression, so a standalone
# `df_reduced.shape` followed by another statement is silently discarded.
# Print the shape explicitly so it appears in the output.
print("Shape:", df_reduced.shape)
df_reduced.info()