<a href="https://colab.research.google.com/github/RedFiringSun/SchemaSquad/blob/main/CS226_Big_Data_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Colab helper module used to attach Google Drive to this runtime.
from google.colab import drive
# Mount Drive at /content/drive; the dataset paths used further down in this
# notebook are all located under this mount point.
drive.mount('/content/drive')

# Notebook shell escape: install PySpark into the runtime so the
# `pyspark.sql` import in the following cell can succeed.
!pip install pyspark

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from pyspark.sql import SparkSession
import os
import time
import json

# Build (or reuse, via getOrCreate) the Spark session used by the rest of this
# notebook. The settings cap driver memory and collected result size, and keep
# the shuffle partition count / default parallelism small (10 each).
spark = (
    SparkSession.builder
    .appName("Yelp JSON Analysis")
    .config("spark.driver.memory", "12g")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.sql.shuffle.partitions", "10")
    .config("spark.default.parallelism", "10")
    .getOrCreate()
)

def analyze_json_in_chunks(file_path):
    """Print a quick profile of one JSON dataset loaded through Spark.

    The file is loaded with a single ``spark.read.json`` call (using the
    module-level ``spark`` session); despite the function name, no manual
    chunking is performed here. The following is printed, in order:

    * the file's base name,
    * the inferred schema,
    * the first 5 rows (truncated columns),
    * a record count -- approximate (``countApprox`` with a 10000 timeout
      and 0.95 confidence) when the base name contains ``'review'``,
      exact otherwise,
    * the number of columns and the elapsed wall-clock time.

    Args:
        file_path: Path of the JSON file to analyse.

    Returns:
        None. All results are written to stdout.

    Any exception raised along the way is caught and reported on stdout
    rather than propagated, so a failing file does not stop the caller.
    """
    try:
        base_name = os.path.basename(file_path)
        print(f"\nProcessing: {base_name}")
        started_at = time.time()

        # Load the whole JSON file into a DataFrame.
        frame = spark.read.json(file_path)

        # Schema first: this needs no explicit count of the records.
        print("\nSchema:")
        frame.printSchema()

        # Preview only a handful of rows.
        print("\nSample of first 5 records:")
        preview = frame.limit(5)
        preview.show(truncate=True)

        # Files whose name contains 'review' are treated as large and only
        # get an approximate count; everything else is counted exactly.
        treat_as_large = 'review' in base_name
        if not treat_as_large:
            total = frame.count()
            print(f"\nTotal records: {total:,}")
        else:
            print("\nGetting approximate count for large dataset...")
            total = frame.rdd.countApprox(timeout=10000, confidence=0.95)
            print(f"Approximate total records: {total:,}")

        column_total = len(frame.columns)
        print(f"Number of columns: {column_total}")
        elapsed = time.time() - started_at
        print(f"Time taken: {elapsed:.2f} seconds")

    except Exception as err:
        # Best-effort reporting: show the failure and let the caller continue.
        print(f"Error analyzing {file_path}:")
        print(str(err))

# Datasets to profile, listed from smallest to largest so the cheap files are
# handled first. The two biggest files are currently disabled.
json_files = [
    'yelp_academic_dataset_checkin.json',
    'yelp_academic_dataset_tip.json',
    'yelp_academic_dataset_business.json',
    #'yelp_academic_dataset_user.json',
    #'yelp_academic_dataset_review.json'  # Process largest file last
]

for json_file in json_files:
    file_path = f'/content/drive/MyDrive/YelpDataset/{json_file}'

    # Guard clause: report missing files and move on to the next one.
    if not os.path.exists(file_path):
        print(f"File not found: {json_file}")
        continue

    # Visual separator between the reports of consecutive files.
    print(f"\n{'='*50}")
    analyze_json_in_chunks(file_path)

    # Drop anything Spark cached while handling this file before the next one.
    spark.catalog.clearCache()

# All files handled: shut the Spark session down.
spark.stop()



Processing: yelp_academic_dataset_checkin.json

Schema:
root
 |-- business_id: string (nullable = true)
 |-- date: string (nullable = true)


Sample of first 5 records:
+--------------------+--------------------+
|         business_id|                date|
+--------------------+--------------------+
|---kPU91CF4Lq2-Wl...|2020-03-13 21:10:...|
|--0iUa4sNDFiZFrAd...|2010-09-13 21:43:...|
|--30_8IhuyMHbSOcN...|2013-06-14 23:29:...|
|--7PUidqRWpRSpXeb...|2011-02-15 17:12:...|
|--7jw19RH9JKXgFoh...|2014-04-21 20:42:...|
+--------------------+--------------------+


Total records: 131,930
Number of columns: 2
Time taken: 7.93 seconds


Processing: yelp_academic_dataset_tip.json

Schema:
root
 |-- business_id: string (nullable = true)
 |-- compliment_count: long (nullable = true)
 |-- date: string (nullable = true)
 |-- text: string (nullable = true)
 |-- user_id: string (nullable = true)


Sample of first 5 records:
+--------------------+----------------+-------------------+---------------