In [1]:
import pandas as pd
import os

# Total number of entries across all Parquet files using PySpark

In [3]:
from pyspark.sql import SparkSession
import os
import time

def _find_nonempty_parquet_files(base_dir):
    """
    Recursively collect every non-empty ``.parquet`` file under ``base_dir``.

    Returns:
        tuple: ``(paths, total_bytes)`` where ``paths`` is a list of file
        paths with backslashes replaced by forward slashes, and
        ``total_bytes`` is the sum of the sizes reported by
        ``os.path.getsize`` for those files.
    """
    paths = []
    total_bytes = 0
    for root, _, files in os.walk(base_dir):
        for name in files:
            if not name.endswith(".parquet"):
                continue
            path = os.path.join(root, name)
            size = os.path.getsize(path)
            # Zero-byte files are skipped before they ever reach Spark.
            if size > 0:
                paths.append(path.replace("\\", "/"))  # normalize on Windows
                total_bytes += size
    return paths, total_bytes


def count_total_reviews(base_dir="../../parquet_output"):
    """
    Count the total number of rows (reviews) across all Parquet files
    found under ``base_dir`` using PySpark.

    Progress, the final count and the elapsed wall-clock time are printed.

    Args:
        base_dir: Directory that is walked recursively for ``.parquet``
            files. Defaults to ``"../../parquet_output"``, the location the
            function originally had hard-coded.

    Returns:
        int | None: The row count reported by Spark, or ``None`` when no
        non-empty Parquet file is found or when reading/counting raises
        (the exception is printed with its traceback, not re-raised).
    """
    start_time = time.time()

    # 1) Gather only non-empty .parquet files
    print(f"Scanning for Parquet files in {base_dir}...")
    good_files, total_size = _find_nonempty_parquet_files(base_dir)

    if not good_files:
        # Bail out before starting Spark: there is nothing to read.
        print("No valid parquet files found.")
        return None

    print(f"Found {len(good_files)} Parquet files totaling {total_size:,} bytes ({total_size/(1024*1024*1024):.2f} GB)")

    # 2) Start Spark with appropriate configurations
    # NOTE(review): with spark.sql.files.ignoreCorruptFiles enabled, files
    # Spark cannot read are presumably skipped rather than failing the job,
    # so the reported total may understate the data on disk -- confirm this
    # is intended.
    print("Initializing Spark session...")
    spark = (
        SparkSession.builder
        .appName("TotalReviews")
        .config("spark.driver.memory", "20g")
        .config("spark.executor.memory", "20g")
        .config("spark.sql.files.ignoreCorruptFiles", "true")
        .config("spark.network.timeout", "600s")
        .config("spark.sql.broadcastTimeout", "600s")
        .getOrCreate()
    )

    # Set log level to reduce noise
    spark.sparkContext.setLogLevel("WARN")

    try:
        # 3) Read them all at once
        print("Reading Parquet files...")
        df = spark.read.parquet(*good_files)

        # 4) Count rows
        print("Counting total rows (this may take a while for large datasets)...")
        total_reviews = df.count()
        print(f"Total reviews: {total_reviews:,}")

        # The timer started before the directory scan, so the duration
        # covers scanning, Spark start-up, reading and counting.
        duration = time.time() - start_time
        print(f"Total processing time: {duration:.2f} seconds ({duration/60:.2f} minutes)")

        return total_reviews

    except Exception as e:
        # Best-effort reporting: show the failure and signal it with None.
        print(f"Error during processing: {e}")
        import traceback
        traceback.print_exc()
        return None

    finally:
        # Always stop Spark session when done
        print("Stopping Spark session...")
        spark.stop()

# Run the count when this cell executes. The guard only fires when
# __name__ is "__main__"; the recorded output below shows the call does run
# when the notebook cell is executed. The returned count is discarded here
# because count_total_reviews() already prints it.
if __name__ == "__main__":
    count_total_reviews()

Scanning for Parquet files in ../../parquet_output...
Found 20280 Parquet files totaling 7,853,833,200 bytes (7.31 GB)
Initializing Spark session...
Reading Parquet files...
Counting total rows (this may take a while for large datasets)...
Total reviews: 35,110,614
Total processing time: 23.29 seconds (0.39 minutes)
Stopping Spark session...


# Total number of entries across all Parquet files using Dask

In [3]:
import dask.dataframe as dd
import os
import glob

# 1) Gather only non-empty .parquet files
# NOTE(review): the PySpark cell in this notebook scans "../../parquet_output"
# while this cell scans "parquet_output" relative to the working directory.
# Confirm both point at the same dataset before comparing the two counts.
base_dir = "parquet_output"
good_files = []
for root, _, files in os.walk(base_dir):
    for f in files:
        if f.endswith(".parquet"):
            path = os.path.join(root, f)
            if os.path.getsize(path) > 0:
                good_files.append(path.replace("\\", "/"))  # normalize on Windows

if not good_files:
    # Raise instead of calling exit(1): exit() is the interactive-shell
    # helper and does not reliably stop a notebook cell, which would let
    # execution fall through to read_parquet() with an empty list.
    raise FileNotFoundError(f"No valid parquet files found under {base_dir!r}.")

# 2) Create a Dask dataframe from the parquet files
# read_parquet accepts a list of file paths.
df = dd.read_parquet(good_files)

# 3) Count rows; len() on the Dask dataframe triggers the actual computation
total_reviews = len(df)  # This returns a standard Python integer
print(f"Total reviews: {total_reviews}")

Total reviews: 35110614
