In [None]:
!pip install google-cloud-storage datasets 


In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
import sys
import shutil
import glob
import itertools
import time
from itertools import islice

import numpy as np
import pandas as pd

from datasets import load_dataset

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType

from google.cloud import storage

# ------------------------- CONFIG -------------------------
# Credential file (exported as GOOGLE_APPLICATION_CREDENTIALS further down)
# and the GCS bucket the pipeline writes into.
CREDENTIAL_PATH = "/kaggle/input/bigdata-inj/bigdataproject-456014-42479bab67be.json"
BUCKET_NAME = "team15-storage"

# Records taken from the stream per batch, and Spark slices used per batch.
BATCH_SIZE = 50_000
NUM_PARTITIONS = 24

# Category config names, grouped by size.
small_categories = [
    "raw_review_Magazine_Subscriptions",
    "raw_review_Gift_Cards",
    "raw_review_All_Beauty",
    "raw_review_Software",
    "raw_review_Musical_Instruments",
    "raw_review_Industrial_and_Scientific",
]

medium_categories = [
    "raw_review_Baby_Products",
    "raw_review_Office_Products",
    "raw_review_CDs_and_Vinyl",
    "raw_review_Arts_Crafts_and_Sewing",
    "raw_review_Cell_Phones_and_Accessories",
    "raw_review_Pet_Supplies",
    "raw_review_Grocery_and_Gourmet_Food",
    "raw_review_Patio_Lawn_and_Garden",
    "raw_review_Toys_and_Games",
    "raw_review_Health_and_Household",
    "raw_review_Movies_and_TV",
    "raw_review_Beauty_and_Personal_Care",
]

large_categories = [
    "raw_review_Automotive",
    "raw_review_Tools_and_Home_Improvement",
    "raw_review_Electronics",
    "raw_review_Books",
    "raw_review_Clothing_Shoes_and_Jewelry",
    "raw_review_Home_and_Kitchen",
]

# Full work list: small groups first, then medium, then large.
raw_review_categories = [
    *small_categories,
    *medium_categories,
    *large_categories,
]

# Columns kept for each record; every one is declared as a nullable string.
schema = StructType([
    StructField(column, StringType(), True)
    for column in (
        "main_category",
        "parent_asin",
        "title",
        "average_rating",
        "rating_number",
    )
])

# Endless cycle of the four spinner frames: - \ | /
spinner = itertools.cycle("-\\|/")

# --------------------- INIT SPARK & GCS --------------------
# Export the credential path before the storage client is constructed.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = CREDENTIAL_PATH

# One Spark session for the whole run: Arrow enabled, 8g driver/executor memory.
spark = (
    SparkSession.builder
    .appName("AmazonReviews")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

# Client and bucket handle used for every upload.
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)

print(f"Spark Master: {spark.sparkContext.master}")

# Track completed datasets: one dataset name per line in a local text file,
# read back into a set so an interrupted run can skip finished work.
completed_file = "completed_categories.txt"
completed = set()
if os.path.exists(completed_file):
    with open(completed_file) as fh:
        # Strip surrounding whitespace (including the newline) from each entry.
        completed.update(entry.strip() for entry in fh)

# ----------------------- MAIN PIPELINE ----------------------
# For every category: stream its "raw_meta_*" config from the Hugging Face
# dataset, cut the stream into BATCH_SIZE chunks, write each chunk to a local
# Parquet directory with Spark, upload the resulting part files to
# item/<dataset_name>/ in the bucket, then record the category as completed.
#
# A category is recorded as completed only if every part file was uploaded;
# any error (stream, Spark write, or upload) leaves it unrecorded so the next
# run retries it.
#
# NOTE(review): Spark part-file names appear to embed a per-write id, so
# re-running a partially uploaded category presumably adds new objects instead
# of overwriting the old ones -- confirm, and clear item/<dataset_name>/ before
# retrying a failed category if duplicates matter.

_UPLOAD_ATTEMPTS = 3  # tries per part file before the category is failed


def _project_record(record, columns):
    """Return a dict with only ``columns`` taken from ``record``.

    Keys absent from ``record`` map to ``None``. Dropping the unused fields
    before handing rows to Spark avoids serializing data the schema never
    reads.
    """
    return {column: record.get(column) for column in columns}


def _upload_part_file(part_file, blob_name):
    """Upload ``part_file`` to ``blob_name`` in ``bucket``, retrying on error.

    Each failure is printed. After ``_UPLOAD_ATTEMPTS`` failed tries the last
    exception is re-raised so the caller does not treat the file as uploaded.
    """
    for attempt in range(1, _UPLOAD_ATTEMPTS + 1):
        try:
            bucket.blob(blob_name).upload_from_filename(part_file)
            return
        except Exception as e:
            print(f"Error uploading {part_file}: {e}")
            if attempt == _UPLOAD_ATTEMPTS:
                raise
            time.sleep(2 ** attempt)  # 2s, 4s, ... between tries


schema_columns = schema.fieldNames()
failed_categories = []

for raw_cat in raw_review_categories:
    cat_name = raw_cat.replace("raw_review_", "")
    dataset_name = f"raw_meta_{cat_name}"

    if dataset_name in completed:
        print(f"⏭ Skipping {dataset_name} (already done)")
        continue

    print(f"\n Loading dataset: {dataset_name}")
    start_time = time.time()

    try:
        dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", dataset_name, streaming=True, trust_remote_code=True)
        data_iter = iter(dataset["full"])

        total_loaded = 0
        batch_count = 0

        while True:
            # Pull the next chunk from the stream, keeping only schema columns.
            batch = [
                _project_record(record, schema_columns)
                for record in islice(data_iter, BATCH_SIZE)
            ]
            if not batch:
                break

            rdd = spark.sparkContext.parallelize(batch, numSlices=NUM_PARTITIONS)
            df_batch = spark.createDataFrame(rdd, schema=schema)

            temp_parquet_path = f"temp_{cat_name}_batch_{batch_count}.parquet"
            try:
                df_batch.write.mode("overwrite").parquet(temp_parquet_path)

                part_files = glob.glob(os.path.join(temp_parquet_path, "part-*.parquet"))
                for part_file in part_files:
                    _upload_part_file(
                        part_file,
                        f"item/{dataset_name}/{os.path.basename(part_file)}",
                    )
            finally:
                # Remove the local copy whether or not the batch succeeded, so
                # failed batches do not accumulate on disk.
                shutil.rmtree(temp_parquet_path, ignore_errors=True)

            total_loaded += len(batch)
            sys.stdout.write(f"\r{next(spinner)} Loading {total_loaded:,} rows from {dataset_name}")
            sys.stdout.flush()

            batch_count += 1

        duration = (time.time() - start_time) / 60
        print(f"\n✅ Completed {dataset_name} in {duration:.2f} min: {total_loaded:,} rows uploaded.")

        # Reached only when every batch of this category was uploaded.
        with open(completed_file, "a") as f:
            f.write(dataset_name + "\n")

    except Exception as e:
        # Top-level boundary: report, remember the failure, move on to the
        # next category. The category stays out of the completed file.
        print(f" Failed to load {dataset_name}: {e}")
        failed_categories.append(dataset_name)

print("\nAll categories processed!")
if failed_categories:
    print(f"{len(failed_categories)} categories failed and were not marked completed: {', '.join(failed_categories)}")
