<a href="https://colab.research.google.com/github/Indresh0007/PySpark-Indresh/blob/main/UseCase_ProductSaleManagement_JAVA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import files
uploaded = files.upload()


Mounted at /content/drive


In [6]:
# Set variables
strBasePath="/content/drive/MyDrive/IBM-DE-Spark-Scala"
scala_deb_path = strBasePath+"/scala-2.12.18.deb"
spark_tgz_path = strBasePath+"/spark-3.4.1-bin-hadoop3.tgz"

!mkdir -p /content/tmp
import os
# Download Scala .deb if not cached
if not os.path.exists(scala_deb_path):
    !wget -O "{scala_deb_path}" https://github.com/scala/scala/releases/download/v2.12.18/scala-2.12.18.deb

# Download Spark tgz if not cached
if not os.path.exists(spark_tgz_path):
    !wget -O "{spark_tgz_path}" https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz

# Copy cached files to working dir
!cp "{scala_deb_path}" /content/tmp/scala-2.12.18.deb
!cp "{spark_tgz_path}" /content/tmp/spark-3.4.1-bin-hadoop3.tgz

# Install Java if not already present
!java -version || apt-get install openjdk-11-jdk-headless -qq > /dev/null

# Install Scala
!dpkg -i /content/tmp/scala-2.12.18.deb

# Extract Spark
!tar xf /content/tmp/spark-3.4.1-bin-hadoop3.tgz -C /content

# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"
os.environ["PATH"] += f":{os.environ['SPARK_HOME']}/bin"

# Confirm installation
!java -version
!scala -version
!scalac -version
!echo "Spark path: $SPARK_HOME"
!ls $SPARK_HOME

openjdk version "11.0.28" 2025-07-15
OpenJDK Runtime Environment (build 11.0.28+6-post-Ubuntu-1ubuntu122.04.1)
OpenJDK 64-Bit Server VM (build 11.0.28+6-post-Ubuntu-1ubuntu122.04.1, mixed mode, sharing)
Selecting previously unselected package scala.
(Reading database ... 126284 files and directories currently installed.)
Preparing to unpack /content/tmp/scala-2.12.18.deb ...
Unpacking scala (2.12.18-400) ...
Setting up scala (2.12.18-400) ...
Creating system group: scala
Creating system user: scala in scala with scala daemon-user and shell /bin/false
Processing triggers for man-db (2.10.2-1) ...
openjdk version "11.0.28" 2025-07-15
OpenJDK Runtime Environment (build 11.0.28+6-post-Ubuntu-1ubuntu122.04.1)
OpenJDK 64-Bit Server VM (build 11.0.28+6-post-Ubuntu-1ubuntu122.04.1, mixed mode, sharing)
Scala code runner version 2.12.18 -- Copyright 2002-2023, LAMP/EPFL and Lightbend, Inc.
Scala compiler version 2.12.18 -- Copyright 2002-2023, LAMP/EPFL and Lightbend, Inc.
Spark path: /content/

In [7]:
!pip install -q pyspark

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ClassicModels Spark Project") \
    .config("spark.sql.shuffle.partitions", "4") \
    .getOrCreate()

spark


In [8]:
from google.colab import files
uploaded = files.upload()



Saving customers.csv to customers.csv
Saving employees.csv to employees.csv
Saving offices.csv to offices.csv
Saving orderdetails.csv to orderdetails.csv
Saving orders.csv to orders.csv
Saving payments.csv to payments.csv
Saving productlines.csv to productlines.csv
Saving products.csv to products.csv


In [9]:
from pyspark.sql.types import *
import os

# Output directory
parquet_output_path = "/content/data/parquet"
os.makedirs(parquet_output_path, exist_ok=True)

# Define schemas for all 8 tables
schemas = {
    "productlines": StructType([
        StructField("productLine", StringType()),
        StructField("textDescription", StringType())
    ]),
    "products": StructType([
        StructField("productCode", StringType()),
        StructField("productName", StringType()),
        StructField("productLine", StringType()),
        StructField("productScale", StringType()),
        StructField("productVendor", StringType()),
        StructField("productDescription", StringType()),
        StructField("quantityInStock", IntegerType()),
        StructField("buyPrice", DoubleType()),
        StructField("MSRP", DoubleType())
    ]),
    "offices": StructType([
        StructField("officeCode", StringType()),
        StructField("city", StringType()),
        StructField("phone", StringType()),
        StructField("addressLine1", StringType()),
        StructField("addressLine2", StringType()),
        StructField("state", StringType()),
        StructField("country", StringType()),
        StructField("postalCode", StringType()),
        StructField("territory", StringType())
    ]),
    "employees": StructType([
        StructField("employeeNumber", IntegerType()),
        StructField("lastName", StringType()),
        StructField("firstName", StringType()),
        StructField("extension", StringType()),
        StructField("email", StringType()),
        StructField("officeCode", StringType()),
        StructField("reportsTo", IntegerType()),
        StructField("jobTitle", StringType())
    ]),
    "customers": StructType([
        StructField("customerNumber", IntegerType()),
        StructField("customerName", StringType()),
        StructField("contactLastName", StringType()),
        StructField("contactFirstName", StringType()),
        StructField("phone", StringType()),
        StructField("addressLine1", StringType()),
        StructField("addressLine2", StringType()),
        StructField("city", StringType()),
        StructField("state", StringType()),
        StructField("postalCode", StringType()),
        StructField("country", StringType()),
        StructField("salesRepEmployeeNumber", IntegerType()),
        StructField("creditLimit", DoubleType())
    ]),
    "payments": StructType([
        StructField("customerNumber", IntegerType()),
        StructField("checkNumber", StringType()),
        StructField("paymentDate", DateType()),
        StructField("amount", DoubleType())
    ]),
    "orders": StructType([
        StructField("orderNumber", IntegerType()),
        StructField("orderDate", DateType()),
        StructField("requiredDate", DateType()),
        StructField("shippedDate", DateType()),
        StructField("status", StringType()),
        StructField("comments", StringType()),
        StructField("customerNumber", IntegerType())
    ]),
    "orderdetails": StructType([
        StructField("orderNumber", IntegerType()),
        StructField("productCode", StringType()),
        StructField("quantityOrdered", IntegerType()),
        StructField("priceEach", DoubleType()),
        StructField("orderLineNumber", IntegerType())
    ])
}

# Read and convert CSVs to Parquet
for table, schema in schemas.items():
    csv_path = f"/content/{table}.csv"
    parquet_path = f"{parquet_output_path}/{table}"

    print(f"Reading {csv_path}")
    df = spark.read.option("header", True).schema(schema).csv(csv_path)

    print(f"Writing to {parquet_path}")
    df.write.mode("overwrite").parquet(parquet_path)

    df.show(2)


Reading /content/productlines.csv
Writing to /content/data/parquet/productlines
+------------+--------------------+
| productLine|     textDescription|
+------------+--------------------+
|Classic Cars|Attention car ent...|
| Motorcycles|Our motorcycles a...|
+------------+--------------------+
only showing top 2 rows

Reading /content/products.csv
Writing to /content/data/parquet/products
+-----------+--------------------+------------+------------+--------------------+--------------------+---------------+--------+-----+
|productCode|         productName| productLine|productScale|       productVendor|  productDescription|quantityInStock|buyPrice| MSRP|
+-----------+--------------------+------------+------------+--------------------+--------------------+---------------+--------+-----+
|   S10_1678|1969 Harley David...| Motorcycles|        1:10|     Min Lin Diecast|This replica feat...|           7933|   48.81| 95.7|
|   S10_1949|1952 Alpine Renau...|Classic Cars|        1:10|Classic Met

In [10]:
output_dir = "/content/output/processed"
os.makedirs(output_dir, exist_ok=True)


In [11]:
# Load orderdetails and products
orderdetails_df = spark.read.parquet("/content/data/parquet/orderdetails")
products_df = spark.read.parquet("/content/data/parquet/products")

# Join to get product names and calculate total quantity
top_products_df = orderdetails_df.join(
    products_df,
    on="productCode"
).groupBy("productName") \
 .agg({"quantityOrdered": "sum"}) \
 .withColumnRenamed("sum(quantityOrdered)", "totalQuantity") \
 .orderBy("totalQuantity", ascending=False) \
 .limit(10)

top_products_df.show()

# Save to Parquet
top_products_df.write.mode("overwrite").parquet(f"{output_dir}/top_10_products_quantity.parquet")


+--------------------+-------------+
|         productName|totalQuantity|
+--------------------+-------------+
|1992 Ferrari 360 ...|         1808|
|1937 Lincoln Berline|         1111|
|American Airlines...|         1085|
|1941 Chevrolet Sp...|         1076|
|1930 Buick Marque...|         1074|
|    1940s Ford truck|         1061|
|1969 Harley David...|         1057|
|   1957 Chevy Pickup|         1056|
|1964 Mercedes Tou...|         1053|
|1956 Porsche 356A...|         1052|
+--------------------+-------------+



In [12]:
orders_df = spark.read.parquet("/content/data/parquet/orders")

# Join: orders -> orderdetails -> products
revenue_df = orders_df.join(orderdetails_df, "orderNumber") \
    .join(products_df, "productCode") \
    .withColumn("revenue", orderdetails_df["quantityOrdered"] * orderdetails_df["priceEach"]) \
    .groupBy("productName") \
    .agg({"revenue": "sum"}) \
    .withColumnRenamed("sum(revenue)", "totalRevenue") \
    .orderBy("totalRevenue", ascending=False)

revenue_df.show()

# Save to Parquet
revenue_df.write.mode("overwrite").parquet(f"{output_dir}/product_revenue.parquet")


+--------------------+------------------+
|         productName|      totalRevenue|
+--------------------+------------------+
|1992 Ferrari 360 ...|         276839.98|
|   2001 Ferrari Enzo|         190755.86|
|1952 Alpine Renau...|190017.95999999996|
|2003 Harley-David...|170685.99999999997|
|   1968 Ford Mustang|161531.47999999992|
|    1969 Ford Falcon|         152543.02|
|1980s Black Hawk ...|144959.90999999997|
|1998 Chrysler Ply...|142530.62999999998|
|1917 Grand Tourin...|140535.60000000003|
|    2002 Suzuki XREO|135767.03000000003|
|1956 Porsche 356A...|         134240.71|
|  1969 Corvair Monza|132363.78999999998|
|1928 Mercedes-Ben...|132275.97999999998|
|1957 Corvette Con...|130749.31000000001|
| 1972 Alfa Romeo GTA|127924.31999999999|
|1962 LanciaA Delt...|123123.00999999998|
|1970 Triumph Spit...|         122254.75|
|1976 Ford Gran To...|          121890.6|
|1948 Porsche Type...|         121653.46|
|      1958 Setra Bus|119085.24999999999|
+--------------------+------------

In [13]:
# Load customers
customers_df = spark.read.parquet("/content/data/parquet/customers")

# Join orders → orderdetails
order_with_details = orders_df.join(orderdetails_df, "orderNumber")

# Calculate total per order
order_totals_df = order_with_details.withColumn(
    "orderTotal", orderdetails_df["quantityOrdered"] * orderdetails_df["priceEach"]
).groupBy("orderNumber", "customerNumber") \
 .agg({"orderTotal": "sum"}) \
 .withColumnRenamed("sum(orderTotal)", "totalOrderValue")

# Average per customer
avg_order_value_df = order_totals_df.groupBy("customerNumber") \
    .agg({"totalOrderValue": "avg"}) \
    .withColumnRenamed("avg(totalOrderValue)", "avgOrderValue") \
    .join(customers_df.select("customerNumber", "customerName"), "customerNumber") \
    .orderBy("avgOrderValue", ascending=False)

avg_order_value_df.show()

# Save to Parquet
avg_order_value_df.write.mode("overwrite").parquet(f"{output_dir}/avg_order_value_by_customer.parquet")


+--------------+------------------+--------------------+
|customerNumber|     avgOrderValue|        customerName|
+--------------+------------------+--------------------+
|           298|          54388.96|     Vida Sport, Ltd|
|           187|          49470.03|      AV Stores, Co.|
|           286|         45272.685|Marta's Replicas Co.|
|           227|           44954.9|Heintze Collectables|
|           259| 44611.56999999999|Toms Spezialitäte...|
|           151|        44478.4875|  Muscle Machine Inc|
|           146| 43435.11666666667|Saveley & Henriot...|
|           278| 42509.89666666667|       Rovelli Gifts|
|           386|          41835.19| L'ordine Souveniers|
|           249|41111.615000000005|  Amica Models & Co.|
|           448|          40314.51|Scandinavian Gift...|
|           239|40187.619999999995|Collectable Mini ...|
|           119|          39643.28|   La Rochelle Gifts|
|           319|          39216.08|       Mini Classics|
|           363| 38816.42999999

In [14]:
!ls /content/output/processed/


avg_order_value_by_customer.parquet  top_10_products_quantity.parquet
product_revenue.parquet


In [21]:
from pyspark.sql import functions as F

# Load the parquet files
offices_df = spark.read.parquet("/content/data/parquet/offices").alias("o")
customers_df = spark.read.parquet("/content/data/parquet/customers").alias("c")
payments_df = spark.read.parquet("/content/data/parquet/payments").alias("p")

# Join customers and payments
cust_pay_df = customers_df.join(
    payments_df,
    F.col("c.customerNumber") == F.col("p.customerNumber")
)

# Join with offices on country
region_sales_df = cust_pay_df.join(
    offices_df,
    F.col("c.country") == F.col("o.country"),
    "left"
)

# Group by officeCode, city, country (from offices)
customer_sales_summary = region_sales_df.groupBy(
    F.col("o.officeCode").alias("officeCode"),
    F.col("o.city").alias("officeCity"),
    F.col("o.country").alias("officeCountry")
).agg(
    F.countDistinct(F.col("c.customerNumber")).alias("customerCount"),
    F.sum(F.col("p.amount")).alias("totalSales")
).orderBy("totalSales", ascending=False)

# Show the final result
customer_sales_summary.show()

# Save to Parquet
customer_sales_summary.write.mode("overwrite").parquet("/content/output/processed/customer_sales_by_office.parquet")


+----------+-------------+-------------+-------------+------------------+
|officeCode|   officeCity|officeCountry|customerCount|        totalSales|
+----------+-------------+-------------+-------------+------------------+
|      null|         null|         null|           39| 3779259.460000001|
|         1|San Francisco|          USA|           35|3040029.5200000005|
|         3|          NYC|          USA|           35|        3040029.52|
|         2|       Boston|          USA|           35|        3040029.52|
|         4|        Paris|       France|           12| 965750.5799999998|
|         6|       Sydney|    Australia|            5|509385.81999999995|
|         7|       London|           UK|            5|          391503.9|
|         5|        Tokyo|        Japan|            2|167909.94999999998|
+----------+-------------+-------------+-------------+------------------+



In [22]:
!ls /content/output/processed/

avg_order_value_by_customer.parquet  product_revenue.parquet
customer_sales_by_office.parquet     top_10_products_quantity.parquet


In [23]:
from pyspark.sql import functions as F

# Load parquet files
customers_df = spark.read.parquet("/content/data/parquet/customers")
payments_df = spark.read.parquet("/content/data/parquet/payments")

# Join customers and payments on customerNumber
revenue_by_country_df = customers_df.join(
    payments_df, on="customerNumber"
).groupBy("country") \
 .agg(
     F.sum("amount").alias("totalRevenue")
 ).orderBy("totalRevenue", ascending=False)

# Show output
revenue_by_country_df.show()

# Save to Parquet
revenue_by_country_df.write.mode("overwrite").parquet("/content/output/processed/country_revenue.parquet")



+-----------+------------------+
|    country|      totalRevenue|
+-----------+------------------+
|        USA|3040029.5199999996|
|      Spain| 994438.5300000003|
|     France| 965750.5800000001|
|  Australia|509385.81999999995|
|New Zealand|         392486.59|
|         UK|391503.89999999997|
|      Italy|325254.55000000005|
|    Finland|         295149.35|
|  Singapore|261671.59999999998|
|     Canada|         205911.86|
|    Denmark|          197356.3|
|    Germany|         196470.99|
|      Japan|         167909.95|
|   Norway  |         166621.51|
|    Austria|         136119.99|
|     Sweden|         120457.09|
|Switzerland|         108777.92|
|     Norway|         104224.79|
|    Belgium|          91471.03|
|Philippines|           87468.3|
+-----------+------------------+
only showing top 20 rows



In [25]:
from pyspark.sql import functions as F

# Load data with aliases
offices_df = spark.read.parquet("/content/data/parquet/offices").alias("o")
customers_df = spark.read.parquet("/content/data/parquet/customers").alias("c")
payments_df = spark.read.parquet("/content/data/parquet/payments").alias("p")

# Join customers + payments
cust_pay_df = customers_df.join(
    payments_df, F.col("c.customerNumber") == F.col("p.customerNumber")
)

# Join result with offices on country
office_sales_df = cust_pay_df.join(
    offices_df, F.col("c.country") == F.col("o.country"), "left"
)

# Group by officeCode and officeCity
top_offices_df = office_sales_df.groupBy(
    F.col("o.officeCode").alias("officeCode"),
    F.col("o.city").alias("officeCity")
).agg(
    F.sum(F.col("p.amount")).alias("totalSales")
).orderBy("totalSales", ascending=False)

# Show result
top_offices_df.show()

# Save to Parquet
top_offices_df.write.mode("overwrite").parquet("/content/output/processed/top_offices_by_sales.parquet")


+----------+-------------+------------------+
|officeCode|   officeCity|        totalSales|
+----------+-------------+------------------+
|      null|         null| 3779259.459999999|
|         3|          NYC|3040029.5199999996|
|         2|       Boston|3040029.5199999996|
|         1|San Francisco|3040029.5199999996|
|         4|        Paris| 965750.5800000001|
|         6|       Sydney|509385.81999999995|
|         7|       London|391503.89999999997|
|         5|        Tokyo|         167909.95|
+----------+-------------+------------------+



In [26]:
from pyspark.sql import functions as F

# Load data
orderdetails_df = spark.read.parquet("/content/data/parquet/orderdetails")
products_df = spark.read.parquet("/content/data/parquet/products")

# Join and cache
product_sales_df = orderdetails_df.join(products_df, "productCode").cache()

# Action 1
top_products = product_sales_df.groupBy("productName") \
    .agg(F.sum("quantityOrdered").alias("totalQuantity")) \
    .orderBy("totalQuantity", ascending=False)

top_products.show()

# Action 2
revenue_df = product_sales_df.withColumn(
    "revenue", F.col("quantityOrdered") * F.col("priceEach")
).groupBy("productName") \
 .agg(F.sum("revenue").alias("totalRevenue")) \
 .orderBy("totalRevenue", ascending=False)

revenue_df.show()


+--------------------+-------------+
|         productName|totalQuantity|
+--------------------+-------------+
|1992 Ferrari 360 ...|         1808|
|1937 Lincoln Berline|         1111|
|American Airlines...|         1085|
|1941 Chevrolet Sp...|         1076|
|1930 Buick Marque...|         1074|
|    1940s Ford truck|         1061|
|1969 Harley David...|         1057|
|   1957 Chevy Pickup|         1056|
|1964 Mercedes Tou...|         1053|
|1956 Porsche 356A...|         1052|
|Corsair F4U ( Bir...|         1051|
|  F/A 18 Hornet 1/72|         1047|
|1980s Black Hawk ...|         1040|
|1913 Ford Model T...|         1038|
|   1997 BMW R 1100 S|         1033|
| 1972 Alfa Romeo GTA|         1030|
|1962 Volkswagen M...|         1029|
|    2002 Suzuki XREO|         1028|
|The USS Constitut...|         1020|
|   2001 Ferrari Enzo|         1019|
+--------------------+-------------+
only showing top 20 rows

+--------------------+------------------+
|         productName|      totalRevenue|
+-

In [28]:
from pyspark.sql.functions import broadcast, col

# Load data with aliases
customers_df = spark.read.parquet("/content/data/parquet/customers").alias("c")
offices_df = spark.read.parquet("/content/data/parquet/offices").alias("o")

# Perform broadcast join on country
cust_office_join = customers_df.join(
    broadcast(offices_df),
    col("c.country") == col("o.country"),
    "inner"
)

# Select non-ambiguous columns
cust_office_join.select(
    col("c.customerName").alias("customerName"),
    col("o.city").alias("officeCity"),
    col("o.country").alias("officeCountry")
).show(5)


+--------------------+-------------+-------------+
|        customerName|   officeCity|officeCountry|
+--------------------+-------------+-------------+
|   Atelier graphique|        Paris|       France|
|  Signal Gift Stores|          NYC|          USA|
|  Signal Gift Stores|       Boston|          USA|
|  Signal Gift Stores|San Francisco|          USA|
|Australian Collec...|       Sydney|    Australia|
+--------------------+-------------+-------------+
only showing top 5 rows



In [31]:
from pyspark.sql import functions as F

# Load payments DataFrame
payments_df = spark.read.parquet("/content/data/parquet/payments")

# Use DataFrame API to group by customerNumber and sum the amount
revenue_df = payments_df.groupBy("customerNumber") \
    .agg(F.sum("amount").alias("totalRevenue")) \
    .orderBy("totalRevenue", ascending=False)

# Show result
revenue_df.show()

+--------------+------------------+
|customerNumber|      totalRevenue|
+--------------+------------------+
|           141| 715738.9800000001|
|           124| 584188.2400000001|
|           114|180585.06999999998|
|           151|         177913.95|
|           148|         156251.03|
|           323|154622.08000000002|
|           187|         148410.09|
|           276|         137034.22|
|           321|         132340.78|
|           146|         130305.35|
|           278|         127529.69|
|           353|         126983.19|
|           119|116949.68000000001|
|           363|116449.29000000001|
|           496|         114497.19|
|           458|         112440.09|
|           298|         108777.92|
|           131|         107639.94|
|           145|          107446.5|
|           398|         105548.73|
+--------------+------------------+
only showing top 20 rows



In [32]:
# Load a DataFrame lazily
orders_df = spark.read.parquet("/content/data/parquet/orders")

# Transformation only — this does NOT trigger execution
transformed_df = orders_df.select("orderNumber", "orderDate", "customerNumber")

# Print execution plan — shows Spark's plan but doesn't run it
transformed_df.explain()


== Physical Plan ==
*(1) ColumnarToRow
+- FileScan parquet [orderNumber#4107,orderDate#4108,customerNumber#4113] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/content/data/parquet/orders], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<orderNumber:int,orderDate:date,customerNumber:int>




In [33]:
# This is an ACTION — it triggers the actual execution
row_count = transformed_df.count()

print("Total Orders:", row_count)


Total Orders: 326


In [34]:
def task1_ingest_and_parquet():
    # Read all 8 CSVs, define schema, write to /data/parquet/
    pass

def task2_product_order_analysis():
    # 2.1 Top 10 products by quantity
    # 2.2 Product revenue
    # 2.3 Avg order value by customer
    pass

def task3_regional_sales_insights():
    # 3.1 Customer sales by office
    # 3.2 Revenue by country
    # 3.3 Top offices
    pass

def task4_performance_optimization():
    # 4.1 cache()
    # 4.2 broadcast()
    # 4.3 aggregateByKey (or groupBy)
    # 4.4 explain(), count()
    pass

def main():
    task1_ingest_and_parquet()
    task2_product_order_analysis()
    task3_regional_sales_insights()
    task4_performance_optimization()

# Run like a main class
main()


In [36]:
import os

# List all files in the output directory
os.listdir("/content/output/processed/")


['customer_sales_by_office.parquet',
 'top_offices_by_sales.parquet',
 'top_10_products_quantity.parquet',
 'product_revenue.parquet',
 'country_revenue.parquet',
 'avg_order_value_by_customer.parquet']

In [37]:
!zip -r output_processed.zip /content/output/processed/


  adding: content/output/processed/ (stored 0%)
  adding: content/output/processed/customer_sales_by_office.parquet/ (stored 0%)
  adding: content/output/processed/customer_sales_by_office.parquet/._SUCCESS.crc (stored 0%)
  adding: content/output/processed/customer_sales_by_office.parquet/part-00000-7b4303d8-1b3d-44d6-8b9a-f79d3c99c319-c000.snappy.parquet (deflated 45%)
  adding: content/output/processed/customer_sales_by_office.parquet/_SUCCESS (stored 0%)
  adding: content/output/processed/customer_sales_by_office.parquet/.part-00000-7b4303d8-1b3d-44d6-8b9a-f79d3c99c319-c000.snappy.parquet.crc (stored 0%)
  adding: content/output/processed/top_offices_by_sales.parquet/ (stored 0%)
  adding: content/output/processed/top_offices_by_sales.parquet/._SUCCESS.crc (stored 0%)
  adding: content/output/processed/top_offices_by_sales.parquet/_SUCCESS (stored 0%)
  adding: content/output/processed/top_offices_by_sales.parquet/.part-00000-1c0a5122-1fc8-4763-a63c-694176178f13-c000.snappy.parquet

In [40]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
import os

drive_output_dir = '/content/drive/MyDrive/spark_output_files'
os.makedirs(drive_output_dir, exist_ok=True)


In [42]:
import shutil

local_output_dir = '/content/output/processed/'

# Copy all files from local to Drive folder
for file_name in os.listdir(local_output_dir):
    full_local_path = os.path.join(local_output_dir, file_name)
    if os.path.isfile(full_local_path):
        shutil.copy(full_local_path, drive_output_dir)


In [43]:
os.listdir('/content/output/processed/')


['customer_sales_by_office.parquet',
 'top_offices_by_sales.parquet',
 'top_10_products_quantity.parquet',
 'product_revenue.parquet',
 'country_revenue.parquet',
 'avg_order_value_by_customer.parquet']

In [45]:
import shutil
import os

# Source and target directories
local_output_dir = '/content/output/processed'
drive_output_dir = '/content/drive/MyDrive/spark_output_files'
os.makedirs(drive_output_dir, exist_ok=True)

# Copy entire folders (each .parquet is a directory)
for item in os.listdir(local_output_dir):
    src_path = os.path.join(local_output_dir, item)
    dst_path = os.path.join(drive_output_dir, item)

    if os.path.isdir(src_path):
        shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
        print(f"Copied folder: {item}")
    else:
        print(f"Skipped: {item} (not a directory)")


Copied folder: customer_sales_by_office.parquet
Copied folder: top_offices_by_sales.parquet
Copied folder: top_10_products_quantity.parquet
Copied folder: product_revenue.parquet
Copied folder: country_revenue.parquet
Copied folder: avg_order_value_by_customer.parquet
