<a href="https://colab.research.google.com/github/LathaAlagar/latha2809/blob/main/23BIT050_In_Memory_Data_Processing_Challenge_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import random
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum

# ---------------------- Step 1: Initialize Spark ----------------------
# Obtain a session (reusing an existing one if present) that runs locally
# on all available cores, then quieten Spark's logging to warnings and above.
spark = (
    SparkSession.builder
    .appName("InMemoryDataProcessing")
    .master("local[*]")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("WARN")
print("✅ Spark session created")

# ---------------------- Step 2: Generate large dataset ----------------------
# Build the synthetic transactions as a pandas DataFrame. Every column is
# drawn independently from Python's `random` module; no seed is set, so the
# values differ from run to run.
N = 1_000_000  # number of synthetic rows (1 million)
data = pd.DataFrame({
    # Customer ids in 1..100000 inclusive, so ids repeat across rows.
    "customer_id": [random.randint(1, 100000) for _ in range(N)],
    # Amounts between 5 and 500, rounded to two decimal places.
    "transaction_amount": [round(random.uniform(5, 500), 2) for _ in range(N)],
    # A constant tuple avoids rebuilding the list of choices on every iteration.
    "category": [random.choice(("A", "B", "C", "D")) for _ in range(N)],
    # Ages in 18..70 inclusive.
    "age": [random.randint(18, 70) for _ in range(N)],
})
# Report the real row count so the message stays correct if N is changed.
print(f"✅ Synthetic dataset created with {len(data):,} rows")

# ---------------------- Step 3: Create Spark DataFrame ----------------------
# Hand the pandas DataFrame to Spark. No explicit schema is passed, so the
# column types are left for Spark to infer from the pandas data.
df = spark.createDataFrame(data)
print("✅ Spark DataFrame created")

# ---------------------- Step 4: Analytical query without caching ----------------------
# Baseline: per-category average and total of transaction_amount, run before
# the DataFrame has been cached.
# `avg` and `sum` are the pyspark.sql.functions imported at the top of the
# file (that `sum` shadows the Python builtin).
# time.perf_counter() is used rather than time.time(): it is a monotonic,
# high-resolution clock meant for measuring intervals, whereas the wall
# clock can jump if the system time is adjusted mid-measurement.
start = time.perf_counter()
result1 = df.groupBy("category").agg(avg("transaction_amount").alias("avg_amount"), sum("transaction_amount").alias("total_amount"))
result1.show(5)  # show() is the action that runs the query, so it must sit inside the timed region
end = time.perf_counter()
print(f"⏱ Execution time without cache: {end-start:.2f} seconds")

# ---------------------- Step 5: Cache DataFrame in memory ----------------------
# cache() only marks the DataFrame for caching and returns it; chaining the
# count() action onto it forces a full pass over the data so the cache is
# populated before the next timed query runs.
df.cache().count()
print("✅ DataFrame cached in memory")

# ---------------------- Step 6: Analytical query with caching ----------------------
# Same per-category aggregation as the uncached baseline, run again now that
# the DataFrame has been cached, so the two timings can be compared.
# `avg` and `sum` are the pyspark.sql.functions imported at the top of the
# file (that `sum` shadows the Python builtin).
# time.perf_counter() is used rather than time.time(): it is a monotonic,
# high-resolution clock meant for measuring intervals.
start = time.perf_counter()
result2 = df.groupBy("category").agg(avg("transaction_amount").alias("avg_amount"), sum("transaction_amount").alias("total_amount"))
result2.show(5)  # show() is the action that runs the query
end = time.perf_counter()
print(f"⏱ Execution time with cache: {end-start:.2f} seconds")

# ---------------------- Step 7: Optional - Additional real-time analysis ----------------------
# Example: the 5 customers with the highest *total* transaction amount.
# Amounts are summed per customer_id (ids repeat across rows), sorted in
# descending order of that total, and the first 5 rows are displayed.
# `sum` is pyspark.sql.functions.sum imported at the top of the file, not
# the Python builtin.
# time.perf_counter() is used rather than time.time(): it is a monotonic,
# high-resolution clock meant for measuring intervals.
start = time.perf_counter()
top_customers = df.groupBy("customer_id").agg(sum("transaction_amount").alias("total_amount")).orderBy(col("total_amount").desc())
top_customers.show(5)  # show() is the action that runs the query
end = time.perf_counter()
print(f"⏱ Execution time for top customers query: {end-start:.2f} seconds")

# ---------------------- Step 8: Stop Spark ----------------------
# Shut the session down now that all queries have run; `df` and the result
# DataFrames derived from it should not be used after this point.
spark.stop()
print("✅ Spark session stopped")


✅ Spark session created
✅ Synthetic dataset created with 1,000,000 rows
✅ Spark DataFrame created
+--------+------------------+-------------------+
|category|        avg_amount|       total_amount|
+--------+------------------+-------------------+
|       B|252.36908405453337|6.295826303000039E7|
|       D|252.79629827785737|6.326732956999937E7|
|       C| 252.5064637945096|6.322862856000038E7|
|       A|252.60327323228807| 6.31146960399998E7|
+--------+------------------+-------------------+

⏱ Execution time without cache: 13.52 seconds
✅ DataFrame cached in memory
+--------+------------------+-------------------+
|category|        avg_amount|       total_amount|
+--------+------------------+-------------------+
|       B|252.36908405453337|6.295826303000039E7|
|       D|252.79629827785737|6.326732956999937E7|
|       C| 252.5064637945096|6.322862856000038E7|
|       A|252.60327323228807| 6.31146960399998E7|
+--------+------------------+-------------------+

⏱ Execution time with cac