# PySpark setup

In [None]:
import os
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, year
from pyspark.sql.types import IntegerType
 
# Build (or reuse) the Spark session shared by every cell below.
#
# The S3 credentials are read from the standard AWS environment variables so
# that real keys never have to be pasted into the notebook. The original
# placeholder strings are kept as fallbacks, so behaviour is unchanged when
# the variables are not set.
spark = (
    SparkSession.builder.appName("CustomerAnalysis")
    # Extra packages pulled in for s3a:// access (hadoop-aws + AWS SDK bundle).
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.11.901")
    .config("spark.hadoop.fs.s3a.access.key", os.environ.get("AWS_ACCESS_KEY_ID", "ACCESS_KEY"))
    .config("spark.hadoop.fs.s3a.secret.key", os.environ.get("AWS_SECRET_ACCESS_KEY", "SECRET_KEY"))
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.metastore.metrics.enabled", "false")
    .config("spark.hadoop.io.native.lib.available", "false")
    # 4 GiB each for the executors and the driver.
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.hadoop.fs.s3a.region", "ap-south-1")
    .getOrCreate()
)

In [None]:
# Read the orders CSV from S3: the first row is treated as the header and
# column types are inferred from the data.
orders_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("s3a://this-is-my-bucket007/order_data.csv")
)

# Print the first rows as a quick sanity check of the load.
orders_df.show()

In [None]:
from pyspark.sql.functions import col

# Keep only the orders whose amount exceeds ₹1,000 and attach a
# `discounted_price` column holding 90% of the amount.
high_value_orders_df = (
    orders_df
    .where(col("amount") > 1000)
    .withColumn("discounted_price", col("amount") * 0.9)
)

In [None]:
# Total discounted sales per product category.
# GroupedData.sum names its output column "sum(discounted_price)", which is
# then renamed to the friendlier "total_sales".
sales_by_category_df = (
    high_value_orders_df
    .groupBy("product_category")
    .sum("discounted_price")
    .withColumnRenamed("sum(discounted_price)", "total_sales")
)

# Display the per-category totals.
sales_by_category_df.show()

In [None]:
# Load customer data (first row is the header; column types are inferred).
customers_df = spark.read.csv("s3a://this-is-my-bucket007/customer_data.csv", header=True, inferSchema=True)

# Inner-join customers to their high-value orders.
# Joining on the column *name* keeps a single `customer_id` column in the
# result. Joining on the expression
# `customers_df.customer_id == high_value_orders_df.customer_id` keeps both
# copies, which makes any later reference to "customer_id" ambiguous.
customer_sales_df = customers_df.join(high_value_orders_df, on="customer_id", how="inner")

In [None]:
from pyspark.sql.functions import datediff, current_date

# Read the employee CSV from S3 (header row, inferred column types) and add
# `years_of_experience`: the number of days between `joining_date` and today,
# divided by 365.
employees_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("s3a://this-is-my-bucket007/employee_data.csv")
    .withColumn(
        "years_of_experience",
        datediff(current_date(), col("joining_date")) / 365,
    )
)

In [None]:
# Persist the per-category totals to S3 as Parquet.
# "errorifexists" is the writer's default mode, spelled out here: the write
# fails if the target path already exists.
(
    sales_by_category_df.write
    .format("parquet")
    .mode("errorifexists")
    .save("s3a://this-is-my-bucket007/aggregated_sales_data.parquet")
)