# Pyspark

In [None]:
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, year
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("MonitorAnalysis")\
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.11.901") \
    .config("spark.hadoop.fs.s3a.access.key", "ACCESS_KEY")\
    .config("spark.hadoop.fs.s3a.secret.key", "SECRET_KEY")\
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")\
    .config("spark.hadoop.fs.s3a.path.style.access", "true")\
    .config("spark.hadoop.fs.s3a.metastore.metrics.enabled", "false") \
    .config("spark.hadoop.io.native.lib.available", "false")\
    .config("spark.executor.memory", "4g")\
    .config("spark.driver.memory", "4g")\
    .config("spark.hadoop.fs.s3a.region", "ap-south-1") \
    .getOrCreate()

**OS Setup**

In [None]:
import os

os.environ['AWS_ACCESS_KEY_ID'] = 'ACCESS_KEY'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'SECRET_KEY'
os.environ['AWS_DEFAULT_REGION'] = 'ap-south-1'

**Boto Setup**

In [3]:
pip install boto3



Boto

In [None]:
import boto3

# Create a session with your credentials
session = boto3.Session(
    aws_access_key_id='ACCESS_KEY',
    aws_secret_access_key='SECRET_KEY',
    region_name='ap-south-1'  # e.g., 'us-east-1'
)

dynamodb = session.resource('dynamodb')
table = dynamodb.Table('customers')
# anomaly_table = dynamodb.Table('anomaly')

# RetailMart wants to add insights

In [5]:
response = table.scan()
cust_data = response['Items']

In [6]:
# Convert the DynamoDB items to DataFrame
cust_df = spark.createDataFrame(cust_data)

In [12]:
# Load data from s3
sales_input = "s3a://this-is-my-bucket007/transactions.csv"
trans_df = spark.read.csv(sales_input, header=True, inferSchema=True)


In [13]:
print("Customer Data from DynamoDB:")
cust_df.show()
print("Transaction Data from S3:")
trans_df.show()

Customer Data from DynamoDB:
+-----------+-------------+--------------+
|customer_id|customer_name|      inactive|
+-----------+-------------+--------------+
|       C125|        manoj|{BOOL -> true}|
|       C126|          Leo|{BOOL -> true}|
|       C123|         John|{BOOL -> true}|
|       C124|        brock|{BOOL -> true}|
+-----------+-------------+--------------+

Transaction Data from S3:
+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_date|amount|product_category|
+--------------+-----------+----------------+------+----------------+
|          T001|       C123|      2024-01-15|  1200|     Electronics|
|          T002|       C124|      2024-01-16|   800|        Clothing|
|          T003|       C125|      2024-01-17|  1500| Home Appliances|
|          T004|       C123|      2024-01-18|  2000|     Electronics|
|          T005|       C126|      2024-01-19|   500|           Books|
|          T006|       C127|      2024-0

In [10]:
from pyspark.sql.functions import col, count, sum, avg

In [15]:
# Calculate purchase frequency per category for each customer
category_count_df = (
    trans_df.groupBy("customer_id", "product_category").agg(count("product_category").alias("purchase_count"))
)
print("purchase frequency per category for each customer")
category_count_df.show()

purchase frequency per category for each customer
+-----------+----------------+--------------+
|customer_id|product_category|purchase_count|
+-----------+----------------+--------------+
|       C125|           Books|             1|
|       C124|     Electronics|             1|
|       C123|       Furniture|             1|
|       C124|        Clothing|             1|
|       C126|           Books|             1|
|       C127|        Clothing|             1|
|       C128|        Clothing|             1|
|       C125| Home Appliances|             1|
|       C123|     Electronics|             2|
+-----------+----------------+--------------+



In [17]:
from pyspark.sql.functions import col, count, sum, avg, row_number, desc
from pyspark.sql.window import Window

In [18]:
# Determine the most frequently bought category for each customer
favorite_category_df = (
    category_count_df
    .withColumn("rank", row_number().over(Window.partitionBy("customer_id").orderBy(desc("purchase_count"))))
    .filter(col("rank") == 1)
    .select("customer_id", "product_category")
)
print("most frequently bought category for each customer")
favorite_category_df.show()

most frequently bought category for each customer
+-----------+----------------+
|customer_id|product_category|
+-----------+----------------+
|       C123|     Electronics|
|       C124|     Electronics|
|       C125|           Books|
|       C126|           Books|
|       C127|        Clothing|
|       C128|        Clothing|
+-----------+----------------+



In [19]:
# Collect and update each customer's favorite category in DynamoDB
for row in favorite_category_df.collect():
    table.update_item(
        Key={"customer_id": row["customer_id"]},
        UpdateExpression="SET favorite_category = :category",
        ExpressionAttributeValues={":category": row["product_category"]}
    )

In [20]:
print("Favorite product category updated for each customer in DynamoDB.")


Favorite product category updated for each customer in DynamoDB.


In [None]:
spark.stop()