# PySpark setup

In [None]:
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, year
from pyspark.sql.types import IntegerType

import os

# Read the S3 credentials from the standard AWS environment variables when they
# are set, so real keys never have to be written into the notebook. The literal
# placeholders are only a fallback that keeps the previous behaviour when the
# variables are absent.
s3_access_key = os.environ.get("AWS_ACCESS_KEY_ID", "ACCESS_KEY")
s3_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY", "SECRET_KEY")

# Build (or reuse) the Spark session used by every cell below.
# NOTE: getOrCreate() returns an already-running session if one exists, in which
# case these builder options may not be re-applied.
spark = (
    SparkSession.builder.appName("CustomerAnalysis")
    # Maven coordinates of the jars needed for the s3a:// filesystem.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.11.901")
    .config("spark.hadoop.fs.s3a.access.key", s3_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # NOTE(review): this key does not appear to be a documented S3A option - confirm it is needed.
    .config("spark.hadoop.fs.s3a.metastore.metrics.enabled", "false")
    .config("spark.hadoop.io.native.lib.available", "false")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    # NOTE(review): the documented S3A region key looks like `fs.s3a.endpoint.region`;
    # confirm whether `fs.s3a.region` has any effect.
    .config("spark.hadoop.fs.s3a.region", "ap-south-1")
    .getOrCreate()
)

In [2]:
# Read the orders CSV from S3: the first line is the header and column types
# are inferred from the data.
orders_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(path="s3a://this-is-my-bucket007/order_data.csv")
)

# Print the leading rows as a quick sanity check of the load.
orders_df.show()

+--------+-----------+----------+----------------+------+
|order_id|customer_id|order_date|product_category|amount|
+--------+-----------+----------+----------------+------+
|       1|        101|10/01/2024|     Electronics|  1200|
|       2|        102|15/01/2024|         Apparel|   850|
|       3|        103|18/01/2024|      Home Decor|  1100|
|       4|        101|20/01/2024|     Electronics|   500|
|       5|        104|22/01/2024|          Sports|  1300|
+--------+-----------+----------+----------------+------+



# Retailmart with DynamoDB

In [None]:
pip install boto3

In [None]:
import os

# Provide AWS settings for boto3 through the standard environment variables.
# setdefault() only fills in a value when the variable is not already set, so
# credentials exported outside the notebook are never overwritten by the
# placeholders below (and real keys do not need to be pasted here).
os.environ.setdefault('AWS_ACCESS_KEY_ID', 'ACCESS_KEY')
os.environ.setdefault('AWS_SECRET_ACCESS_KEY', 'SECRET_KEY')
os.environ.setdefault('AWS_DEFAULT_REGION', 'ap-south-1')  # region used throughout this notebook

**Boto3 session**

In [None]:
import boto3

# Create the session without embedding keys in the notebook: with no explicit
# credentials boto3 falls back to its default credential chain, which reads
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY from the environment.
session = boto3.Session(
    region_name='ap-south-1'  # region that hosts the DynamoDB table
)

# Resource-level handle to the Customers table.
dynamodb = session.resource('dynamodb')
table = dynamodb.Table('Customers')

In [32]:
import boto3
from pyspark.sql import Row

# Resource-level handle to the Customers table (credentials are resolved by
# boto3's default chain).
dynamodb = boto3.resource('dynamodb',region_name='ap-south-1')
table = dynamodb.Table('Customers')

# A single Scan call returns at most one page of results (up to 1 MB), so keep
# requesting pages until DynamoDB stops returning a LastEvaluatedKey. Without
# this loop, larger tables would be silently truncated.
response = table.scan()
customers_data = list(response['Items'])
while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    customers_data.extend(response['Items'])

# Convert the scanned items (one dict per item) to a Spark DataFrame, aliased "t".
customers_df = spark.createDataFrame([Row(**item) for item in customers_data]).alias("t")

In [33]:
# Read the transactions CSV from S3 (header row, inferred column types) and
# alias the resulting DataFrame as "t1".
transactions_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(path="s3a://this-is-my-bucket007/transactions.csv")
    .alias("t1")
)

# Print the leading rows as a quick sanity check of the load.
transactions_df.show()

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_date|amount|product_category|
+--------------+-----------+----------------+------+----------------+
|          T001|       C123|      2024-01-15|  1200|     Electronics|
|          T002|       C124|      2024-01-16|   800|        Clothing|
|          T003|       C125|      2024-01-17|  1500| Home Appliances|
|          T004|       C123|      2024-01-18|  2000|     Electronics|
|          T005|       C126|      2024-01-19|   500|           Books|
|          T006|       C127|      2024-01-20|   300|        Clothing|
|          T007|       C123|      2024-01-21|  4000|       Furniture|
|          T008|       C125|      2024-01-22|   200|           Books|
|          T009|       C124|      2024-01-23|  1800|     Electronics|
|          T010|       C128|      2024-01-24|   700|        Clothing|
+--------------+-----------+----------------+------+----------------+



In [35]:
# Inner-join customers to transactions where the customer's `id` equals the
# transaction's `customer_id`; the result is aliased "t2".
# NOTE(review): the recorded outputs of the cells below are empty, which suggests
# no keys matched - confirm that `id` holds the same values as `customer_id`
# (e.g. 'C123') and has a comparable type.
joined_df = (
    customers_df
    .join(
        transactions_df,
        on=customers_df["id"] == transactions_df["customer_id"],
        how="inner",
    )
    .alias("t2")
)

In [37]:
from pyspark.sql.functions import count

# For every customer_id, count the non-null transaction_id values and expose
# the total as `repeat_purchase_count`.
repeat_purchases_df = (
    joined_df
    .groupBy("customer_id")
    .agg(
        count("transaction_id").alias("repeat_purchase_count")
    )
)

# Display the per-customer counts.
repeat_purchases_df.show()

+-----------+---------------------+
|customer_id|repeat_purchase_count|
+-----------+---------------------+
+-----------+---------------------+



In [38]:
# Rank customers from the highest to the lowest purchase count.
top_repeat_customers_df = repeat_purchases_df.sort(
    repeat_purchases_df["repeat_purchase_count"].desc()
)

# Display the ranked customers.
top_repeat_customers_df.show()

+-----------+---------------------+
|customer_id|repeat_purchase_count|
+-----------+---------------------+
+-----------+---------------------+



In [40]:
# Save the ranked customers to S3 in Parquet format.
# The default save mode raises an error when the target path already exists, so
# re-running the notebook would fail here; "overwrite" replaces the previous
# output and keeps the cell re-runnable.
top_repeat_customers_df.write.mode("overwrite").parquet("s3a://this-is-my-bucket007/top_repeat_customers.parquet")