# PySpark

In [None]:
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, year
from pyspark.sql.types import IntegerType

import os

# Build (or reuse) the Spark session used by the rest of the notebook.
# The hadoop-aws and aws-java-sdk-bundle packages are pulled in so that
# s3a:// paths can be read.
#
# S3 credentials and region are read from the standard AWS environment
# variables when they are set, so real keys do not have to be written into
# the notebook. The literal values are kept only as fallbacks.
s3_access_key = os.environ.get("AWS_ACCESS_KEY_ID", "ACCESS_KEY")
s3_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY", "SECRET_KEY")
s3_region = os.environ.get("AWS_DEFAULT_REGION", "ap-south-1")

spark = (
    SparkSession.builder.appName("MonitorAnalysis")
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.11.901",
    )
    .config("spark.hadoop.fs.s3a.access.key", s3_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.metastore.metrics.enabled", "false")
    .config("spark.hadoop.io.native.lib.available", "false")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.hadoop.fs.s3a.region", s3_region)
    .getOrCreate()
)

**OS Setup**

In [None]:
import os

# Export the AWS credentials and default region as process environment
# variables in one call.
os.environ.update(
    {
        'AWS_ACCESS_KEY_ID': 'ACCESS_KEY',
        'AWS_SECRET_ACCESS_KEY': 'SECRET_KEY',
        'AWS_DEFAULT_REGION': 'ap-south-1',
    }
)

**Boto Setup**

In [3]:
pip install boto3



Boto

In [None]:
import boto3

# Open a boto3 session with explicit credentials for the ap-south-1 region,
# then obtain a handle on the DynamoDB "customers" table through it.
session_kwargs = {
    'aws_access_key_id': 'ACCESS_KEY',
    'aws_secret_access_key': 'SECRET_KEY',
    'region_name': 'ap-south-1',
}
session = boto3.Session(**session_kwargs)

dynamodb = session.resource('dynamodb')
table = dynamodb.Table('customers')

# RetailMart wants to analyze churn by monitoring inactive accounts

In [4]:
# Read every item from the table. A single Scan call returns only one page of
# results, so keep requesting the next page for as long as DynamoDB reports a
# LastEvaluatedKey; otherwise larger tables would be silently truncated.
response = table.scan()
cust_data = response['Items']
while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    cust_data.extend(response['Items'])

In [5]:
# Turn the list of scanned DynamoDB items into a Spark DataFrame
# (the schema is inferred from the items themselves).
cust_df = spark.createDataFrame(data=cust_data)

In [6]:
# Read the transactions CSV from S3: the first row is the header and the
# column types are inferred from the data.
sales_input = "s3a://this-is-my-bucket007/transactions.csv"
csv_reader = spark.read.option("header", True).option("inferSchema", True)
trans_df = csv_reader.csv(sales_input)


In [7]:
# Preview both input DataFrames, each under its own heading.
for heading, frame in (
    ("Customer Data from DynamoDB:", cust_df),
    ("Transaction Data from S3:", trans_df),
):
    print(heading)
    frame.show()

Customer Data from DynamoDB:
+-----------+-------------+-----------------+--------------+
|customer_id|customer_name|favorite_category|      inactive|
+-----------+-------------+-----------------+--------------+
|       C125|        manoj|            Books|{BOOL -> true}|
|       C128|         NULL|         Clothing|          NULL|
|       C126|          Leo|            Books|{BOOL -> true}|
|       C123|         John|      Electronics|{BOOL -> true}|
|       C127|         NULL|         Clothing|          NULL|
|       C124|        brock|      Electronics|{BOOL -> true}|
+-----------+-------------+-----------------+--------------+

Transaction Data from S3:
+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_date|amount|product_category|
+--------------+-----------+----------------+------+----------------+
|          T001|       C123|      2024-01-15|  1200|     Electronics|
|          T002|       C124|      2024-01-16|   800|  

In [9]:
from datetime import datetime, timedelta

In [10]:
# Cutoff for "recent" activity: the current local time minus a 365-day window.
lookback_window = timedelta(days=365)
one_year_ago = datetime.now() - lookback_window

In [12]:
from pyspark.sql.functions import max

In [13]:
# Most recent transaction date per customer.
# Note: `max` here is the pyspark.sql.functions aggregate imported in the
# previous cell, not Python's built-in max.
latest_txn_date = max("transaction_date").alias("last_transaction_date")
last_trans_df = trans_df.groupBy("customer_id").agg(latest_txn_date)

print("last transaction date for each customer")
last_trans_df.show()

last transaction date for each customer
+-----------+---------------------+
|customer_id|last_transaction_date|
+-----------+---------------------+
|       C128|           2024-01-24|
|       C123|           2024-01-21|
|       C126|           2024-01-19|
|       C124|           2024-01-23|
|       C125|           2024-01-22|
|       C127|           2024-01-20|
+-----------+---------------------+



In [14]:
# Attach each customer's last transaction date. The left join keeps every
# customer row; the date is null where no transaction matched.
cust_act_df = cust_df.join(last_trans_df, "customer_id", "left")


In [21]:
from pyspark.sql.functions import col, lit, year, month

In [16]:
# Flag customers as inactive if they have no transactions in the last year.
# This overwrites the "inactive" column that came from the customer data.
#
# Customers with no transactions at all have a null last_transaction_date
# after the left join. A bare `<` comparison against null evaluates to null
# (not true), which would leave them unflagged, so null is treated as
# inactive explicitly.
last_txn = col("last_transaction_date")
inactive_cust_df = cust_act_df.withColumn(
    "inactive", last_txn.isNull() | (last_txn < lit(one_year_ago))
)

In [17]:
# Keep only the rows whose "inactive" flag is true, then display them.
inactive_cust_df = inactive_cust_df.where(col("inactive"))
print("inactive customers")
inactive_cust_df.show()

inactive customers
+-----------+-------------+-----------------+--------+---------------------+
|customer_id|customer_name|favorite_category|inactive|last_transaction_date|
+-----------+-------------+-----------------+--------+---------------------+
+-----------+-------------+-----------------+--------+---------------------+



In [19]:
from pyspark.sql.functions import count

In [22]:
# Count inactive customers per year and month of their last transaction,
# ordered chronologically.
last_seen = col("last_transaction_date")
churn_by_month_df = (
    inactive_cust_df
    .groupBy(
        year(last_seen).alias("churn_year"),
        month(last_seen).alias("churn_month"),
    )
    .agg(count("customer_id").alias("churn_count"))
    .orderBy("churn_year", "churn_month")
)

In [23]:
# Display the per-month churn counts computed above.
churn_by_month_df.show()

+----------+-----------+-----------+
|churn_year|churn_month|churn_count|
+----------+-----------+-----------+
+----------+-----------+-----------+

