# PySpark setup

In [None]:
import os
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, year
from pyspark.sql.types import IntegerType

# Build (or reuse) the Spark session used by the rest of the notebook.
# AWS credentials are read from the standard AWS environment variables instead
# of being hardcoded, so secrets are never saved inside the notebook. A missing
# variable raises KeyError here rather than failing later on the first S3 read.
spark = (
    SparkSession.builder.appName("CustomerAnalysis")
    # S3A connector plus the AWS SDK bundle, resolved when the session starts.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.11.901")
    .config("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    .config("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.metastore.metrics.enabled", "false")
    .config("spark.hadoop.io.native.lib.available", "false")
    # Driver and executor heap sizes.
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.hadoop.fs.s3a.region", "ap-south-1")
    .getOrCreate()
)

# Categorize customers into tiers ("Bronze", "Silver", "Gold") based on total purchase value

In [None]:
from pyspark.sql import SparkSession

# The builder chain is wrapped in parentheses so the inline comment is legal:
# a backslash line continuation must be the last character on its line, and
# anything after it (even a comment) is a SyntaxError.
# NOTE: getOrCreate() returns the already-active session when one exists, so
# if an earlier cell has created a session this call reuses it.
spark = (
    SparkSession.builder
    .appName("CustomerTierAnalysis")
    .config("spark.hadoop.fs.s3a.region", "us-west-2")  # Set your desired region
    .getOrCreate()
)

In [None]:
# Load transaction data from S3.
# The path uses the s3a:// scheme because the session in this notebook is
# configured through the S3A connector (hadoop-aws, fs.s3a.* settings).
# NOTE(review): a plain s3:// URI is only resolvable on platforms that ship
# their own S3 filesystem (e.g. EMR) -- switch back if running there.
transactions_df = spark.read.csv(
    "s3a://your-bucket/transactions.csv",
    header=True,       # first line of the file holds the column names
    inferSchema=True,  # let Spark infer column types from the data
)

# Show initial rows to verify data
transactions_df.show()

In [None]:
import boto3
from pyspark.sql import Row
from pyspark.sql.types import StringType, StructField, StructType

# Initialize a session using Boto3
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('Customers')

# Scan the whole table and load the items into a list.
# A single scan() call returns only one page of results; while the response
# carries a LastEvaluatedKey there is more data, so keep scanning from that
# key until it is absent. Without this loop large tables are silently truncated.
response = table.scan()
customers_data = response['Items']
while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    customers_data.extend(response['Items'])

# Define schema for customer data (adjust fields as necessary).
# Only used for the empty-table case below, where Spark has no rows to infer
# column names and types from.
schema = StructType([
    StructField("customer_id", StringType(), True),
    StructField("name", StringType(), True),
])

# Build the DataFrame: infer columns from the scanned items when there are
# any, otherwise fall back to an empty DataFrame with the explicit schema.
if customers_data:
    customers_df = spark.createDataFrame([Row(**item) for item in customers_data])
else:
    customers_df = spark.createDataFrame([], schema)

# Show the DataFrame schema and data (if any)
customers_df.printSchema()
customers_df.show()

In [None]:
# Inner-join customers to their transactions on customer_id.
# Joining by column name keeps a single customer_id column in the result.
# Joining on an equality expression between the two frames would keep both
# copies, and any later reference to "customer_id" by name (e.g. in a groupBy)
# would fail as ambiguous.
joined_df = customers_df.join(transactions_df, on="customer_id", how="inner")

**Total Spending per Customer**

In [None]:
from pyspark.sql.functions import col, sum

# Total spending per customer: group the joined rows by customer_id and add
# up the "amount" column of each group under the name "total_spending".
# Note that `sum` here is Spark's column aggregate, not Python's builtin.
transactions_by_customer = joined_df.groupBy("customer_id")
total_spending_df = transactions_by_customer.agg(
    sum("amount").alias("total_spending")
)

# Display the per-customer totals
total_spending_df.show()

In [None]:
from pyspark.sql.functions import when

# Map each customer's total spending onto a tier:
#   below 1000          -> "Bronze"
#   1000 up to (not including) 5000 -> "Silver"
#   5000 and above      -> "Gold"
# There is no otherwise() branch, so a row whose total_spending is null
# matches no condition and gets a null tier.
spending = col("total_spending")
tier_column = (
    when(spending < 1000, "Bronze")
    .when((spending >= 1000) & (spending < 5000), "Silver")
    .when(spending >= 5000, "Gold")
)
tiered_customers_df = total_spending_df.withColumn("tier", tier_column)


# Display customers with their assigned tier
tiered_customers_df.show()

In [None]:
import json
from decimal import Decimal

# Pull the tiered results onto the driver as a list of Row objects so they can
# be written to DynamoDB with boto3.
tiered_customers_data = tiered_customers_df.collect()

# Write one item per customer; batch_writer buffers the puts and sends them in
# batches, flushing whatever remains when the `with` block exits.
# NOTE(review): put_item replaces the entire item stored under the same key,
# so attributes not listed below (e.g. "name") are dropped from existing
# records in this table -- confirm that is intended, or add the other fields.
with table.batch_writer() as batch:
    for row in tiered_customers_data:
        spending_value = row.total_spending
        batch.put_item(Item={
            'customer_id': row.customer_id,
            # boto3's DynamoDB resource rejects Python floats; numbers must be
            # Decimal. Going through str() avoids binary-float noise digits.
            # A null total is passed through unchanged.
            'total_spending': None if spending_value is None else Decimal(str(spending_value)),
            'tier': row.tier,
            # Include other fields as necessary
        })