In [12]:
import warnings
warnings.filterwarnings("ignore")


In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,count,sum,avg,month,year

In [14]:
pg_url = 'jdbc:postgresql://localhost:5432/Ecommerce_Sales_Analysis'
pg_properties = {
    'user':'postgres',
    'password':'pgroot',
    'driver':'org.postgresql.Driver'
}

In [15]:
spark = SparkSession.builder\
.appName('EcommerceSalesAnalysis')\
.config('spark.jars','/home/krish/postgresql-42.7.5.jar')\
.getOrCreate()

In [4]:
orders_df = spark.read.jdbc(pg_url,'orders',properties=pg_properties)
products_df = spark.read.jdbc(pg_url,'products',properties=pg_properties)
customers_df=spark.read.jdbc(pg_url,'customers',properties=pg_properties)

In [5]:
# Top selling products

top_products = orders_df.groupBy('product_id')\
.agg(count('order_id').alias('total_orders'))\
.orderBy(col('total_orders').desc())


In [6]:
# Repeat Customers (Customers with more than 5 orders)

repeat_customers=orders_df.groupBy('customer_id')\
.agg(count('order_id').alias('order_count'))\
.filter(col('order_count')>5)

In [7]:
# Monthly revenue trends

monthly_revenue = orders_df.withColumn('month',month(col('order_date')))\
.withColumn('year',year(col('order_date')))\
.groupBy('year','month')\
.agg(sum('total_amount').alias('monthly_revenue'))\
.orderBy('year','month')

In [8]:
print('Top Selling Products: ')
top_products.show(5)

Top Selling Products: 
+----------+------------+
|product_id|total_orders|
+----------+------------+
|         1|           3|
|         3|           3|
|         5|           3|
|         4|           3|
|         2|           3|
+----------+------------+



In [9]:
print('Repeat customers: ')
repeat_customers.show(5)

Repeat customers: 
+-----------+-----------+
|customer_id|order_count|
+-----------+-----------+
+-----------+-----------+



In [10]:
print('Monthly Revenue Trends: ')
monthly_revenue.show(5)

Monthly Revenue Trends: 
+----+-----+---------------+
|year|month|monthly_revenue|
+----+-----+---------------+
|2024|    2|        6460.00|
+----+-----+---------------+



In [11]:
top_products.write.jdbc(pg_url,'top_products',mode='overwrite',properties=pg_properties)
repeat_customers.write.jdbc(pg_url,'repeat_customers',mode='overwrite',properties=pg_properties)
monthly_revenue.write.jdbc(pg_url,'monthly_revenue',mode='overwrite',properties=pg_properties)

In [16]:
spark.stop()