# Google Colab Setup

In [None]:
!pip install pyspark



### Installing MongoDB Client

In [None]:
!pip install pymongo



### Configure Spark with MongoDB Connector

In [None]:
import os
from getpass import getpass

from pyspark.sql import SparkSession

# The connection string embeds the database password, so it must never be
# written into the notebook. Take it from the MONGO_URI environment variable
# and fall back to a hidden prompt when the variable is not set.
# NOTE(review): the credential previously committed here should be rotated.
mongo_uri = os.environ.get("MONGO_URI") or getpass("MongoDB connection URI: ")

# The same URI is used for both the read and the write side of the connector.
spark = (
    SparkSession.builder
    .appName("BigDataAssignment02")
    .config("spark.mongodb.write.connection.uri", mongo_uri)
    .config("spark.mongodb.read.connection.uri", mongo_uri)
    .getOrCreate()
)


### Store MongoDB URI using environment variables

In [None]:
import os
from getpass import getpass

# Keep the secret out of the notebook source: reuse MONGO_URI when the
# environment already provides it, otherwise ask for it with a hidden prompt.
# NOTE(review): the credential previously committed here should be rotated.
if not os.environ.get("MONGO_URI"):
    os.environ["MONGO_URI"] = getpass("MongoDB connection URI: ")

# TASK 1 – DATA INGESTION (BRONZE LAYER)

### Mount Google Drive

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset is read from it below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Load dataset

In [None]:
# Read the raw CSV: the first row holds the column names and Spark infers the
# column types from the data.
df_raw = spark.read.csv(
    "/content/drive/MyDrive/BigData /Online Retail.csv",
    header="true",
    inferSchema="true",
)

### Inspect schema

In [None]:
# Print the column names and the types Spark inferred for the raw CSV.
df_raw.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



### Record count

In [None]:
# Number of rows in the raw dataset (triggers a full scan of the CSV).
df_raw.count()

541909

### Add year and month

In [None]:
from pyspark.sql.functions import year, month, to_timestamp

# Bronze frame: the raw rows with InvoiceDate normalised to a timestamp plus
# the year/month columns that are used as partition keys when writing.
df_bronze = (
    df_raw
    .withColumn("InvoiceDate", to_timestamp("InvoiceDate"))
    .withColumn("year", year("InvoiceDate"))
    .withColumn("month", month("InvoiceDate"))
)

### Write Bronze Layer

In [None]:
# Persist the bronze layer as Parquet, one directory per (year, month),
# replacing any output left by a previous run.
(
    df_bronze.write
    .partitionBy("year", "month")
    .mode("overwrite")
    .parquet("/content/bronze")
)

# TASK 2 – DATA CLEANING & QUALITY (SILVER)

### Remove cancelled invoices

In [None]:
# Cancelled invoices are the ones whose InvoiceNo starts with "C". The filter
# must be negated to drop them; without the `~` only the cancellations are
# kept, and the later Quantity > 0 filter then leaves an empty frame.
df_clean = df_bronze.filter(~df_bronze.InvoiceNo.startswith("C"))

### Handle missing CustomerID

In [None]:
# Discard rows that have no CustomerID, then preview the result.
df_clean = df_clean.na.drop(subset=["CustomerID"])
df_clean.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----+-----+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|year|month|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----+-----+
|  C536379|        D|            Discount|      -1|2010-12-01 09:41:00|     27.5|     14527|United Kingdom|2010|   12|
|  C536383|   35004C|SET OF 3 COLOURED...|      -1|2010-12-01 09:49:00|     4.65|     15311|United Kingdom|2010|   12|
|  C536391|    22556|PLASTERS IN TIN C...|     -12|2010-12-01 10:24:00|     1.65|     17548|United Kingdom|2010|   12|
|  C536391|    21984|PACK OF 12 PINK P...|     -24|2010-12-01 10:24:00|     0.29|     17548|United Kingdom|2010|   12|
|  C536391|    21983|PACK OF 12 BLUE P...|     -24|2010-12-01 10:24:00|     0.29|     17548|United Kingdom|2010|   12|
|  C536391|    21980|PACK OF 12 RED RE...|     -

### Handle negative quantities (returns)

In [None]:
# Keep only rows with a strictly positive quantity, then preview the result.
df_clean = df_clean.where(df_clean["Quantity"] > 0)
df_clean.show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|year|month|
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+



### Remove invalid prices

In [None]:
# Keep only rows with a strictly positive unit price, then preview the result.
df_clean = df_clean.where(df_clean["UnitPrice"] > 0)
df_clean.show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|year|month|
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+



### Remove duplicates

In [None]:
# Collapse rows that are identical across every column, then preview the result.
df_clean = df_clean.drop_duplicates()
df_clean.show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|year|month|
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+



### Data Quality Report

In [None]:
from pyspark.sql.functions import col, count, isnan, when
from pyspark.sql.types import DoubleType, FloatType, IntegerType, LongType


def _missing_count(field):
    """Build the aggregate that counts missing values for one schema field.

    Double/float columns count rows that are NULL or NaN; every other column
    counts NULL rows only. The aggregate is aliased to the column name.
    """
    name = field.name
    is_missing = col(name).isNull()
    if isinstance(field.dataType, (DoubleType, FloatType)):
        # isnan is only applied to floating-point columns.
        is_missing = is_missing | isnan(col(name))
    # when() without otherwise() yields NULL for non-matching rows, and
    # count() ignores NULLs, so only the missing rows are tallied.
    return count(when(is_missing, name)).alias(name)


# One missing-value aggregate per column of the cleaned frame.
exprs = [_missing_count(field) for field in df_clean.schema.fields]

quality_report = df_clean.select(exprs)
quality_report.show()


+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|year|month|
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
|        0|        0|          0|       0|          0|        0|         0|      0|   0|    0|
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+



# TASK 3 – FEATURE ENGINEERING

### Revenue Feature

In [None]:
from pyspark.sql.functions import hour, dayofweek

# Line-level revenue: units sold multiplied by the price per unit.
line_revenue = df_clean["Quantity"] * df_clean["UnitPrice"]
df_feat = df_clean.withColumn("revenue", line_revenue)

### Time-based Features

In [None]:
# Derive hour and weekday columns from the invoice timestamp.
df_feat = (
    df_feat
    .withColumn("hour", hour("InvoiceDate"))
    .withColumn("weekday", dayofweek("InvoiceDate"))
)

### Basket-Level Features

In [None]:
from pyspark.sql.functions import sum as _sum, countDistinct

# Per-invoice aggregates: total revenue and number of distinct stock codes.
invoices = df_feat.groupBy("InvoiceNo")
basket_metrics = invoices.agg(
    _sum("revenue").alias("invoice_total"),
    countDistinct("StockCode").alias("items_per_invoice"),
)

### RFM Features (Customer)

In [None]:
# NOTE(review): this import rebinds the builtin `max` to the Spark aggregate
# for the rest of the notebook.
from pyspark.sql.functions import max, datediff, current_date

# Per-customer RFM table:
#   recency   - days between today and the customer's latest invoice
#   frequency - number of distinct invoices
#   monetary  - total revenue
# NOTE(review): recency is measured against current_date(), so the values
# change depending on the day the notebook is run.
customers = df_feat.groupBy("CustomerID")
rfm = customers.agg(
    datediff(current_date(), max("InvoiceDate")).alias("recency"),
    countDistinct("InvoiceNo").alias("frequency"),
    _sum("revenue").alias("monetary"),
)

### Window Function Example

In [None]:
from pyspark.sql.window import Window
# NOTE(review): this import rebinds the builtin `sum` to the Spark aggregate
# for the rest of the notebook.
from pyspark.sql.functions import sum

# One window per customer, with no ordering: every row of a customer sees the
# same partition-wide total.
window_spec = Window.partitionBy("CustomerID")

customer_total = sum("revenue").over(window_spec)
df_feat = df_feat.withColumn("customer_total_spend", customer_total)

# TASK 4 – MONGODB DATA MODELING (GOLD)