# Google Colab Setup

In [1]:
!pip install pyspark



### Installing MondoDB Client

In [3]:
!pip install pymongo



### Configure Spark with MongoDB Connector

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("BigDataAssignment02") \
    .config("spark.mongodb.write.connection.uri", "mongodb+srv://sandanisenevirathna2002_db_user@admin:sandanisenevirathna2002_db_user@cluster1.acexhyt.mongodb.net/") \
    .config("spark.mongodb.read.connection.uri", "mongodb+srv://sandanisenevirathna2002_db_user@admin:sandanisenevirathna2002_db_user@cluster1.acexhyt.mongodb.net/") \
    .getOrCreate()


### Store MongoDB URI using environment variables

In [5]:
import os
os.environ["MONGO_URI"] = "mongodb+srv://sandanisenevirathna2002_db_user@admin:sandanisenevirathna2002_db_user@cluster1.acexhyt.mongodb.net/"

# TASK 1 – DATA INGESTION (BRONZE LAYER)

### Mount google drive

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Load dataset

In [7]:
df_raw = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("/content/drive/MyDrive/BigData /Online Retail.csv")

### Inspect schema

In [8]:
df_raw.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



### Record count

In [9]:
df_raw.count()

541909

### Add year and month

In [10]:
from pyspark.sql.functions import year, month, to_timestamp

df_bronze = df_raw.withColumn(
    "InvoiceDate", to_timestamp("InvoiceDate")
).withColumn(
    "year", year("InvoiceDate")
).withColumn(
    "month", month("InvoiceDate")
)

### Write Bronze Layer

In [11]:
df_bronze.write \
    .mode("overwrite") \
    .partitionBy("year", "month") \
    .parquet("/content/bronze")

# TASK 2 – DATA CLEANING & QUALITY (SILVER)

### Remove cancelled invoices

In [12]:
df_clean = df_bronze.filter(df_bronze.InvoiceNo.startswith("C"))

### Handle missing CustomerID

In [13]:
df_clean = df_clean.dropna(subset=["CustomerID"])
df_clean.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----+-----+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|year|month|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----+-----+
|  C536379|        D|            Discount|      -1|2010-12-01 09:41:00|     27.5|     14527|United Kingdom|2010|   12|
|  C536383|   35004C|SET OF 3 COLOURED...|      -1|2010-12-01 09:49:00|     4.65|     15311|United Kingdom|2010|   12|
|  C536391|    22556|PLASTERS IN TIN C...|     -12|2010-12-01 10:24:00|     1.65|     17548|United Kingdom|2010|   12|
|  C536391|    21984|PACK OF 12 PINK P...|     -24|2010-12-01 10:24:00|     0.29|     17548|United Kingdom|2010|   12|
|  C536391|    21983|PACK OF 12 BLUE P...|     -24|2010-12-01 10:24:00|     0.29|     17548|United Kingdom|2010|   12|
|  C536391|    21980|PACK OF 12 RED RE...|     -

### Handle negative quantities (returns)

In [14]:
df_clean = df_clean.filter(df_clean.Quantity > 0)
df_clean.show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|year|month|
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+



### Remove invalid prices

In [15]:
df_clean = df_clean.filter(df_clean.UnitPrice > 0)
df_clean.show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|year|month|
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+



### Remove duplicates

In [16]:
df_clean = df_clean.dropDuplicates()
df_clean.show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|year|month|
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+



### Data Quality Report

In [17]:
from pyspark.sql.functions import col, count, isnan, when
from pyspark.sql.types import DoubleType, FloatType, IntegerType, LongType

exprs = []

for field in df_clean.schema.fields:
    c = field.name
    dtype = field.dataType

    if isinstance(dtype, (DoubleType, FloatType)):
        # Numeric columns → NULL or NaN
        exprs.append(
            count(
                when(col(c).isNull() | isnan(col(c)), c)
            ).alias(c)
        )
    else:
        # Non-numeric columns → NULL only
        exprs.append(
            count(
                when(col(c).isNull(), c)
            ).alias(c)
        )

quality_report = df_clean.select(exprs)
quality_report.show()


+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|year|month|
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+
|        0|        0|          0|       0|          0|        0|         0|      0|   0|    0|
+---------+---------+-----------+--------+-----------+---------+----------+-------+----+-----+



# TASK 3 – FEATURE ENGINEERING

### Revenue Feature

In [18]:
from pyspark.sql.functions import hour, dayofweek

df_feat = df_clean.withColumn(
    "revenue", df_clean.Quantity * df_clean.UnitPrice
)

### Time-based Features

In [19]:
df_feat = df_feat.withColumn("hour", hour("InvoiceDate")) \
                 .withColumn("weekday", dayofweek("InvoiceDate"))

### Basket-Level Features

In [20]:
from pyspark.sql.functions import sum as _sum, countDistinct

basket_metrics = df_feat.groupBy("InvoiceNo").agg(
    _sum("revenue").alias("invoice_total"),
    countDistinct("StockCode").alias("items_per_invoice")
)

### RFM Features (Customer)

In [21]:
from pyspark.sql.functions import max, datediff, current_date

rfm = df_feat.groupBy("CustomerID").agg(
    datediff(current_date(), max("InvoiceDate")).alias("recency"),
    countDistinct("InvoiceNo").alias("frequency"),
    _sum("revenue").alias("monetary")
)

### Window Function Example

In [22]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum

window_spec = Window.partitionBy("CustomerID")

df_feat = df_feat.withColumn(
    "customer_total_spend",
    sum("revenue").over(window_spec)
)

# TASK 4 – MONGODB DATA MODELING (GOLD)

In [34]:
from pyspark.sql.functions import collect_list, struct, sum as _sum

fact_invoices = df_feat.groupBy(
    "InvoiceNo", "InvoiceDate", "CustomerID", "Country"
).agg(
    _sum("revenue").alias("total_invoice_value"),
    collect_list(
        struct(
            "StockCode",
            "Description",
            "Quantity",
            "UnitPrice",
            "revenue"
        )
    ).alias("items")
)

In [35]:
fact_invoices.write \
    .format("mongodb") \
    .mode("overwrite") \
    .option("collection", "fact_invoices") \
    .save()

Py4JJavaError: An error occurred while calling o482.save.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: mongodb. Make sure the provider name is correct and the package is properly registered and compatible with your Spark version. SQLSTATE: 42K02
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:722)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:681)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:740)
	at org.apache.spark.sql.classic.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:626)
	at org.apache.spark.sql.classic.DataFrameWriter.saveInternal(DataFrameWriter.scala:135)
	at org.apache.spark.sql.classic.DataFrameWriter.save(DataFrameWriter.scala:126)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.lang.ClassNotFoundException: mongodb.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:592)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$6(DataSource.scala:665)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:665)
	at scala.util.Failure.orElse(Try.scala:230)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:665)
	... 16 more
