In [3]:
!pip install pyspark -q
!pip install findspark -q

In [4]:
# Load the cleaned events dataset (gold layer) from Google Drive into a Spark DataFrame.
from google.colab import drive
drive.mount('/content/drive')

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("LoadGold").getOrCreate()

# Location of the gold-layer CSV on the mounted Drive.
GOLD_EVENTS_PATH = '/content/drive/MyDrive/datasets/gold_fact_events.csv'

# Explicit schema (DDL string) instead of inferSchema: avoids the extra pass over
# the file that inference needs and keeps column types stable between runs.
# The types are the ones previously reported by printSchema() for this file.
GOLD_EVENTS_SCHEMA = ", ".join([
    "event_ts TIMESTAMP",
    "visitor_id INT",
    "item_id INT",
    "category_id INT",
    "is_available INT",
    "event_type STRING",
])

df = (spark.read
          .option("header", True)   # first line holds the column names
          .option("sep", ";")       # fields are separated by semicolons
          .schema(GOLD_EVENTS_SCHEMA)
          .csv(GOLD_EVENTS_PATH)
)

# Sanity check: show the applied schema and a few full-width rows.
df.printSchema()
df.show(5, truncate=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
root
 |-- event_ts: timestamp (nullable = true)
 |-- visitor_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- category_id: integer (nullable = true)
 |-- is_available: integer (nullable = true)
 |-- event_type: string (nullable = true)

+-------------------+----------+-------+-----------+------------+----------+
|event_ts           |visitor_id|item_id|category_id|is_available|event_type|
+-------------------+----------+-------+-----------+------------+----------+
|2015-05-03 03:00:04|693516    |297662 |-1         |0           |addtocart |
|2015-05-03 03:00:11|829044    |60987  |-1         |0           |view      |
|2015-05-03 03:00:13|652699    |252860 |-1         |0           |view      |
|2015-05-03 03:00:24|1125936   |33661  |-1         |0           |view      |
|2015-05-03 03:00:26|693516    |297662 |-1         |0           |view

In [6]:
# First look at the events DataFrame: column types, then the number of rows.
# NOTE(review): the output stored with this cell uses a different label than the
# print below, so it predates this code -- re-run the cell to refresh it.
df.printSchema()

# count() is a Spark action, so this is where the data actually gets read;
# the result is kept in `total` rather than recomputed.
total = df.count()
print(f"Number of events : {total}")

root
 |-- event_ts: timestamp (nullable = true)
 |-- visitor_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- category_id: integer (nullable = true)
 |-- is_available: integer (nullable = true)
 |-- event_type: string (nullable = true)

Nombre total d'événements : 2751205


In [11]:
from pyspark.sql import functions as F

# Summary statistics (count, mean, stddev, min, max) for the numeric columns.
numeric_cols = ["item_id", "category_id", "is_available"]
df.describe(*numeric_cols).show()

# Time span covered by the dataset: earliest and latest event timestamp.
ts_bounds = df.agg(
    F.min("event_ts").alias("min_ts"),
    F.max("event_ts").alias("max_ts"),
)
ts_bounds.show()

+-------+------------------+-----------------+-------------------+
|summary|           item_id|      category_id|       is_available|
+-------+------------------+-----------------+-------------------+
|  count|           2751205|          2751205|            2751205|
|   mean| 234925.8418725613|429.5796194758297| 0.2019471467956768|
| stddev|134193.37095484437|549.6133797863472|0.40145305488485816|
|    min|                 3|               -1|                  0|
|    max|            466867|             1697|                  1|
+-------+------------------+-----------------+-------------------+

+-------------------+-------------------+
|             min_ts|             max_ts|
+-------------------+-------------------+
|2015-05-03 03:00:04|2015-09-18 02:59:47|
+-------------------+-------------------+



In [14]:
# Missing values: for every column, count the rows whose value is NULL or is
# empty once surrounding whitespace has been trimmed.
from pyspark.sql import functions as F

missing_counts = []
for col_name in df.columns:
    value = F.col(col_name)
    is_missing = value.isNull() | (F.trim(value) == "")
    # when() yields a non-null marker only for missing rows, and count() ignores
    # nulls, so the aggregate is the number of missing rows in this column.
    missing_counts.append(F.count(F.when(is_missing, col_name)).alias(col_name))

missing = df.select(missing_counts)
missing.show()

+--------+----------+-------+-----------+------------+----------+
|event_ts|visitor_id|item_id|category_id|is_available|event_type|
+--------+----------+-------+-----------+------------+----------+
|       0|         0|      0|          0|           0|         0|
+--------+----------+-------+-----------+------------+----------+



In [15]:
# Duplicate check: find (event_ts, visitor_id, item_id) combinations that occur
# on more than one row. Note that event_type is not part of the key.
key_cols = ["event_ts", "visitor_id", "item_id"]
key_counts = df.groupBy(*key_cols).count()
duplicates = key_counts.where("count > 1")

# The printed figure is the number of repeated key combinations.
print("Number of duplicate values :", duplicates.count())
duplicates.show(5, truncate=False)

Number of duplicate values : 0
+--------+----------+-------+-----+
|event_ts|visitor_id|item_id|count|
+--------+----------+-------+-----+
+--------+----------+-------+-----+

