### Libraries

In [0]:
# import libraries
import os
from datetime import datetime

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

### Configure environment

In [0]:
# Datalake connection variables
storage_account_name    = "dlptde"
container_name          = "data-pt-de"

# The SAS token is a credential, so it is read from the environment rather than
# being stored in the notebook source.
sas_token               = os.environ.get("DATALAKE_SAS_TOKEN")
if not sas_token:
    raise RuntimeError(
        "Environment variable DATALAKE_SAS_TOKEN is not set; "
        "provide a valid SAS token for the container before running this notebook."
    )

# Apply settings
# Registers the SAS token in the Spark session configuration under the key for
# this container/storage-account pair.
spark.conf.set(
    f"fs.azure.sas.{container_name}.{storage_account_name}.blob.core.windows.net",
    sas_token
)

### Save paths

In [0]:
# Save destination paths
# Root URI of the container; every dataset location below is built on top of it.
comun_path                  = "wasbs://{}@{}.blob.core.windows.net".format(
    container_name, storage_account_name
)

# One destination per dataset written by this notebook.
categories_destination_path = comun_path + "/categories_data.csv"
levels_destination_path     = comun_path + "/levels_data.csv"
courses_destination_path    = comun_path + "/courses.parquet"

### Generate data

In [0]:
# Set categories data
# Each (id, name) pair below is expanded into one record with the keys
# "category_id" and "category_name".
course_categories_data = [
    {"category_id": category_id, "category_name": category_name}
    for category_id, category_name in (
        (1, "Technology"),
        (2, "Business"),
        (3, "Art"),
        (4, "Science"),
        (5, "Health"),
    )
]

# Set levels data
# Each (id, name) pair below is expanded into one record with the keys
# "level_id" and "level_name".
course_levels_data = [
    {"level_id": level_id, "level_name": level_name}
    for level_id, level_name in (
        (101, "Beginner"),
        (102, "Intermediate"),
        (103, "Advanced"),
    )
]

# Set courses data
# A seeded generator is used so that re-running the notebook produces the same
# synthetic courses instead of a different random dataset on every run.
RANDOM_SEED = 42
N_COURSES   = 100
rng         = np.random.default_rng(RANDOM_SEED)

courses_data = {
    "course_id"     : np.arange(1, N_COURSES + 1),
    "course_name"   : [f"Course {i}" for i in range(1, N_COURSES + 1)],
    # Ids are drawn from the same values used in the categories/levels data.
    "category_id"   : rng.choice([1, 2, 3, 4, 5], N_COURSES),
    "level_id"      : rng.choice([101, 102, 103], N_COURSES),
    # Month is drawn from 1-12 and day from 1-28 (upper bounds are exclusive);
    # capping the day at 28 keeps the date valid for every month.
    "start_date"    : [
        datetime(2021, int(rng.integers(1, 13)), int(rng.integers(1, 29)))
        for _ in range(N_COURSES)
    ]
}

# Convert to dataframe
# The two lookup lists are passed to Spark directly; the courses dict is first
# turned into a pandas DataFrame and then converted to a Spark DataFrame.
df_categories, df_levels, df_courses = map(
    spark.createDataFrame,
    (course_categories_data, course_levels_data, pd.DataFrame(courses_data)),
)

### Save data

In [0]:
# Write data into datalake
# Each DataFrame is coalesced to a single partition before writing, and any
# existing output at the destination path is overwritten.
df_categories.coalesce(1).write.mode('overwrite').option("header", "true").csv(categories_destination_path)
df_levels.coalesce(1).write.mode('overwrite').option("header", "true").csv(levels_destination_path)
# "header" is a CSV option and has no effect on Parquet output, so it is not
# set for the courses dataset.
df_courses.coalesce(1).write.mode('overwrite').parquet(courses_destination_path)