In [2]:
import os
import sys
import spark_utils as sut

# Make PySpark launch its Python processes with the same interpreter that
# runs this kernel (sys.executable), so both sides share one environment.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
# The session is obtained from the project helper module; how it is
# configured is not visible in this notebook.
spark = sut.get_spark_session()

# Create a synthetic dataset

In [3]:
# Schema: three columns, each declared non-nullable (nullable=False).
schema = (
    StructType()
    .add("ID", StringType(), False)
    .add("amount", DoubleType(), False)
    .add("quantity", IntegerType(), False)
)

# Sample rows: three (ID, amount, quantity) records for each of three IDs.
data = [
    ("A001", 100.68, 5),
    ("A001", 150.33, 3),
    ("A001", 799.99, 4),

    ("A002", 200.45, 2),
    ("A002", 300.89, 4),
    ("A002", 899.66, 3),

    ("A003", 250.77, 6),
    ("A003", 400.13, 3),
    ("A003", 999.11, 5),
]

# Build the Spark DataFrame from the rows and the explicit schema.
df = spark.createDataFrame(data, schema)

# Display the result.
df.show()

+----+------+--------+
|  ID|amount|quantity|
+----+------+--------+
|A001|100.68|       5|
|A001|150.33|       3|
|A001|799.99|       4|
|A002|200.45|       2|
|A002|300.89|       4|
|A002|899.66|       3|
|A003|250.77|       6|
|A003|400.13|       3|
|A003|999.11|       5|
+----+------+--------+



# Grouping and Aggregations

In [4]:
# One row per ID with summary statistics of `amount`; each aggregate is
# aliased so the output column names are stable and readable.
df_by_id = df.groupBy('ID').agg(
    F.mean('amount').alias('mean_amount'),
    F.median('amount').alias('median_amount'),
    F.min('amount').alias('min_amount'),
    F.max('amount').alias('max_amount'),
    F.count('amount').alias('count_amount'),
    F.count_distinct('amount').alias('count_distinct_amount'),
    F.sum('amount').alias('sum_amount'),
    F.sum_distinct('amount').alias('sum_distinct_amount'),
)

In [5]:
# Display the aggregates. The rows are not sorted here, so the IDs can
# appear in a different order from one show() to the next.
df_by_id.show()

+----+-----------------+-------------+----------+----------+------------+---------------------+----------+-------------------+
|  ID|      mean_amount|median_amount|min_amount|max_amount|count_amount|count_distinct_amount|sum_amount|sum_distinct_amount|
+----+-----------------+-------------+----------+----------+------------+---------------------+----------+-------------------+
|A003|550.0033333333333|       400.13|    250.77|    999.11|           3|                    3|   1650.01|            1650.01|
|A002|            467.0|       300.89|    200.45|    899.66|           3|                    3|    1401.0|             1401.0|
|A001|350.3333333333333|       150.33|    100.68|    799.99|           3|                    3|    1051.0|             1051.0|
+----+-----------------+-------------+----------+----------+------------+---------------------+----------+-------------------+



In [6]:
# Display-only: the helper's result is shown but not assigned, so df_by_id
# itself keeps its unrounded values.
sut.round_numeric_cols(df_by_id).show()

+----+-----------+-------------+----------+----------+------------+---------------------+----------+-------------------+
|  ID|mean_amount|median_amount|min_amount|max_amount|count_amount|count_distinct_amount|sum_amount|sum_distinct_amount|
+----+-----------+-------------+----------+----------+------------+---------------------+----------+-------------------+
|A003|      550.0|       400.13|    250.77|    999.11|           3|                    3|   1650.01|            1650.01|
|A002|      467.0|       300.89|    200.45|    899.66|           3|                    3|    1401.0|             1401.0|
|A001|     350.33|       150.33|    100.68|    799.99|           3|                    3|    1051.0|             1051.0|
+----+-----------+-------------+----------+----------+------------+---------------------+----------+-------------------+



# Dropping Columns

In [7]:
# Columns to hide in this view. drop() returns a new DataFrame that is only
# displayed here; df_by_id is not reassigned.
cols_to_drop = [
    'count_amount',
    'count_distinct_amount',
    'sum_distinct_amount',
]
df_by_id.drop(*cols_to_drop).show()

+----+-----------------+-------------+----------+----------+----------+
|  ID|      mean_amount|median_amount|min_amount|max_amount|sum_amount|
+----+-----------------+-------------+----------+----------+----------+
|A001|350.3333333333333|       150.33|    100.68|    799.99|    1051.0|
|A002|            467.0|       300.89|    200.45|    899.66|    1401.0|
|A003|550.0033333333333|       400.13|    250.77|    999.11|   1650.01|
+----+-----------------+-------------+----------+----------+----------+



# Joining Tables

In [8]:
# Dimension table: one start date per ID, held as a dd/MM/yyyy string.
dim_schema = (
    StructType()
    .add("ID", StringType(), False)
    .add("start_date", StringType(), False)
)

dim_data = [
    ("A001", "15/01/2022"),
    ("A002", "30/06/2022"),
    ("A003", "01/03/2023"),
]

dim_df = spark.createDataFrame(dim_data, dim_schema)

# Left join on ID: every row of df is kept and gains the start_date of its ID.
df_with_dates = df.join(dim_df, on="ID", how="left")

# Display the joined table.
df_with_dates.show()

+----+------+--------+----------+
|  ID|amount|quantity|start_date|
+----+------+--------+----------+
|A001|100.68|       5|15/01/2022|
|A001|150.33|       3|15/01/2022|
|A001|799.99|       4|15/01/2022|
|A002|200.45|       2|30/06/2022|
|A002|300.89|       4|30/06/2022|
|A002|899.66|       3|30/06/2022|
|A003|250.77|       6|01/03/2023|
|A003|400.13|       3|01/03/2023|
|A003|999.11|       5|01/03/2023|
+----+------+--------+----------+



# Creating New Columns

In [9]:
# Convert start_date from its dd/MM/yyyy string form using the project helper.
# NOTE(review): this overwrites df_with_dates, so re-running the cell feeds
# the already-converted column back into the helper -- confirm that
# transform_date_cols tolerates that, or re-run the join cell first.
df_with_dates = sut.transform_date_cols(
    df=df_with_dates,
    date_cols=['start_date'],
    str_date_format='dd/MM/yyyy'
)

In [10]:
# Check the column types after the date conversion (start_date should now
# be a date rather than a string).
sut.print_schema_alphabetically(df_with_dates)

root
 |-- ID: string (nullable = false)
 |-- amount: double (nullable = false)
 |-- quantity: integer (nullable = false)
 |-- start_date: date (nullable = true)



In [17]:
# Add the line total and the tenure from start_date to the current date.
# current_date() is evaluated at run time, so the tenure values change
# depending on the day the cell is executed.
df_with_dates = (
    df_with_dates
    .withColumn('total', F.col('amount') * F.col('quantity'))
    .withColumn('tenure_in_days', F.datediff(F.current_date(), 'start_date'))
    .withColumn('tenure_in_months', F.months_between(F.current_date(), 'start_date'))
    # Years are derived from the months column added on the previous step.
    .withColumn('tenure_in_years', F.col('tenure_in_months') / 12)
)

df_with_dates.show()

+----+------+--------+----------+------------------+--------------+----------------+------------------+
|  ID|amount|quantity|start_date|             total|tenure_in_days|tenure_in_months|   tenure_in_years|
+----+------+--------+----------+------------------+--------------+----------------+------------------+
|A001|100.68|       5|2022-01-15|503.40000000000003|          1133|     37.19354839| 3.099462365833333|
|A001|150.33|       3|2022-01-15|            450.99|          1133|     37.19354839| 3.099462365833333|
|A001|799.99|       4|2022-01-15|           3199.96|          1133|     37.19354839| 3.099462365833333|
|A002|200.45|       2|2022-06-30|             400.9|           967|     31.70967742| 2.642473118333333|
|A002|300.89|       4|2022-06-30|           1203.56|           967|     31.70967742| 2.642473118333333|
|A002|899.66|       3|2022-06-30|           2698.98|           967|     31.70967742| 2.642473118333333|
|A003|250.77|       6|2023-03-01|1504.6200000000001|           7

In [18]:
# Re-check the schema now that total and the tenure_* columns have been added.
sut.print_schema_alphabetically(df_with_dates)

root
 |-- ID: string (nullable = false)
 |-- amount: double (nullable = true)
 |-- quantity: integer (nullable = false)
 |-- start_date: date (nullable = true)
 |-- tenure_in_days: integer (nullable = true)
 |-- tenure_in_months: double (nullable = true)
 |-- tenure_in_years: double (nullable = true)
 |-- total: double (nullable = true)



In [19]:
# Only these two columns are passed to the rounding helper; tenure_in_years
# is not listed and therefore stays at full precision.
cols_to_round = ['total','tenure_in_months']
# NOTE(review): this overwrites df_with_dates with the helper's result, so
# the unrounded values are no longer available after this cell.
df_with_dates = sut.round_given_cols(df_with_dates, cols_to_round)
df_with_dates.show()

+----+------+--------+----------+-------+--------------+----------------+------------------+
|  ID|amount|quantity|start_date|  total|tenure_in_days|tenure_in_months|   tenure_in_years|
+----+------+--------+----------+-------+--------------+----------------+------------------+
|A001|100.68|       5|2022-01-15|  503.4|          1133|           37.19| 3.099462365833333|
|A001|150.33|       3|2022-01-15| 450.99|          1133|           37.19| 3.099462365833333|
|A001|799.99|       4|2022-01-15|3199.96|          1133|           37.19| 3.099462365833333|
|A002|200.45|       2|2022-06-30|  400.9|           967|           31.71| 2.642473118333333|
|A002|300.89|       4|2022-06-30|1203.56|           967|           31.71| 2.642473118333333|
|A002|899.66|       3|2022-06-30|2698.98|           967|           31.71| 2.642473118333333|
|A003|250.77|       6|2023-03-01|1504.62|           723|           23.65|1.9704301075000001|
|A003|400.13|       3|2023-03-01|1200.39|           723|           23.