In [1]:
import os

from pyspark.sql import SparkSession
# Build (or reuse) the Spark session for the billing job.
# The same PostgreSQL JDBC driver jar is supplied for both the
# `spark.jars` and `spark.driver.extraClassPath` settings.
pg_driver_jar = r".\conf\postgresql-42.7.6.jar"

session_builder = SparkSession.builder.appName("BillingEngine")
for conf_key in ("spark.jars", "spark.driver.extraClassPath"):
    session_builder = session_builder.config(conf_key, pg_driver_jar)

spark = session_builder.getOrCreate()

In [2]:
# Load the rated CDR dataset from the Parquet directory and print its schema.
rated = spark.read.format("parquet").load("rated_cdrs/")
rated.printSchema()


root
 |-- customer_id: string (nullable = true)
 |-- rate_plan_id: string (nullable = true)
 |-- product_code: string (nullable = true)
 |-- record_ID: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- cell_id: string (nullable = true)
 |-- technology: string (nullable = true)
 |-- caller_id: string (nullable = true)
 |-- callee_id: string (nullable = true)
 |-- duration_sec: integer (nullable = true)
 |-- rating_status: string (nullable = true)
 |-- sender_id: string (nullable = true)
 |-- receiver_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- session_duration_sec: string (nullable = true)
 |-- data_volume_mb: string (nullable = true)
 |-- batch_id: integer (nullable = true)
 |-- record_type: string (nullable = true)
 |-- unit: string (nullable = true)
 |-- rate_type: string (nullable = true)
 |-- description: string (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- free_units: integer (nullable = true)
 |-- tier_thresh

In [3]:
# Keep only the records whose rating_status is exactly "rated".
rated_ok = rated.where(rated["rating_status"] == "rated")

In [4]:
from pyspark.sql import functions as F
from pyspark.sql.functions import when, col,lit

# One invoice row per (customer_id, billing_period), built from rated records.
record_kinds = ("voice", "sms", "data")

# Monetary subtotal per record type: cost counts only when the type matches.
subtotal_cols = [
    F.sum(
        F.when(F.col("record_type") == kind, F.col("cost")).otherwise(0)
    ).alias(f"{kind}_total")
    for kind in record_kinds
]

# Number of events per record type (1 per matching record, 0 otherwise).
event_count_cols = [
    F.sum(
        F.when(F.col("record_type") == kind, 1).otherwise(0)
    ).alias(f"{kind}_count")
    for kind in record_kinds
]

# Pre-tax total over every record of the group.
pre_tax_col = F.sum("cost").alias("amount_before_tax")

invoices = (
    rated_ok
    .groupBy("customer_id", "billing_period")
    .agg(*subtotal_cols, *event_count_cols, pre_tax_col)
)


In [5]:
# VAT rate applied to the discounted amount.
TVA = 0.20

# Derive the billed amounts step by step; each column builds on the previous one.
# NOTE(review): amounts stay unrounded doubles (e.g. 7.823956799999999 in the
# output) — confirm whether rounding to currency precision is expected downstream.
discounted = col("amount_before_tax") * (1 - col("discount_pct"))
vat_amount = col("after_discount") * TVA
total_due = col("after_discount") + col("tax")

invoices = invoices.withColumn("discount_pct", lit(0.05))   # flat 5 % discount
invoices = invoices.withColumn("after_discount", discounted)
invoices = invoices.withColumn("tax", vat_amount)
invoices = invoices.withColumn("amount_due", total_due)

invoices.show(truncate=False)


+------------+--------------+-----------+---------+------------------+-----------+---------+----------+------------------+------------+------------------+--------------------+------------------+
|customer_id |billing_period|voice_total|sms_total|data_total        |voice_count|sms_count|data_count|amount_before_tax |discount_pct|after_discount    |tax                 |amount_due        |
+------------+--------------+-----------+---------+------------------+-----------+---------+----------+------------------+------------+------------------+--------------------+------------------+
|212640535826|2025-05       |0.0        |0.0      |0.0               |1          |0        |0         |0.0               |0.05        |0.0               |0.0                 |0.0               |
|212722744719|2025-05       |0.0        |0.0      |0.0               |1          |0        |0         |0.0               |0.05        |0.0               |0.0                 |0.0               |
|212799474898|2025-05    

In [6]:
# Load the `customers` table from the PostgreSQL database over JDBC.
#
# Credentials are read from the PGUSER / PGPASSWORD environment variables so
# real secrets never have to be committed with the notebook. The fallbacks are
# the previous local-development values, kept only so existing setups still run.
customersDf = (
    spark.read
    .format("jdbc")
    .option("url",      "jdbc:postgresql://localhost:5432/projet_spark")
    .option("dbtable",  "customers")
    .option("user",     os.environ.get("PGUSER", "postgres"))
    .option("password", os.environ.get("PGPASSWORD", "0000"))
    .load()
)

# Preview a few rows to confirm the load worked.
customersDf.show(5)

+------------+----------------+-----------------+------------+---------------+------+--------------------+
| customer_id|   customer_name|subscription_type|rate_plan_id|activation_date|status|              region|
+------------+----------------+-----------------+------------+---------------+------+--------------------+
|212621008730|  Fouad El Ghazi|         postpaid|      PLAN_B|     2024-09-30|active|Béni Mellal-Khénifra|
|212680058037|Soufiane Belkadi|         postpaid|      PLAN_A|     2023-10-16|active|  Rabat-Salé-Kénitra|
|212705083484|    Rania Skalli|         postpaid|      PLAN_C|     2023-09-13|active|          L'Oriental|
|212742643119|    Hind Belkadi|         postpaid|      PLAN_B|     2024-10-09|active|   Guelmim-Oued Noun|
|212625950696|  Ehab El Amrani|         postpaid|      PLAN_C|     2024-12-16|active|      Marrakech-Safi|
+------------+----------------+-----------------+------------+---------------+------+--------------------+
only showing top 5 rows



In [7]:
from pyspark.sql import functions as F

# 1) Billing period to invoice, formatted yyyy-MM (here: May 2025).
#    A loop over several periods could replace this single value.
mois_courant = "2025-05"

# 2) Eligible customers: active postpaid subscribers, tagged with the period
#    so they can be joined to the invoices on (customer_id, billing_period).
clients_elig = (customersDf
    .filter( (F.col("status") == "active") &
             (F.col("subscription_type") == "postpaid") )
    .select("customer_id","region","rate_plan_id")
    .withColumn("billing_period", F.lit(mois_courant))
)

# 3) Left join customers -> invoices: every eligible customer gets a row,
#    even when no invoice was computed for the period.
facturation_complete = (clients_elig.join(
        invoices.filter(F.col("billing_period") == mois_courant),
        on=["customer_id", "billing_period"],
        how="left")
    # 4) Replace the NULLs produced by the left join with neutral values.
    #    The event counts are included so customers without an invoice row
    #    get 0 instead of NULL, like the monetary columns.
    .fillna({
        "voice_total":          0.0,
        "sms_total":            0.0,
        "data_total":           0.0,
        "voice_count":          0,
        "sms_count":            0,
        "data_count":           0,
        "amount_before_tax":    0.0,
        # Same flat discount rate as the one set when invoices are computed.
        "discount_pct":         0.05,
        "after_discount":       0.0,
        "tax":                  0.0,
        "amount_due":           0.0
    })
)

facturation_complete.show()


+------------+--------------+--------------------+------------+-----------+---------+-----------------+-----------+---------+----------+-----------------+------------+-----------------+------------------+-----------------+
| customer_id|billing_period|              region|rate_plan_id|voice_total|sms_total|       data_total|voice_count|sms_count|data_count|amount_before_tax|discount_pct|   after_discount|               tax|       amount_due|
+------------+--------------+--------------------+------------+-----------+---------+-----------------+-----------+---------+----------+-----------------+------------+-----------------+------------------+-----------------+
|212621008730|       2025-05|Béni Mellal-Khénifra|      PLAN_B|        0.0|      0.0|6.863119999999999|          0|        0|         1|6.863119999999999|        0.05|6.519963999999999|1.3039927999999998|7.823956799999999|
|212680058037|       2025-05|  Rabat-Salé-Kénitra|      PLAN_A|        0.0|      0.0|              0.0|     

In [8]:
# Persist the billing table as Parquet, one directory per billing_period.
# With the default (static) overwrite mode, mode("overwrite") would delete
# every existing partition under "billing", including periods that are not in
# this DataFrame. Dynamic partition overwrite replaces only the partitions
# actually written by this run.
(facturation_complete.write
    .mode("overwrite")
    .option("partitionOverwriteMode", "dynamic")
    .partitionBy("billing_period")
    .parquet("billing"))


In [9]:
# Write the billing table to PostgreSQL (`facturation_complete`) over JDBC.
#
# Credentials are read from the PGUSER / PGPASSWORD environment variables so
# real secrets never have to be committed with the notebook. The fallbacks are
# the previous local-development values, kept only so existing setups still run.
#
# NOTE(review): mode("overwrite") replaces the whole target table, so only the
# rows of this DataFrame remain after the write — confirm this is intended if
# several billing periods must coexist in the table.
(facturation_complete.write
 .format("jdbc")
 .option("url",      "jdbc:postgresql://localhost:5432/projet_spark")
 .option("dbtable",  "facturation_complete")
 .option("user",     os.environ.get("PGUSER", "postgres"))
 .option("password", os.environ.get("PGPASSWORD", "0000"))
 .mode("overwrite")
 .save())


In [10]:
# Display the completed billing table using show()'s default settings
# (first 20 rows, long values truncated).
facturation_complete.show(n=20, truncate=True)

+------------+--------------+--------------------+------------+-----------+---------+-----------------+-----------+---------+----------+-----------------+------------+-----------------+------------------+-----------------+
| customer_id|billing_period|              region|rate_plan_id|voice_total|sms_total|       data_total|voice_count|sms_count|data_count|amount_before_tax|discount_pct|   after_discount|               tax|       amount_due|
+------------+--------------+--------------------+------------+-----------+---------+-----------------+-----------+---------+----------+-----------------+------------+-----------------+------------------+-----------------+
|212621008730|       2025-05|Béni Mellal-Khénifra|      PLAN_B|        0.0|      0.0|6.863119999999999|          0|        0|         1|6.863119999999999|        0.05|6.519963999999999|1.3039927999999998|7.823956799999999|
|212680058037|       2025-05|  Rabat-Salé-Kénitra|      PLAN_A|        0.0|      0.0|              0.0|     