In [41]:
from pyspark.sql import SparkSession
import os

# Build the PostgreSQL JDBC driver path once with os.path.join so the notebook is
# not tied to Windows path separators; on Windows this produces the same
# ".\conf\postgresql-42.7.6.jar" string that was previously hard-coded twice.
POSTGRES_JDBC_JAR = os.path.join(".", "conf", "postgresql-42.7.6.jar")

# Create (or reuse) the Spark session for the billing notebook.
spark = (SparkSession.builder
         .appName("BillingEngine")
         # Make the JDBC driver jar available to the application ...
         .config("spark.jars", POSTGRES_JDBC_JAR)
         # ... and put it on the driver's own classpath as well.
         .config("spark.driver.extraClassPath", POSTGRES_JDBC_JAR)
         .getOrCreate())

In [42]:
# Load the rated CDR dataset from its parquet directory and print its schema.
rated = spark.read.format("parquet").load("rated_cdrs/")
rated.printSchema()


root
 |-- customer_id: string (nullable = true)
 |-- rate_plan_id: string (nullable = true)
 |-- product_code: string (nullable = true)
 |-- record_ID: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- cell_id: string (nullable = true)
 |-- technology: string (nullable = true)
 |-- caller_id: string (nullable = true)
 |-- callee_id: string (nullable = true)
 |-- duration_sec: integer (nullable = true)
 |-- rating_status: string (nullable = true)
 |-- sender_id: string (nullable = true)
 |-- receiver_id: string (nullable = true)
 |-- receiver_cc: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- session_duration_sec: string (nullable = true)
 |-- data_volume_mb: string (nullable = true)
 |-- batch_id: integer (nullable = true)
 |-- record_type: string (nullable = true)
 |-- unit: string (nullable = true)
 |-- rate_type: string (nullable = true)
 |-- description: string (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- free_units:

In [43]:
# Keep only the records whose rating_status is exactly "rated".
rated_ok = rated.where(rated["rating_status"] == "rated")

In [44]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, lit

# Map each record_type value to the name of its subtotal column.
subtotal_aliases = {
    "voice": "voice_total",
    "sms": "sms_total",
    "data": "data_total",
}

# Subtotals: sum the cost of the matching record_type, counting every other row as 0.
subtotal_columns = [
    F.sum(when(col("record_type") == record_type, col("cost")).otherwise(0)).alias(alias)
    for record_type, alias in subtotal_aliases.items()
]

# One row per (customer_id, billing_period): the three subtotals, then the
# pre-tax total over all rated records of the group.
invoices = rated_ok.groupBy("customer_id", "billing_period").agg(
    *subtotal_columns,
    F.sum(col("cost")).alias("amount_before_tax"),
)


In [45]:
# Tax rate ("TVA") applied to the discounted amount.
TVA = 0.20

# Every invoice gets the same 5 % discount rate as a column.
invoices = invoices.withColumn("discount_pct", lit(0.05))

# Amount left once the discount has been taken off the pre-tax total.
invoices = invoices.withColumn(
    "after_discount",
    col("amount_before_tax") * (1 - col("discount_pct")),
)

# Tax is computed on the discounted amount; the amount due is the sum of both.
invoices = invoices.withColumn("tax", col("after_discount") * lit(TVA))
invoices = invoices.withColumn("amount_due", col("after_discount") + col("tax"))

invoices.show(truncate=False)


+------------+--------------+-----------+---------+------------------+------------------+------------+------------------+--------------------+--------------------+
|customer_id |billing_period|voice_total|sms_total|data_total        |amount_before_tax |discount_pct|after_discount    |tax                 |amount_due          |
+------------+--------------+-----------+---------+------------------+------------------+------------+------------------+--------------------+--------------------+
|212667966927|2025-06       |0.0        |0.0      |21.24034          |21.24034          |0.05        |20.178323         |4.0356646           |24.2139876          |
|212634834832|2025-06       |0.0        |0.0      |12.544200000000002|12.544200000000002|0.05        |11.916990000000002|2.3833980000000006  |14.300388000000002  |
|212747219893|2025-06       |0.048      |0.0      |0.0               |0.048             |0.05        |0.0456            |0.009120000000000001|0.054720000000000005|
|212687695748|20

In [46]:
import os

# Database connection settings are taken from the environment so that real
# credentials never have to be typed into the notebook. The fallbacks are the
# previous hard-coded local-development values, so the cell still runs unchanged
# when none of the variables are set.
customers_jdbc_url = os.environ.get(
    "BILLING_JDBC_URL", "jdbc:postgresql://localhost:5432/projet_spark"
)
customers_db_user = os.environ.get("BILLING_DB_USER", "postgres")
customers_db_password = os.environ.get("BILLING_DB_PASSWORD", "0000")

# Load the "customers" table from PostgreSQL through the JDBC data source.
customersDf = (
    spark.read
    .format("jdbc")
    .option("url", customers_jdbc_url)
    .option("dbtable", "customers")
    .option("user", customers_db_user)
    .option("password", customers_db_password)
    .load()
)

# NOTE(review): this preview prints customer names; clear the cell output before
# sharing the notebook.
customersDf.show(5)

+------------+-----------------+-----------------+------------+---------------+------+--------------------+
| customer_id|    customer_name|subscription_type|rate_plan_id|activation_date|status|              region|
+------------+-----------------+-----------------+------------+---------------+------+--------------------+
|212768116861|    Anwar Kettani|         postpaid|      PLAN_C|     2024-04-29|active|Béni Mellal-Khénifra|
|212760204113|Naoual El Othmani|         postpaid|      PLAN_A|     2024-12-07|active|      Drâa-Tafilalet|
|212704344822|     Ehab Ouchrif|         postpaid|      PLAN_A|     2023-07-29|active|Laâyoune-Sakia El...|
|212658901536|     Selma Msaddi|         postpaid|      PLAN_C|     2023-06-16|active|         Souss-Massa|
|212699900093| Brahim El Hilali|         postpaid|      PLAN_A|     2024-07-27|active|Dakhla-Oued Ed Dahab|
+------------+-----------------+-----------------+------------+---------------+------+--------------------+
only showing top 5 rows



In [47]:
from pyspark.sql import functions as F

# 1) Billing period to invoice (e.g. June 2025); this could become a loop over
#    several periods.
mois_courant = "2025-06"

# 2) Eligible customers: status "active" and subscription_type "postpaid",
#    tagged with the period being billed.
est_actif = F.col("status") == "active"
est_postpaid = F.col("subscription_type") == "postpaid"
clients_elig = (
    customersDf
    .where(est_actif & est_postpaid)
    .select("customer_id", "region", "rate_plan_id")
    .withColumn("billing_period", F.lit(mois_courant))
)

# 3) Left join customers <-> invoices of the current period: every eligible
#    customer is kept, and invoice columns are NULL where no invoice row matches.
invoices_periode = invoices.where(F.col("billing_period") == mois_courant)
clients_avec_factures = clients_elig.join(
    invoices_periode,
    on=["customer_id", "billing_period"],
    how="left",
)

# 4) Replace the NULLs: amounts default to 0, the discount rate to 0.05.
valeurs_par_defaut = dict.fromkeys(
    [
        "voice_total",
        "sms_total",
        "data_total",
        "amount_before_tax",
        "after_discount",
        "tax",
        "amount_due",
    ],
    0.0,
)
valeurs_par_defaut["discount_pct"] = 0.05

facturation_complete = clients_avec_factures.fillna(valeurs_par_defaut)

facturation_complete.show()


+------------+--------------+--------------------+------------+-----------+---------+------------------+------------------+------------+-------------------+--------------------+--------------------+
| customer_id|billing_period|              region|rate_plan_id|voice_total|sms_total|        data_total| amount_before_tax|discount_pct|     after_discount|                 tax|          amount_due|
+------------+--------------+--------------------+------------+-----------+---------+------------------+------------------+------------+-------------------+--------------------+--------------------+
|212768116861|       2025-06|Béni Mellal-Khénifra|      PLAN_C|        0.0|      0.0|33.636779999999995|33.636779999999995|        0.05| 31.954940999999994|   6.390988199999999|   38.34592919999999|
|212760204113|       2025-06|      Drâa-Tafilalet|      PLAN_A|        0.0|      0.0|           40.4347|           40.4347|        0.05|          38.412965|   7.682593000000001|           46.095558|
|2127

In [48]:
# Persist the billing result as parquet, one sub-directory per billing_period.
# With the default (static) overwrite mode, mode("overwrite") wipes every existing
# partition under the target path, so writing one month would delete all previously
# billed months. Dynamic partition overwrite replaces only the partitions present
# in this DataFrame.
(facturation_complete.write
 .mode("overwrite")
 .option("partitionOverwriteMode", "dynamic")
 .partitionBy("billing_period")
 .parquet("billing/facturation_complete/"))


In [49]:
import os

# Database connection settings are taken from the environment so that real
# credentials never have to be typed into the notebook. The fallbacks are the
# previous hard-coded local-development values, so the cell still runs unchanged
# when none of the variables are set.
billing_jdbc_url = os.environ.get(
    "BILLING_JDBC_URL", "jdbc:postgresql://localhost:5432/projet_spark"
)
billing_db_user = os.environ.get("BILLING_DB_USER", "postgres")
billing_db_password = os.environ.get("BILLING_DB_PASSWORD", "0000")

# Write the billing result to the "facturation_complete" PostgreSQL table.
# NOTE(review): mode("overwrite") replaces the whole table, so rows written for
# other billing periods are not kept — confirm this is intended.
(facturation_complete.write
 .format("jdbc")
 .option("url", billing_jdbc_url)
 .option("dbtable", "facturation_complete")
 .option("user", billing_db_user)
 .option("password", billing_db_password)
 .mode("overwrite")
 .save())


In [50]:
# Preview the billing result (first 20 rows, long values truncated).
facturation_complete.show(n=20, truncate=True)

+------------+--------------+--------------------+------------+-----------+---------+------------------+------------------+------------+-------------------+--------------------+--------------------+
| customer_id|billing_period|              region|rate_plan_id|voice_total|sms_total|        data_total| amount_before_tax|discount_pct|     after_discount|                 tax|          amount_due|
+------------+--------------+--------------------+------------+-----------+---------+------------------+------------------+------------+-------------------+--------------------+--------------------+
|212768116861|       2025-06|Béni Mellal-Khénifra|      PLAN_C|        0.0|      0.0|33.636779999999995|33.636779999999995|        0.05| 31.954940999999994|   6.390988199999999|   38.34592919999999|
|212760204113|       2025-06|      Drâa-Tafilalet|      PLAN_A|        0.0|      0.0|           40.4347|           40.4347|        0.05|          38.412965|   7.682593000000001|           46.095558|
|2127