In [1]:
from pyspark.sql import SparkSession
# Get the active Spark session, or create one, with the application name "Reporting".
session_builder = SparkSession.builder.appName("Reporting")
spark = session_builder.getOrCreate()

In [2]:
from pyspark.sql import functions as F
# Rated CDRs, loaded from the "rated_cdrs/" parquet directory; reused by later cells.
rated = spark.read.parquet("rated_cdrs/")

# 1) Top 10 data consumers.
# NOTE(review): the original comment said "for the current month", but no date
# filter is applied here -- presumably rated_cdrs/ only holds that month; confirm.
data_usage = rated.filter(
    "record_type = 'data' AND rating_status IN ('rated','imputed')"
)

# One row per customer: summed billable units and summed cost.
usage_per_customer = data_usage.groupBy("customer_id").agg(
    F.sum("billable_units").alias("total_MB"),
    F.sum("cost").alias("amount_due"),
)

# Keep the ten customers with the largest total volume.
top10 = usage_per_customer.orderBy(F.col("total_MB").desc()).limit(10)

(top10.write
    .mode("overwrite")
    .option("header", True)
    .csv("report/top10_consumers.csv"))

In [3]:
# 2) Revenue (tax included) per rate plan
# Billing data, loaded from the "billing/" parquet directory; reused by later cells.
facturation_complete = spark.read.parquet("billing/")

# Sum the amount due for each rate plan.
revenue_per_plan = facturation_complete.groupBy("rate_plan_id").agg(
    F.sum("amount_due").alias("revenue_TTC")
)

# Highest-earning plans first.
rev_plan = revenue_per_plan.orderBy(F.col("revenue_TTC").desc())

(rev_plan.write
    .mode("overwrite")
    .option("header", True)
    .csv("report/revenue_by_plan.csv"))


In [4]:
# 3) Revenue of each record_type per region
# a) Attach the region to the rated CDRs.
#    The billing table is reduced to one row per customer before joining: if a
#    customer appears on several billing rows, a plain join would duplicate each
#    CDR once per billing row and inflate the summed revenue below.
#    NOTE(review): if a customer can carry several regions in billing/, the one
#    kept by dropDuplicates is arbitrary -- confirm customer_id -> region is 1:1.
customer_region = (facturation_complete
    .select("customer_id", "region")
    .dropDuplicates(["customer_id"])
)
rated_reg = rated.join(customer_region, "customer_id", "left")

# b) Aggregate revenue by service (record_type) and region.
#    Left join: CDRs whose customer is absent from billing end up under a null region.
#    NOTE(review): only 'rated' records are kept here, whereas the other reports
#    in this notebook also keep 'imputed' -- confirm this difference is intended.
mix_region = (rated_reg
    .filter(F.col("rating_status") == "rated")
    .groupBy("region", "record_type")
    .agg(F.sum("cost").alias("revenue"))
    .orderBy("region")
)

mix_region.write.mode("overwrite")\
    .option("header", True)\
    .csv("report/mix_by_region.csv")

In [5]:
# 4) Revenue of each rate plan per region
# Sum the amount due for every (region, rate plan) pair.
revenue_per_region_plan = facturation_complete.groupBy("region", "rate_plan_id").agg(
    F.sum("amount_due").alias("revenue_TTC")
)

# Sort by region, then by descending revenue within each region.
plan_region = revenue_per_region_plan.orderBy(
    "region", F.col("revenue_TTC").desc()
)

(plan_region.write
    .mode("overwrite")
    .option("header", True)
    .csv("report/plan_by_region.csv"))


In [6]:
from pyspark.sql import functions as F

# Daily revenue: total cost of rated and imputed records per calendar day.
billable_records = rated.filter(F.col("rating_status").isin("rated", "imputed"))

# Calendar date derived from the record timestamp.
billing_day = F.to_date("timestamp").alias("billing_date")

# Total cost per day, rounded to 4 decimals.
revenue_per_day = billable_records.groupBy(billing_day).agg(
    F.round(F.sum("cost"), 4).alias("revenue_TTC")
)

daily_revenue = revenue_per_day.orderBy("billing_date")

(daily_revenue.write
    .mode("overwrite")
    .option("header", True)
    .csv("report/daily_revenue.csv"))


In [7]:
# Hourly data traffic: summed volume of rated and imputed data records per date and hour.
data_records = (
    rated
    .filter(F.col("record_type") == "data")
    .filter(F.col("rating_status").isin("rated", "imputed"))
)

# Grouping keys derived from the record timestamp.
billing_day = F.to_date("timestamp").alias("billing_date")   # kept for optional filtering
hour_of_day = F.hour("timestamp").alias("hour")              # 0-23

# billable_units is divided by 1024 per record before summing
# (MB -> GB per the original comment); the total is rounded to 3 decimals.
volume_gb = F.col("billable_units") / 1024

traffic_per_hour = data_records.groupBy(billing_day, hour_of_day).agg(
    F.round(F.sum(volume_gb), 3).alias("total_GB")
)

traffic_hourly = traffic_per_hour.orderBy("billing_date", "hour")

(traffic_hourly.write
    .mode("overwrite")
    .option("header", True)
    .csv("report/traffic_hourly.csv"))
