In [None]:
from pyspark.sql import functions as F

def calculate_total_loans_per_school(df):
    """Sum the FFEL loan amounts originated for each school.

    Args:
        df: DataFrame with a ``SchoolName`` column and the four
            ``ffel_*_amount_of_loans_originated`` columns.

    Returns:
        A DataFrame with one row per ``SchoolName`` and the columns
        ``Total_Subsidized``, ``Total_Unsubsidized``, ``Total_Stafford``,
        ``Total_PLUS`` and ``Total_Loans`` (the sum of the four totals).
    """
    loan_totals = (
        ("Total_Subsidized", "ffel_subsidized_amount_of_loans_originated"),
        ("Total_Unsubsidized", "ffel_unsubsidized_amount_of_loans_originated"),
        ("Total_Stafford", "ffel_stafford_amount_of_loans_originated"),
        ("Total_PLUS", "ffel_plus_amount_of_loans_originated"),
    )
    per_school = df.groupBy("SchoolName").agg(
        *[F.sum(source).alias(alias) for alias, source in loan_totals]
    )

    # F.sum yields NULL for a group with no non-null value, and NULL + x is
    # NULL, so a single empty loan type would wipe out the whole total.
    # Treat a missing component as 0 in the grand total only; the per-type
    # columns keep their NULL so "no data" stays distinguishable there.
    components = [F.coalesce(F.col(alias), F.lit(0)) for alias, _ in loan_totals]
    grand_total = components[0]
    for component in components[1:]:
        grand_total = grand_total + component

    return per_school.withColumn("Total_Loans", grand_total)


In [None]:
def calculate_total_loans_per_state(df):
    """Sum the FFEL loan amounts originated for each state.

    Args:
        df: DataFrame with a ``State`` column and the four
            ``ffel_*_amount_of_loans_originated`` columns.

    Returns:
        A DataFrame with one row per ``State`` and the columns
        ``Total_Subsidized``, ``Total_Unsubsidized``, ``Total_Stafford``,
        ``Total_PLUS`` and ``Total_Loans`` (the sum of the four totals).
    """
    loan_totals = (
        ("Total_Subsidized", "ffel_subsidized_amount_of_loans_originated"),
        ("Total_Unsubsidized", "ffel_unsubsidized_amount_of_loans_originated"),
        ("Total_Stafford", "ffel_stafford_amount_of_loans_originated"),
        ("Total_PLUS", "ffel_plus_amount_of_loans_originated"),
    )
    per_state = df.groupBy("State").agg(
        *[F.sum(source).alias(alias) for alias, source in loan_totals]
    )

    # F.sum yields NULL for a group with no non-null value, and NULL + x is
    # NULL, so a single empty loan type would wipe out the whole total.
    # Treat a missing component as 0 in the grand total only; the per-type
    # columns keep their NULL so "no data" stays distinguishable there.
    components = [F.coalesce(F.col(alias), F.lit(0)) for alias, _ in loan_totals]
    grand_total = components[0]
    for component in components[1:]:
        grand_total = grand_total + component

    return per_state.withColumn("Total_Loans", grand_total)


In [None]:
def calculate_loan_evolution(df):
    """Sum the FFEL loan amounts originated for each quarter.

    Args:
        df: DataFrame with ``Quarter_Start`` and ``Quarter_End`` columns and
            the four ``ffel_*_amount_of_loans_originated`` columns.

    Returns:
        A DataFrame with one row per (``Quarter_Start``, ``Quarter_End``)
        pair, ordered by ``Quarter_Start``, with the columns
        ``Total_Subsidized``, ``Total_Unsubsidized``, ``Total_Stafford``,
        ``Total_PLUS`` and ``Total_Loans`` (the sum of the four totals).
    """
    loan_totals = (
        ("Total_Subsidized", "ffel_subsidized_amount_of_loans_originated"),
        ("Total_Unsubsidized", "ffel_unsubsidized_amount_of_loans_originated"),
        ("Total_Stafford", "ffel_stafford_amount_of_loans_originated"),
        ("Total_PLUS", "ffel_plus_amount_of_loans_originated"),
    )
    per_quarter = df.groupBy("Quarter_Start", "Quarter_End").agg(
        *[F.sum(source).alias(alias) for alias, source in loan_totals]
    )

    # F.sum yields NULL for a group with no non-null value, and NULL + x is
    # NULL, so a single empty loan type would wipe out the whole total.
    # Treat a missing component as 0 in the grand total only; the per-type
    # columns keep their NULL so "no data" stays distinguishable there.
    components = [F.coalesce(F.col(alias), F.lit(0)) for alias, _ in loan_totals]
    grand_total = components[0]
    for component in components[1:]:
        grand_total = grand_total + component

    return per_quarter.withColumn("Total_Loans", grand_total).orderBy("Quarter_Start")


In [None]:
def save_metrics(df, path, partition_by=None):
    """Write ``df`` to ``path`` as parquet, overwriting any existing output.

    Args:
        df: DataFrame to persist.
        path: Destination path passed to the parquet writer.
        partition_by: Optional value forwarded to ``partitionBy``; when it is
            falsy the output is written without partitioning.
    """
    writer = df.write.mode("overwrite")
    if partition_by:
        writer = writer.partitionBy(partition_by)
    writer.parquet(path)


In [None]:
from pyspark.sql import SparkSession, functions as F

def main():
    """Compute the loan metrics from the joined dataset and store them on HDFS.

    Reads the joined parquet data, builds the per-school, per-state and
    per-quarter totals, displays the first five per-school rows, and
    overwrites the three output parquet datasets. The Spark session is
    stopped even if one of these steps fails.
    """
    spark = SparkSession.builder.appName("CalculateMetrics").getOrCreate()
    try:
        base_dir = "hdfs://localhost:9080/user/anthonycormeaux/data"

        # Read the joined DataFrame from HDFS.
        df = spark.read.parquet(base_dir + "/result/joined_data")

        # Compute the metrics.
        total_loans_per_school = calculate_total_loans_per_school(df)
        total_loans_per_state = calculate_total_loans_per_state(df)
        loan_evolution = calculate_loan_evolution(df)

        # show() prints the rows itself and returns None, so wrapping it in
        # print() would only add a stray "None" line to the output.
        total_loans_per_school.show(5)

        # Store the results.
        save_metrics(total_loans_per_school, base_dir + "/total_loans_per_school")
        save_metrics(total_loans_per_state, base_dir + "/total_loans_per_state")
        save_metrics(loan_evolution, base_dir + "/loan_evolution")
    finally:
        # Release the session even when reading, computing or writing raises.
        spark.stop()

# Run the job only when this module is executed directly, not when imported.
if __name__ == "__main__":
    main()
