In [1]:
from pyspark.sql import functions as F

def calculate_total_loans_per_school(df):
    """Aggregate the FFEL loan-amount columns per school.

    Groups ``df`` by ``SchoolName`` and sums each of the four
    ``ffel_*_amount_of_loans_originated`` columns, then adds a
    ``Total_Loans`` column holding the sum of those four totals.

    Args:
        df: Spark DataFrame containing ``SchoolName`` and the four
            ``ffel_*_amount_of_loans_originated`` columns.

    Returns:
        Spark DataFrame with columns ``SchoolName``, ``Total_Subsidized``,
        ``Total_Unsubsidized``, ``Total_Stafford``, ``Total_PLUS`` and
        ``Total_Loans`` (one row per school).
    """
    # Output alias -> source column; dict order fixes the output column order.
    loan_columns = {
        "Total_Subsidized": "ffel_subsidized_amount_of_loans_originated",
        "Total_Unsubsidized": "ffel_unsubsidized_amount_of_loans_originated",
        "Total_Stafford": "ffel_stafford_amount_of_loans_originated",
        "Total_PLUS": "ffel_plus_amount_of_loans_originated",
    }
    totals = df.groupBy("SchoolName").agg(
        *[F.sum(source).alias(alias) for alias, source in loan_columns.items()]
    )

    # F.sum yields NULL for a group whose inputs are all NULL, and NULL + x is
    # NULL in Spark SQL. Treat a missing category as 0 in the grand total so a
    # single empty category does not wipe out Total_Loans. The per-category
    # columns themselves are left untouched (still NULL when there is no data).
    safe = [F.coalesce(F.col(alias), F.lit(0)) for alias in loan_columns]
    grand_total = safe[0] + safe[1] + safe[2] + safe[3]

    return totals.withColumn("Total_Loans", grand_total)


In [2]:
def calculate_total_loans_per_state(df):
    """Aggregate the FFEL loan-amount columns per state.

    Groups ``df`` by ``State`` and sums each of the four
    ``ffel_*_amount_of_loans_originated`` columns, then adds a
    ``Total_Loans`` column holding the sum of those four totals.

    Args:
        df: Spark DataFrame containing ``State`` and the four
            ``ffel_*_amount_of_loans_originated`` columns.

    Returns:
        Spark DataFrame with columns ``State``, ``Total_Subsidized``,
        ``Total_Unsubsidized``, ``Total_Stafford``, ``Total_PLUS`` and
        ``Total_Loans`` (one row per state).
    """
    # Output alias -> source column; dict order fixes the output column order.
    loan_columns = {
        "Total_Subsidized": "ffel_subsidized_amount_of_loans_originated",
        "Total_Unsubsidized": "ffel_unsubsidized_amount_of_loans_originated",
        "Total_Stafford": "ffel_stafford_amount_of_loans_originated",
        "Total_PLUS": "ffel_plus_amount_of_loans_originated",
    }
    totals_by_state = df.groupBy("State").agg(
        *[F.sum(source).alias(alias) for alias, source in loan_columns.items()]
    )

    # F.sum yields NULL for a group whose inputs are all NULL, and NULL + x is
    # NULL in Spark SQL. Treat a missing category as 0 in the grand total so a
    # single empty category does not wipe out Total_Loans. The per-category
    # columns themselves are left untouched (still NULL when there is no data).
    safe = [F.coalesce(F.col(alias), F.lit(0)) for alias in loan_columns]
    grand_total = safe[0] + safe[1] + safe[2] + safe[3]

    return totals_by_state.withColumn("Total_Loans", grand_total)


In [3]:
def calculate_loan_evolution(df):
    """Aggregate the FFEL loan-amount columns per quarter.

    Groups ``df`` by ``(Quarter_Start, Quarter_End)`` and sums each of the
    four ``ffel_*_amount_of_loans_originated`` columns, adds a
    ``Total_Loans`` column holding the sum of those four totals, and orders
    the result by ``Quarter_Start``.

    Args:
        df: Spark DataFrame containing ``Quarter_Start``, ``Quarter_End`` and
            the four ``ffel_*_amount_of_loans_originated`` columns.

    Returns:
        Spark DataFrame with columns ``Quarter_Start``, ``Quarter_End``,
        ``Total_Subsidized``, ``Total_Unsubsidized``, ``Total_Stafford``,
        ``Total_PLUS`` and ``Total_Loans``, sorted by ``Quarter_Start``.
    """
    # Output alias -> source column; dict order fixes the output column order.
    loan_columns = {
        "Total_Subsidized": "ffel_subsidized_amount_of_loans_originated",
        "Total_Unsubsidized": "ffel_unsubsidized_amount_of_loans_originated",
        "Total_Stafford": "ffel_stafford_amount_of_loans_originated",
        "Total_PLUS": "ffel_plus_amount_of_loans_originated",
    }
    per_quarter = df.groupBy("Quarter_Start", "Quarter_End").agg(
        *[F.sum(source).alias(alias) for alias, source in loan_columns.items()]
    )

    # F.sum yields NULL for a group whose inputs are all NULL, and NULL + x is
    # NULL in Spark SQL. Treat a missing category as 0 in the grand total so a
    # single empty category does not wipe out Total_Loans. The per-category
    # columns themselves are left untouched (still NULL when there is no data).
    safe = [F.coalesce(F.col(alias), F.lit(0)) for alias in loan_columns]
    grand_total = safe[0] + safe[1] + safe[2] + safe[3]

    return per_quarter.withColumn("Total_Loans", grand_total).orderBy("Quarter_Start")


In [4]:
import datetime
from pyspark.sql import SparkSession

def save_metrics(df, path, source_path, backup_dir,partition_by=None):
    """Back up the data at ``source_path``, then write ``df`` to ``path`` as Parquet.

    The backup is a copy of ``source_path`` to
    ``{backup_dir}/dfparquet_backup_{YYYYmmddHHMMSS}`` made through the Hadoop
    ``FileUtil.copy`` API (source is not deleted). ``backup_dir`` is created
    if it does not exist. If ``source_path`` does not exist yet (e.g. the very
    first run), the backup step is skipped instead of failing. Afterwards
    ``df`` is written to ``path`` in ``overwrite`` mode.

    Side effect: sets ``spark.sql.files.ignoreMissingFiles`` to ``"true"`` on
    the active Spark session.

    Args:
        df: Spark DataFrame to persist.
        path: Destination of the Parquet write.
        source_path: Existing location to back up before overwriting.
        backup_dir: Directory under which the timestamped backup is created.
        partition_by: Optional column name(s) passed to
            ``DataFrameWriter.partitionBy``; no partitioning when falsy.

    Returns:
        None.
    """
    current_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    backup_path = f"{backup_dir}/dfparquet_backup_{current_timestamp}"

    spark = SparkSession.builder.appName("DataBackup").getOrCreate()
    spark.conf.set("spark.sql.files.ignoreMissingFiles", "true")

    # Use the Hadoop FileSystem API (through the Py4J gateway) for the copy.
    jvm = spark._jvm
    hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()

    # NOTE: the file system is bound to this hard-coded HDFS URI, regardless
    # of the scheme/authority carried by the path arguments.
    hdfs_uri = jvm.java.net.URI("hdfs://localhost:9080")
    fs = jvm.org.apache.hadoop.fs.FileSystem.get(hdfs_uri, hadoop_conf)

    # Build the Hadoop Path objects.
    HadoopPath = jvm.org.apache.hadoop.fs.Path
    source = HadoopPath(source_path)
    destination = HadoopPath(backup_path)
    backup_dir_path = HadoopPath(backup_dir)

    # Create the backup directory if it does not exist yet.
    if not fs.exists(backup_dir_path):
        fs.mkdirs(backup_dir_path)

    # Copy within HDFS (deleteSource=False). Only back up when there is
    # something to back up: on a first run the source does not exist and an
    # unconditional copy would raise before anything is written.
    if fs.exists(source):
        jvm.org.apache.hadoop.fs.FileUtil.copy(
            fs, source, fs, destination, False, hadoop_conf
        )

    writer = df.write.mode("overwrite")
    if partition_by:
        writer = writer.partitionBy(partition_by)
    writer.parquet(path)


In [5]:
from pyspark.sql import SparkSession, functions as F

def main():
    """Compute the loan metrics from the joined dataset and persist them.

    Reads the joined Parquet dataset from HDFS, computes the per-school,
    per-state and per-quarter loan totals, shows the first five per-school
    rows, and saves each metric (with a backup of the previous output) via
    ``save_metrics``. The Spark session is stopped even if a step fails.
    """
    spark = SparkSession.builder.appName("CalculateMetrics").getOrCreate()
    try:
        data_root = "hdfs://localhost:9080/user/anthonycormeaux/data"

        # Read the DataFrame from HDFS.
        hdfs_parquet_path = f"{data_root}/result/joined_data"
        df = spark.read.parquet(hdfs_parquet_path)

        # Compute the metrics, keyed by the name used for their output paths.
        metrics = {
            "total_loans_per_school": calculate_total_loans_per_school(df),
            "total_loans_per_state": calculate_total_loans_per_state(df),
            "loan_evolution": calculate_loan_evolution(df),
        }

        # show() prints the table itself and returns None, so it must not be
        # wrapped in print() (that would emit a stray "None" line).
        metrics["total_loans_per_school"].show(5)

        # Store the results; the previous output at the same path is backed
        # up under metrics_backup/<name> before being overwritten.
        for name, metric_df in metrics.items():
            save_metrics(
                metric_df,
                path=f"{data_root}/{name}",
                source_path=f"{data_root}/{name}",
                backup_dir=f"{data_root}/metrics_backup/{name}",
            )
    finally:
        # Always release the session, including when a step above raises.
        spark.stop()

if __name__ == "__main__":
    main()


24/11/22 18:04:19 WARN Utils: Your hostname, MacBook-Air-de-Anthony.local resolves to a loopback address: 127.0.0.1; using 192.168.1.30 instead (on interface en0)
24/11/22 18:04:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/22 18:04:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+--------------------+----------------+------------------+--------------+----------+-----------+
|          SchoolName|Total_Subsidized|Total_Unsubsidized|Total_Stafford|Total_PLUS|Total_Loans|
+--------------------+----------------+------------------+--------------+----------+-----------+
|   HUMPHREYS COLLEGE|         5145756|           7490527|        188187|     37852|   12862322|
|UNIVERSITY OF ALA...|         8637357|          11716884|        325886|     75620|   20755747|
|UNIVERSITY OF OXF...|           17000|             23500|         39924|     28300|     108724|
|UNIVERSITY OF NEW...|           94610|            113640|         87803|    116011|     412064|
|ANCILLA DOMINI CO...|         1588504|           1883644|        222711|         0|    3694859|
+--------------------+----------------+------------------+--------------+----------+-----------+
only showing top 5 rows

None


24/11/22 18:04:23 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
