# In [None]:
from config import hdfs_config

# Connection settings and local/remote data locations, copied from
# hdfs_config.HDFSConfig into module-level names.
# NOTE(review): none of these names are referenced by the code visible in
# this part of the file; presumably they serve HDFS transfer steps defined
# elsewhere — confirm before removing.
HDFS_HOST = hdfs_config.HDFSConfig.HOST
HDFS_PORT = hdfs_config.HDFSConfig.PORT
LOCAL_RAW_DATA_PATH = hdfs_config.HDFSConfig.LOCAL_RAW_DATA_PATH
HDFS_RAW_DEST_PATH = hdfs_config.HDFSConfig.RAW_DEST_PATH
HDFS_CLEAN_DEST_PATH = hdfs_config.HDFSConfig.CLEAN_DEST_PATH
LOCAL_CLEAN_DATA_PATH = hdfs_config.HDFSConfig.LOCAL_CLEAN_DATA_PATH

# Locations for the combined dataset, also taken from HDFSConfig.
LOCAL_COMBINED_DATA_PATH = hdfs_config.HDFSConfig.LOCAL_COMBINED_DATA_PATH
COMBINED_DEST_PATH = hdfs_config.HDFSConfig.COMBINED_DEST_PATH

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col
import pandas as pd
import re
import os

def initialize_spark():
    """Return a Spark session registered under the app name "Data Cleaning".

    The session is obtained with ``getOrCreate()`` on the configured
    builder.
    """
    session_builder = SparkSession.builder.appName("Data Cleaning")
    return session_builder.getOrCreate()

def extract_quarter_dates(file_path):
    """Extract the quarter's start and end dates from a workbook.

    Reads the 'Quarterly Activity' sheet with ``nrows=5``, joins every cell
    (as text) into one string, and searches it for a parenthesised range of
    two ``dd/dd/dddd`` dates separated by a hyphen.

    Args:
        file_path: Location of the Excel workbook to inspect.

    Returns:
        A ``(quarter_start, quarter_end)`` tuple holding the two date
        strings exactly as matched, or ``(None, None)`` when no range is
        found or when reading the workbook raises.
    """
    try:
        header_rows = pd.read_excel(file_path, sheet_name='Quarterly Activity', nrows=5)
        header_cells = header_rows.astype(str).values.flatten()
        header_text = " ".join(header_cells)
        date_range = re.search(r"\((\d{2}/\d{2}/\d{4})-(\d{2}/\d{2}/\d{4})\)", header_text)
        if not date_range:
            return None, None
        return date_range.group(1), date_range.group(2)
    except Exception as e:
        # Best effort: a workbook without readable dates is still processed.
        print(f"Error extracting quarter dates: {e}")
        return None, None

def load_ffel_data(file_path, spark):
    """Load a workbook's 'Quarterly Activity' sheet as a Spark DataFrame.

    The sheet is read with pandas, skipping the first five rows and keeping
    the 'OPE ID' column as text, then converted with
    ``spark.createDataFrame``.

    Args:
        file_path: Location of the Excel workbook to load.
        spark: Spark session used to build the DataFrame.

    Returns:
        The Spark DataFrame on success, or ``None`` when reading or
        converting the sheet raises (the error is printed).
    """
    try:
        quarterly_activity_pd = pd.read_excel(
            file_path,
            sheet_name='Quarterly Activity',
            skiprows=5,
            dtype={'OPE ID': str},
        )
        return spark.createDataFrame(quarterly_activity_pd)
    except Exception as e:
        print(f"Error loading data from {file_path}: {e}")
        # Return a single None to mirror the single-value success path.
        # A (None, None) tuple is truthy, so callers testing the result
        # with `not` would treat a failed load as a success.
        return None


def add_quarter_dates(quarterly_df, quarter_start, quarter_end):
    """Attach the quarter boundaries to every row as constant columns.

    Args:
        quarterly_df: DataFrame to extend.
        quarter_start: Value written to the "Quarter_Start" column.
        quarter_end: Value written to the "Quarter_End" column.

    Returns:
        A DataFrame with the two literal columns added.
    """
    with_start = quarterly_df.withColumn("Quarter_Start", lit(quarter_start))
    return with_start.withColumn("Quarter_End", lit(quarter_end))

def combine_dataframes(*dfs):
    """Union all given DataFrames, in order, and drop duplicate rows.

    Args:
        *dfs: One or more DataFrames; each is merged into the running
            result with ``union``.

    Returns:
        The result of ``dropDuplicates()`` on the accumulated union.

    Raises:
        IndexError: If called without any DataFrame.
    """
    merged = dfs[0]
    position = 1
    while position < len(dfs):
        merged = merged.union(dfs[position])
        position += 1
    return merged.dropDuplicates()

def rename_columns(df, column_mapping):
    """Rename the columns of ``df`` according to ``column_mapping``.

    Args:
        df: DataFrame whose columns are renamed.
        column_mapping: Dict of ``{current_name: new_name}``. Entries whose
            current name is not a column of ``df`` are ignored.

    Returns:
        The DataFrame after all applicable renames.
    """
    for current_name in column_mapping:
        # Checked against the DataFrame as it stands after earlier renames.
        if current_name not in df.columns:
            continue
        df = df.withColumnRenamed(current_name, column_mapping[current_name])
    return df

def cast_columns_to_double(df, columns_to_cast):
    """Cast the listed columns of ``df`` to the "double" type.

    Args:
        df: DataFrame to convert.
        columns_to_cast: Column names to cast; names that are not columns
            of ``df`` are skipped.

    Returns:
        The DataFrame with every listed, existing column cast to double.
    """
    for target in columns_to_cast:
        if target not in df.columns:
            continue
        as_double = col(target).cast("double")
        df = df.withColumn(target, as_double)
    return df

def save_to_excel(df, output_path):
    """Write a Spark DataFrame to an Excel workbook via pandas.

    A path ending in ".xls" is rewritten to end in ".xlsx" before writing.
    The DataFrame is collected with ``toPandas()`` and written without the
    index, formatting floats with two decimals.

    Args:
        df: Spark DataFrame to export.
        output_path: Destination file path.

    Returns:
        None. Any error raised while converting or writing is printed
        rather than propagated.
    """
    try:
        if output_path.endswith(".xls"):
            # Only touch the trailing extension. str.replace would also
            # rewrite every earlier ".xls" in the path (e.g. a directory
            # named "archive.xls.d"), pointing the output somewhere else.
            output_path = output_path + "x"
            print(f"Converted output file to .xlsx format: {output_path}")

        df_pd = df.toPandas()
        df_pd.to_excel(output_path, index=False, float_format="%.2f")
        print(f"Saved cleaned data to {output_path}")
    except Exception as e:
        # Best effort: report the failure and let the caller continue.
        print(f"Error saving to Excel: {e}")

def process_ffel_data(file_path, output_path,spark):
    """Clean one FFEL workbook and write the result to ``output_path``.

    Steps: read the quarter dates from the workbook, load its 'Quarterly
    Activity' sheet, add the dates as columns, drop duplicate rows, rename
    the pandas-generated column names to descriptive ``ffel_*`` names, cast
    the amount columns to double, and save the result as Excel.

    Args:
        file_path: Path of the raw workbook.
        output_path: Path the cleaned workbook is written to.
        spark: Spark session used to build the DataFrame.

    Returns:
        None. When the sheet cannot be loaded the file is skipped with a
        printed message.
    """
    quarter_start, quarter_end = extract_quarter_dates(file_path)

    quarterly_activity_spark = load_ffel_data(file_path, spark)
    # A failed load is signalled by None or by a (None, None) tuple. The
    # tuple is truthy, so test for both explicitly instead of using `not`.
    if quarterly_activity_spark is None or isinstance(quarterly_activity_spark, tuple):
        print(f"Skipping {file_path} due to load errors.")
        return

    quarterly_activity_spark = add_quarter_dates(
        quarterly_activity_spark, quarter_start, quarter_end
    )

    # With a single input this only removes duplicate rows.
    combined_df = combine_dataframes(quarterly_activity_spark)

    # Keys carry ".1"/".2"/".3" suffixes for repeated header names; each
    # repetition is mapped to a different loan programme below.
    column_mapping = {
        'School': 'SchoolName',
        "# of Loans Originated": "ffel_subsidized_number_of_loans_originated",
        "$ of Loans Originated": "ffel_subsidized_amount_of_loans_originated",
        "Recipients": "ffel_subsidized_recipients",
        "# of Loans Originated.1": "ffel_unsubsidized_number_of_loans_originated",
        "$ of Loans Originated.1": "ffel_unsubsidized_amount_of_loans_originated",
        "Recipients.1": "ffel_unsubsidized_recipients",
        "# of Loans Originated.2": "ffel_stafford_number_of_loans_originated",
        "$ of Loans Originated.2": "ffel_stafford_amount_of_loans_originated",
        "Recipients.2": "ffel_stafford_recipients",
        "# of Loans Originated.3": "ffel_plus_number_of_loans_originated",
        "$ of Loans Originated.3": "ffel_plus_amount_of_loans_originated",
        "Recipients.3": "ffel_plus_recipients",
        "# of Disbursements": "ffel_subsidized_number_of_disbursements",
        "$ of Disbursements": "ffel_subsidized_amount_of_disbursements",
        "# of Disbursements.1": "ffel_unsubsidized_number_of_disbursements",
        "$ of Disbursements.1": "ffel_unsubsidized_amount_of_disbursements",
        "# of Disbursements.2": "ffel_stafford_number_of_disbursements",
        "$ of Disbursements.2": "ffel_stafford_amount_of_disbursements",
        "# of Disbursements.3": "ffel_plus_number_of_disbursements",
        "$ of Disbursements.3": "ffel_plus_amount_of_disbursements",
    }

    combined_df = rename_columns(combined_df, column_mapping)

    # Only the dollar-amount columns are cast; counts are left as loaded.
    columns_to_cast = [
        "ffel_subsidized_amount_of_loans_originated", "ffel_unsubsidized_amount_of_loans_originated",
        "ffel_stafford_amount_of_loans_originated", "ffel_plus_amount_of_loans_originated",
        "ffel_subsidized_amount_of_disbursements", "ffel_unsubsidized_amount_of_disbursements",
        "ffel_stafford_amount_of_disbursements", "ffel_plus_amount_of_disbursements",
    ]

    combined_df = cast_columns_to_double(combined_df, columns_to_cast)

    save_to_excel(combined_df, output_path)
    
    

def process_all_files(raw_dir, cleaned_dir):
    """Clean every Excel workbook found directly inside ``raw_dir``.

    Each ``.xls``/``.xlsx`` file is processed with ``process_ffel_data`` and
    written to ``cleaned_dir`` as ``cleaned_<original name>``. A failure on
    one file is printed and does not stop the remaining files. The Spark
    session is always stopped at the end.

    Args:
        raw_dir: Directory containing the raw workbooks.
        cleaned_dir: Directory for the cleaned output; created if missing.
    """
    spark = initialize_spark()

    try:
        # exist_ok avoids the check-then-create race of testing for the
        # directory first and creating it afterwards.
        os.makedirs(cleaned_dir, exist_ok=True)

        raw_files = [f for f in os.listdir(raw_dir) if f.endswith((".xls", ".xlsx"))]

        for file_name in raw_files:
            raw_file_path = os.path.join(raw_dir, file_name)
            cleaned_file_path = os.path.join(cleaned_dir, f"cleaned_{file_name}")

            try:
                process_ffel_data(raw_file_path, cleaned_file_path, spark)
            except Exception as e:
                # Keep going: one bad workbook must not abort the batch.
                print(f"Error processing {file_name}: {e}")
    finally:
        spark.stop()



def load_fed_school_code_data(file_path, spark):
    """Read an Excel workbook with pandas and return it as a Spark DataFrame.

    Args:
        file_path: Location of the workbook; read with pandas defaults.
        spark: Spark session used for the conversion.

    Returns:
        The Spark DataFrame built from the pandas frame.
    """
    return spark.createDataFrame(pd.read_excel(file_path))


def process_fed_school_code_data(file_path, output_path):
    """Load a school-code workbook through Spark and re-save it as Excel.

    Args:
        file_path: Location of the source workbook.
        output_path: Destination passed to ``save_to_excel``.

    The Spark session opened here is stopped even if loading fails.
    """
    session = initialize_spark()
    try:
        school_codes = load_fed_school_code_data(file_path, session)
        save_to_excel(school_codes, output_path)
    finally:
        session.stop()


# Script entry point: clean every workbook in the raw directory and write
# the results to the cleaned directory. Runs as soon as this code executes.
# NOTE(review): both paths are absolute and specific to one machine.
# LOCAL_RAW_DATA_PATH / LOCAL_CLEAN_DATA_PATH are defined above from
# HDFSConfig and look like the intended values — confirm and switch over.
file_path = '/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/raw'
output_path ='/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/cleaned'

process_all_files(file_path, output_path)