In [1]:
from config import hdfs_config

In [2]:
HDFS_HOST = hdfs_config.HDFSConfig.HOST
HDFS_PORT = hdfs_config.HDFSConfig.PORT
LOCAL_RAW_DATA_PATH = hdfs_config.HDFSConfig.LOCAL_RAW_DATA_PATH
HDFS_RAW_DEST_PATH = hdfs_config.HDFSConfig.RAW_DEST_PATH
HDFS_CLEAN_DEST_PATH = hdfs_config.HDFSConfig.CLEAN_DEST_PATH
LOCAL_CLEAN_DATA_PATH = hdfs_config.HDFSConfig.LOCAL_CLEAN_DATA_PATH

LOCAL_COMBINED_DATA_PATH = hdfs_config.HDFSConfig.LOCAL_COMBINED_DATA_PATH
COMBINED_DEST_PATH = hdfs_config.HDFSConfig.COMBINED_DEST_PATH



In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col
import pandas as pd
import re
import os

# Initialize Spark session
def initialize_spark():
    return SparkSession.builder.appName("Data Cleaning").getOrCreate()

# Extract quarter start and end dates from Quarterly Activity sheet (specific to FFEL Dashboard files)
def extract_quarter_dates(file_path):
    try:
        metadata_df = pd.read_excel(file_path, sheet_name='Quarterly Activity', nrows=5)
        metadata_text = " ".join(metadata_df.astype(str).values.flatten())
        quarter_dates_match = re.search(r"\((\d{2}/\d{2}/\d{4})-(\d{2}/\d{2}/\d{4})\)", metadata_text)
        quarter_start = quarter_dates_match.group(1) if quarter_dates_match else None
        quarter_end = quarter_dates_match.group(2) if quarter_dates_match else None
        return quarter_start, quarter_end
    except Exception as e:
        print(f"Error extracting quarter dates: {e}")
        return None, None

# Load FFEL Dashboard data from Excel sheets and convert to Spark DataFrames
def load_ffel_data(file_path, spark):
    try:
        quarterly_activity_pd = pd.read_excel(file_path, sheet_name='Quarterly Activity', skiprows=5, dtype={'OPE ID': str})
        #award_year_summary_pd = pd.read_excel(file_path, sheet_name='Award Year Summary', skiprows=5, dtype={'OPE ID': str})
        quarterly_activity_spark = spark.createDataFrame(quarterly_activity_pd)
        #award_year_summary_spark = spark.createDataFrame(award_year_summary_pd)
        return quarterly_activity_spark #, award_year_summary_spark
    except Exception as e:
        print(f"Error loading data from {file_path}: {e}")
        return None, None


# Add quarter dates for FFEL data
def add_quarter_dates(quarterly_df, quarter_start, quarter_end):
    #add_quarter_dates(quarterly_df, award_year_df, quarter_start, quarter_end):
    quarterly_df = quarterly_df.withColumn("Quarter_Start", lit(quarter_start)).withColumn("Quarter_End", lit(quarter_end))
    #award_year_df = award_year_df.withColumn("Quarter_Start", lit(None)).withColumn("Quarter_End", lit(None))
    return quarterly_df#, award_year_df

# Combine DataFrames and remove duplicates
def combine_dataframes(*dfs):
    combined_df = dfs[0]
    for df in dfs[1:]:
        combined_df = combined_df.union(df)
    return combined_df.dropDuplicates()

# Rename columns to a standardized format
def rename_columns(df, column_mapping):
    for old_name, new_name in column_mapping.items():
        if old_name in df.columns:
            df = df.withColumnRenamed(old_name, new_name)
    return df

# Convert specified columns to double type
def cast_columns_to_double(df, columns_to_cast):
    for col_name in columns_to_cast:
        if col_name in df.columns:
            df = df.withColumn(col_name, col(col_name).cast("double"))
    return df

# Save the cleaned DataFrame to Excel
def save_to_excel(df, output_path):
    try:
        # Ensure output is saved in `.xlsx` format for compatibility
        if output_path.endswith(".xls"):
            output_path = output_path.replace(".xls", ".xlsx")
            print(f"Converted output file to .xlsx format: {output_path}")

        df_pd = df.toPandas()
        df_pd.to_excel(output_path, index=False, float_format="%.2f")
        print(f"Saved cleaned data to {output_path}")
    except Exception as e:
        print(f"Error saving to Excel: {e}")

# Main function to process FFEL data files
def process_ffel_data(file_path, output_path,spark):
    
    
    # Extract quarter dates
    quarter_start, quarter_end = extract_quarter_dates(file_path)
    
    # Load FFEL data
    quarterly_activity_spark = load_ffel_data(file_path, spark)
    #quarterly_activity_spark, award_year_summary_spark = load_ffel_data(file_path, spark)
    if not quarterly_activity_spark :
    #if not quarterly_activity_spark or not award_year_summary_spark:
        print(f"Skipping {file_path} due to load errors.")
        return
    
    # Add quarter dates
    quarterly_activity_spark = add_quarter_dates(
        quarterly_activity_spark, quarter_start, quarter_end
    )
    # quarterly_activity_spark, award_year_summary_spark = add_quarter_dates(
    #     quarterly_activity_spark, award_year_summary_spark, quarter_start, quarter_end
    # )
    
    # Combine data
    combined_df = combine_dataframes(quarterly_activity_spark)
    
    # Column renaming mappings
    column_mapping = {
        'School': 'SchoolName',
        "# of Loans Originated": "ffel_subsidized_number_of_loans_originated",
        "$ of Loans Originated": "ffel_subsidized_amount_of_loans_originated",
        "Recipients": "ffel_subsidized_recipients",
        "# of Loans Originated.1": "ffel_unsubsidized_number_of_loans_originated",
        "$ of Loans Originated.1": "ffel_unsubsidized_amount_of_loans_originated",
        "Recipients.1": "ffel_unsubsidized_recipients",
        "# of Loans Originated.2": "ffel_stafford_number_of_loans_originated",
        "$ of Loans Originated.2": "ffel_stafford_amount_of_loans_originated",
        "Recipients.2": "ffel_stafford_recipients",
        "# of Loans Originated.3": "ffel_plus_number_of_loans_originated",
        "$ of Loans Originated.3": "ffel_plus_amount_of_loans_originated",
        "Recipients.3": "ffel_plus_recipients",
        "# of Disbursements": "ffel_subsidized_number_of_disbursements",
        "$ of Disbursements": "ffel_subsidized_amount_of_disbursements",
        "# of Disbursements.1": "ffel_unsubsidized_number_of_disbursements",
        "$ of Disbursements.1": "ffel_unsubsidized_amount_of_disbursements",
        "# of Disbursements.2": "ffel_stafford_number_of_disbursements",
        "$ of Disbursements.2": "ffel_stafford_amount_of_disbursements",
        "# of Disbursements.3": "ffel_plus_number_of_disbursements",
        "$ of Disbursements.3": "ffel_plus_amount_of_disbursements"
    }
    
    # Rename columns
    combined_df = rename_columns(combined_df, column_mapping)
    
    # Columns to cast to double
    columns_to_cast = [
        "ffel_subsidized_amount_of_loans_originated", "ffel_unsubsidized_amount_of_loans_originated",
        "ffel_stafford_amount_of_loans_originated", "ffel_plus_amount_of_loans_originated",
        "ffel_subsidized_amount_of_disbursements", "ffel_unsubsidized_amount_of_disbursements",
        "ffel_stafford_amount_of_disbursements", "ffel_plus_amount_of_disbursements"
    ]
    
    # Cast columns to double
    combined_df = cast_columns_to_double(combined_df, columns_to_cast)
    
    # Save to Excel
    save_to_excel(combined_df, output_path)
    
    



# Process all files in raw directory
def process_all_files(raw_dir, cleaned_dir):
    spark = initialize_spark()
    
    try:
        if not os.path.exists(cleaned_dir):
            os.makedirs(cleaned_dir)
        
        raw_files = [f for f in os.listdir(raw_dir) if f.endswith(".xls") or f.endswith(".xlsx")]
        
        for file_name in raw_files:
            raw_file_path = os.path.join(raw_dir, file_name)
            cleaned_file_path = os.path.join(cleaned_dir, f"cleaned_{file_name}")
            
            try:
                process_ffel_data(raw_file_path, cleaned_file_path,spark)
            except Exception as e:
                print(f"Error processing {file_name}: {e}")
    finally:
        # Stop Spark session
        spark.stop()




# Load Fed School Code data and convert to Spark DataFrame
def load_fed_school_code_data(file_path, spark):
    # Customize based on the structure of the Fed School Code file
    fed_school_code_pd = pd.read_excel(file_path)  # Adjust sheet name if different
    fed_school_code_spark = spark.createDataFrame(fed_school_code_pd)
    
    return fed_school_code_spark


# Main function to process Fed School Code data files
def process_fed_school_code_data(file_path, output_path):
    spark = initialize_spark()
    try:
        # Load Fed School Code data
        fed_school_code_spark = load_fed_school_code_data(file_path, spark)
        
        # Save to Excel
        save_to_excel(fed_school_code_spark, output_path)
    
    finally:
        # Stop Spark session
        spark.stop()


file_path = '/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/raw'
output_path ='/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/cleaned'
fedschool_path ='/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/1617fedschoolcodelist.xls'
fedschooloutput_path ='/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/cleaned_FFEL_data.xlsx'
# process_ffel_data(file_path,output_path)

process_all_files(file_path, output_path)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



24/11/22 18:00:51 WARN Utils: Your hostname, MacBook-Air-de-Anthony.local resolves to a loopback address: 127.0.0.1; using 192.168.1.30 instead (on interface en0)
24/11/22 18:00:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/22 18:00:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Error extracting quarter dates: Excel file format cannot be determined, you must specify an engine manually.
Error loading data from /Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/raw/~$FL_Dashboard_AY2009_2010_Q1.xlsx: Excel file format cannot be determined, you must specify an engine manually.
Error processing ~$FL_Dashboard_AY2009_2010_Q1.xlsx: 'tuple' object has no attribute 'withColumn'


24/11/22 18:00:53 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Saved cleaned data to /Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/cleaned/cleaned_FL_Dashboard_AY2009_2010_Q1.xlsx
Saved cleaned data to /Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/cleaned/cleaned_FL_Dashboard_AY2009_2010_Q2.xlsx
Converted output file to .xlsx format: /Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/cleaned/cleaned_FL_Dashboard_AY2009_2010_Q3.xlsx
Saved cleaned data to /Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/cleaned/cleaned_FL_Dashboard_AY2009_2010_Q3.xlsx
Converted output file to .xlsx format: /Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/cleaned/cleaned_FL_Dashboard_AY2009_2010_Q4.xlsx
Saved cleaned data to /Users/anthonycormeaux/Documents/Projet

In [4]:
import os
import pyhdfs
import logging
from config import hdfs_config


logging.basicConfig(level=logging.INFO)

# Retrieve configuration
HDFS_HOST = hdfs_config.HDFSConfig.HOST
HDFS_PORT = hdfs_config.HDFSConfig.PORT
LOCAL_RAW_DATA_PATH = hdfs_config.HDFSConfig.LOCAL_RAW_DATA_PATH
HDFS_RAW_DEST_PATH = hdfs_config.HDFSConfig.RAW_DEST_PATH
LOCAL_CLEAN_DATA_PATH = hdfs_config.HDFSConfig.LOCAL_CLEAN_DATA_PATH
HDFS_CLEAN_DEST_PATH = hdfs_config.HDFSConfig.CLEAN_DEST_PATH

def upload_files_to_hdfs(local_path, hdfs_path, hdfs_client):
    """Uploads files from a local directory to HDFS with detailed error handling and overwriting."""
    try:
        # Ensure the HDFS directory exists
        if not hdfs_client.exists(hdfs_path):
            hdfs_client.mkdirs(hdfs_path)
            logging.info(f"Created HDFS directory: {hdfs_path}")
        
        # Check if the local directory exists
        if not os.path.exists(local_path):
            logging.error(f"Local directory {local_path} does not exist.")
            return

        # Iterate through files in the local directory
        for file_name in os.listdir(local_path):
            local_file_path = os.path.join(local_path, file_name)

            # Only process files
            if os.path.isfile(local_file_path):
                hdfs_file_path = f"{hdfs_path}/{file_name}"

                # Check if the file already exists in HDFS and remove it if it does
                if hdfs_client.exists(hdfs_file_path):
                    logging.info(f"File {hdfs_file_path} already exists in HDFS. Deleting before upload.")
                    hdfs_client.delete(hdfs_file_path)

                # Attempt to upload file to HDFS
                try:
                    with open(local_file_path, 'rb') as file_data:
                        hdfs_client.create(hdfs_file_path, file_data)
                    logging.info(f"Uploaded {file_name} to HDFS at {hdfs_file_path}")

                    # Verify upload success
                    if hdfs_client.exists(hdfs_file_path):
                        logging.info(f"Successfully uploaded: {file_name}")
                    else:
                        logging.error(f"Upload verification failed: {file_name}")
                except Exception as e:
                    logging.error(f"Failed to upload {file_name} to HDFS due to {e}")

            else:
                logging.warning(f"{local_file_path} is not a file. Skipping.")

    except Exception as e:
        logging.error(f"Unexpected error during file upload: {e}")

In [5]:
from datetime import datetime


def combine_excel_files(folder_path, output_directory, output_filename="combined_data.xlsx"):
    
    combined_df = pd.DataFrame()
    
    
    # Lister tous les fichiers .xls et .xlsx dans le dossier
    excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xls') or f.endswith('.xlsx')]
    
    if not excel_files:
        print(f"Aucun fichier Excel trouvé dans le dossier : {folder_path}")
        return  # Sortir de la fonction
    
    current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    for file_name in excel_files:
        file_path = os.path.join(folder_path, file_name)
        try:
            # Lire le fichier Excel
            df = pd.read_excel(file_path, dtype={'OPE ID': str})
            df['Timestamp'] = current_timestamp
            # Combiner avec le DataFrame existant
            combined_df = pd.concat([combined_df, df], ignore_index=True)
            print(f"Fichier lu et combiné : {file_name}")
        except Exception as e:
            print(f"Échec de la lecture du fichier {file_name} : {e}")
    
    # Supprimer les lignes en doublon
    combined_df.drop_duplicates(inplace=True)
    
    # Créer le répertoire de sortie s'il n'existe pas
    os.makedirs(output_directory, exist_ok=True)
    output_file_path = os.path.join(output_directory, output_filename)
    
    try :
        combined_df.to_excel(output_file_path, index=False)
    except Exception as e:
        print(f"Échec de l'enregistrement des données combinées dans le fichier {output_file_path} : {e}")

In [6]:
import os
from datetime import datetime
import pandas as pd

def combine_excel_files_parquet(folder_path, output_directory, output_filename="combined_data.parquet"):
    
    combined_df = pd.DataFrame()
    
    # Lister tous les fichiers .xls et .xlsx dans le dossier
    excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xls') or f.endswith('.xlsx')]
    
    if not excel_files:
        print(f"Aucun fichier Excel trouvé dans le dossier : {folder_path}")
        return  # Sortir de la fonction
    
    current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    for file_name in excel_files:
        file_path = os.path.join(folder_path, file_name)
        try:
            # Lire le fichier Excel
            df = pd.read_excel(file_path, dtype={'OPE ID': str})
            df['OPE ID'] = df['OPE ID'].str[:-2]
            df['Timestamp'] = current_timestamp
            # Combiner avec le DataFrame existant
            combined_df = pd.concat([combined_df, df], ignore_index=True)
            print(f"Fichier lu et combiné : {file_name}")
        except Exception as e:
            print(f"Échec de la lecture du fichier {file_name} : {e}")
    
    # Supprimer les lignes en doublon
    combined_df.drop_duplicates(inplace=True)
    
    # Créer le répertoire de sortie s'il n'existe pas
    os.makedirs(output_directory, exist_ok=True)
    output_file_path = os.path.join(output_directory, output_filename)
    
    # Enregistrer le DataFrame combiné dans le chemin de sortie au format Parquet
    try:
        combined_df.to_parquet(output_file_path, index=False)
        print(f"Données combinées enregistrées dans le fichier Parquet : {output_file_path}")
    except Exception as e:
        print(f"Échec de l'enregistrement des données combinées dans le fichier {output_file_path} : {e}")


In [8]:
folder_path = "/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/cleaned"
output_directory = "/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/combined"
output_directory_parquet = "/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/parquet"
combine_excel_files(folder_path, output_directory)
combine_excel_files_parquet(folder_path, output_directory_parquet)

Fichier lu et combiné : cleaned_FL_Dashboard_AY2009_2010_Q4.xlsx
Fichier lu et combiné : cleaned_FL_Dashboard_AY2009_2010_Q2.xlsx
Fichier lu et combiné : cleaned_FL_Dashboard_AY2009_2010_Q3.xlsx
Fichier lu et combiné : cleaned_FL_Dashboard_AY2009_2010_Q1.xlsx
Fichier lu et combiné : cleaned_FL_Dashboard_AY2009_2010_Q4.xlsx
Fichier lu et combiné : cleaned_FL_Dashboard_AY2009_2010_Q2.xlsx
Fichier lu et combiné : cleaned_FL_Dashboard_AY2009_2010_Q3.xlsx
Fichier lu et combiné : cleaned_FL_Dashboard_AY2009_2010_Q1.xlsx



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



Données combinées enregistrées dans le fichier Parquet : /Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/parquet/combined_data.parquet


In [9]:
local_parquet = "/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/parquet"
hdfs_path = "/user/anthonycormeaux/data/dfparquet"

try:
    # Initialize HDFS client
    hdfs_client = pyhdfs.HdfsClient(hosts=f"{HDFS_HOST}:{HDFS_PORT}", user_name='anthonycormeaux')
    logging.info("HDFS client initialized")
    
    # Start uploading files
    # upload_files_to_hdfs(LOCAL_RAW_DATA_PATH, HDFS_RAW_DEST_PATH, hdfs_client)
    # upload_files_to_hdfs(LOCAL_CLEAN_DATA_PATH, HDFS_CLEAN_DEST_PATH, hdfs_client)
    # upload_files_to_hdfs(LOCAL_COMBINED_DATA_PATH, COMBINED_DEST_PATH, hdfs_client)
    upload_files_to_hdfs(local_parquet, hdfs_path, hdfs_client)
    
except pyhdfs.HdfsException as he:
    logging.error(f"HDFS error: Failed to initialize HDFS client or perform operations due to {he}")
except Exception as e:
    logging.error(f"Unexpected error: {e}")


INFO:root:HDFS client initialized
INFO:pyhdfs:GETFILESTATUS /user/anthonycormeaux/data/dfparquet user.name=anthonycormeaux localhost:9870
INFO:pyhdfs:GETFILESTATUS /user/anthonycormeaux/data/dfparquet/combined_data.parquet user.name=anthonycormeaux localhost:9870
INFO:root:File /user/anthonycormeaux/data/dfparquet/combined_data.parquet already exists in HDFS. Deleting before upload.
INFO:pyhdfs:DELETE /user/anthonycormeaux/data/dfparquet/combined_data.parquet user.name=anthonycormeaux localhost:9870
INFO:pyhdfs:CREATE /user/anthonycormeaux/data/dfparquet/combined_data.parquet user.name=anthonycormeaux localhost:9870
INFO:root:Uploaded combined_data.parquet to HDFS at /user/anthonycormeaux/data/dfparquet/combined_data.parquet
INFO:pyhdfs:GETFILESTATUS /user/anthonycormeaux/data/dfparquet/combined_data.parquet user.name=anthonycormeaux localhost:9870
INFO:root:Successfully uploaded: combined_data.parquet


In [None]:
df = pd.read_excel(r"/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/combined/combined_data.xlsx")

df.columns

In [None]:
from hdfs import InsecureClient
import pandas as pd
from io import BytesIO

def read_files_from_hdfs(hdfs_directory_path, hdfs_host, hdfs_port, nrows=None):
    """
    Reads each file in the specified HDFS directory into a separate DataFrame.
    
    Parameters:
        hdfs_directory_path (str): HDFS directory path containing files to read.
        hdfs_host (str): HDFS host address.
        hdfs_port (int): HDFS port number.
        nrows (int, optional): Number of rows to read from each file (for memory efficiency).
    
    Returns:
        dict: A dictionary with file names as keys and DataFrames as values.
    """
    client = InsecureClient(f'http://{hdfs_host}:{hdfs_port}')
    data_frames = {}
    
    try:
        # List all files in the specified HDFS directory
        files = client.list(hdfs_directory_path)
        
        for file_name in files:
            file_path = f"{hdfs_directory_path}/{file_name}"
            
            # Read file content from HDFS
            with client.read(file_path) as f:
                file_content = f.read()
            
            # Use BytesIO for compatibility with pandas
            file_data = BytesIO(file_content)
            
            # Load the data into a DataFrame and store it in the dictionary
            data_frames[file_name] = pd.read_excel(file_data, nrows=nrows)
            print(f"Loaded file '{file_name}' into a DataFrame.")
        
        return data_frames
    
    except Exception as e:
        print(f"Error reading files from HDFS: {e}")
        return None

# Example usage
data_frames = read_files_from_hdfs(HDFS_CLEAN_DEST_PATH, HDFS_HOST, HDFS_PORT, nrows=None)


In [None]:
data_frames["cleaned_FL_Dashboard_AY2009_2010_Q3.xlsx"].columns

In [None]:
df = pd.read_excel(r"/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/1617fedschoolcodelist.xls")

df.columns

In [None]:
from kafka import KafkaProducer
import json
import time
from datetime import datetime
import pandas as pd

df = pd.read_excel(r"/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/1617fedschoolcodelist.xls")

def json_serializer(data):
    return json.dumps(data).encode('utf-8')

topic_name = 'excel_data'

nb = 1

if __name__ == '__main__':
    producer = KafkaProducer(
        bootstrap_servers=['localhost:9092'],
        value_serializer=json_serializer
    )

    for index, row in df.iterrows():
        message = {
            "SchoolCode": row["SchoolCode"], 
            "SchoolName": row["SchoolName"],
            "Address": row["Address"],
            "City": row["City"],
            "StateCode" : row["StateCode"],
            "ZipCode" : row["ZipCode"],
            "Country" : row["Country"],
            "timestamp": datetime.now().timestamp()
        }
        producer.send(topic_name, value=message)
        print(f"Message envoyé pour {row['SchoolCode']}: {message} {nb}")
        nb += 1

    producer.flush()
    producer.close()

In [None]:
#envoie de données de crédit vers kafka
from kafka import KafkaProducer
import json
import time
from datetime import datetime
import threading
import pandas as pd
import random

df = pd.read_excel("/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/combined/combined_data.xlsx")

df = df.drop_duplicates(subset='SchoolName')

colonnes = ["SchoolName", "State","Zip Code", "School Type"]

df = df[colonnes]

def json_serializer(data):
    return json.dumps(data).encode('utf-8')
    

topic_name = 'excel_data'

if __name__ == '__main__':
    producer = KafkaProducer(
        bootstrap_servers=['localhost:9092'],
        value_serializer=json_serializer
    )

    for index, row in df.iterrows():
            message = {
                    "SchoolName": row["SchoolName"], 
                    "State": row["State"],
                    "Zip Code": row["Zip Code"],
                    "School Type": row["School Type"],
                    "timestamp": datetime.now().timestamp(),
                    "ffel_subsidized_recipients" : random.randint(100,5000),
                    "ffel_subsidized_number_of_loans_originated" : random.randint(500,8000),
                    "ffel_subsidized_amount_of_loans_originated" : random.randint(100000, 999999999),
                    "ffel_subsidized_number_of_disbursements" : random.randint(1000, 16000),

                    "ffel_unsubsidized_recipients" : random.randint(100,5000),
                    "ffel_unsubsidized_number_of_loans_originated" : random.randint(500,8000),
                    "ffel_unsubsidized_amount_of_loans_originated" : random.randint(100000, 999999999),
                    "ffel_unsubsidizednumber_of_disbursements" : random.randint(1000, 16000),

                    "ffel_stafford_recipients" : random.randint(100,5000),
                    "ffel_stafford_number_of_loans_originated" : random.randint(500,8000),
                    "ffel_stafford_amount_of_loans_originated" : random.randint(100000, 999999999),
                    "ffel_stafford_of_disbursements" : random.randint(1000, 16000),

                    "fffel_plus_recipients" : random.randint(100,5000),
                    "fffel_plus_number_of_loans_originated" : random.randint(500,8000),
                    "fffel_plus_amount_of_loans_originated" : random.randint(100000, 999999999),
                    "fffel_plus_of_disbursements" : random.randint(1000, 16000),

                    "Quarter_Start" : "04/01/2010",
                    "Quarter_End" : "06/30/2010"
                }
            producer.send(topic_name, value=message)
            print(f"Message envoyé pour {row}")


    producer.flush()
    producer.close()

In [None]:
from kafka import KafkaConsumer
import json

# Configurez le consommateur pour se connecter à Kafka et consommer les messages en JSON
def save_to_hdfs(dataframe):
    # Placeholder: replace this with actual code to save data to HDFS
    print("Data saved to HDFS:", dataframe)

# Initialize Kafka consumer
consumer = KafkaConsumer(
    'excel_data',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',
    enable_auto_commit=True,
    group_id='my-group',
    value_deserializer=lambda x: json.loads(x.decode('utf-8')) if x else None
)



# Load initial HDFS data
hdfs_df = data_frames["cleaned_FL_Dashboard_AY2009_2010_Q3.xlsx"]

print("Waiting for messages...")
batch_data = []
batch_size = 10000

for message in consumer:
    if message.value:
        try:
            data = message.value
            batch_data.append(data)
            
            # Display incoming batch data
            print(f"Batch data received: {batch_data[:5]}")  # Display first 5 records for brevity

            # Process in batches
            if len(batch_data) >= batch_size:
                # Create a DataFrame for the batch
                new_df = pd.DataFrame(batch_data)

                # Check if SchoolCode exists, else use SchoolName as the key
                if 'SchoolCode' in hdfs_df.columns:
                    # Merge on SchoolCode if it exists in both DataFrames
                    merged_df = pd.merge(hdfs_df, new_df, on="SchoolCode", how="outer", suffixes=('', '_new'))
                else:
                    # If SchoolCode is missing, use SchoolName as the key
                    merged_df = pd.merge(hdfs_df, new_df, on="SchoolName", how="outer", suffixes=('', '_new'))

                # Update missing values with data from the new batch
                for col in new_df.columns:
                    if col not in ['SchoolCode', 'SchoolName']:  # Skip the identifier column(s)
                        col_new = f"{col}_new"
                        # Check if the new column exists in the merged DataFrame; add it if it doesn't
                        if col_new not in merged_df.columns:
                            merged_df[col_new] = None  # Add the missing column with None values

                        # Perform the update where applicable
                        merged_df[col] = merged_df.apply(
                            lambda row: row[col_new] if pd.isnull(row[col]) else row[col], axis=1
                        )

                # Drop the temporary columns used for merging
                merged_df = merged_df.drop([col + '_new' for col in new_df.columns if col not in ['SchoolCode', 'SchoolName']], axis=1)

                # Save updated data back to HDFS in bulk
                save_to_hdfs(merged_df)

                # Update the HDFS DataFrame reference
                hdfs_df = merged_df

                # Clear batch data and commit offsets
                batch_data = []
                consumer.commit()  # Commit offsets manually after processing the batch

                # Pause before the next batch
                # sleep(10)

        except json.JSONDecodeError as e:
            print(f"JSON decoding error: {e}")
            print(f"Invalid message: {message.value}")
    else:
        print("Empty or null message ignored.")
