In [None]:
import os
import pyhdfs
import logging
from config import hdfs_config


# Configure the root logger so INFO-level messages and above are emitted.
logging.basicConfig(level=logging.INFO)

# One handle on the project's settings class; every constant below is read from it.
_HDFS_SETTINGS = hdfs_config.HDFSConfig

# Host and port combined into the "host:port" address given to the pyhdfs client.
HDFS_HOST = _HDFS_SETTINGS.HOST
HDFS_PORT = _HDFS_SETTINGS.PORT

# Local directories and HDFS destinations for each data stage.
# NOTE(review): stage meaning (raw / clean / combined) is inferred from the
# attribute names only — confirm against config.hdfs_config.
LOCAL_RAW_DATA_PATH = _HDFS_SETTINGS.LOCAL_RAW_DATA_PATH
HDFS_RAW_DEST_PATH = _HDFS_SETTINGS.RAW_DEST_PATH
LOCAL_CLEAN_DATA_PATH = _HDFS_SETTINGS.LOCAL_CLEAN_DATA_PATH
HDFS_CLEAN_DEST_PATH = _HDFS_SETTINGS.CLEAN_DEST_PATH

LOCAL_COMBINED_DATA_PATH = _HDFS_SETTINGS.LOCAL_COMBINED_DATA_PATH
COMBINED_DEST_PATH = _HDFS_SETTINGS.COMBINED_DEST_PATH

def upload_files_to_hdfs(local_path, hdfs_path, hdfs_client):
    """Upload every regular file found directly in ``local_path`` to ``hdfs_path``.

    The local path is validated before HDFS is touched, so an invalid
    ``local_path`` never causes a directory to be created remotely.
    Sub-directories and other non-file entries are skipped with a warning
    (there is no recursion). A remote file with the same name is deleted
    before the new copy is written, and the upload is verified with an
    ``exists`` check afterwards.

    Errors are logged, never raised: a failure while replacing or uploading
    one file is logged and the remaining files are still attempted.

    Args:
        local_path: Local directory whose files are uploaded.
        hdfs_path: Destination directory on HDFS; created if it does not
            exist. Trailing slashes are tolerated.
        hdfs_client: Object providing ``exists(path)``, ``mkdirs(path)``,
            ``delete(path)`` and ``create(path, data)`` (for example a
            ``pyhdfs.HdfsClient``).

    Returns:
        None.
    """
    try:
        # Validate the local side first: nothing should be created on HDFS
        # when there is nothing valid to upload.
        if not os.path.exists(local_path):
            logging.error(f"Local directory {local_path} does not exist.")
            return
        if not os.path.isdir(local_path):
            logging.error(f"Local path {local_path} is not a directory.")
            return

        if not hdfs_client.exists(hdfs_path):
            hdfs_client.mkdirs(hdfs_path)
            logging.info(f"Created HDFS directory: {hdfs_path}")

        # Drop trailing slashes so the joined remote paths never contain "//".
        hdfs_dir = str(hdfs_path).rstrip("/")

        # Sorted so the upload (and log) order is deterministic.
        for file_name in sorted(os.listdir(local_path)):
            local_file_path = os.path.join(local_path, file_name)

            if not os.path.isfile(local_file_path):
                logging.warning(f"{local_file_path} is not a file. Skipping.")
                continue

            hdfs_file_path = f"{hdfs_dir}/{file_name}"

            # The delete is inside the per-file handler so that a failure to
            # replace one file does not abort the rest of the batch.
            try:
                if hdfs_client.exists(hdfs_file_path):
                    logging.info(f"File {hdfs_file_path} already exists in HDFS. Deleting before upload.")
                    hdfs_client.delete(hdfs_file_path)

                with open(local_file_path, 'rb') as file_data:
                    hdfs_client.create(hdfs_file_path, file_data)
                logging.info(f"Uploaded {file_name} to HDFS at {hdfs_file_path}")

                # Confirm the file is actually visible on HDFS after the write.
                if hdfs_client.exists(hdfs_file_path):
                    logging.info(f"Successfully uploaded: {file_name}")
                else:
                    logging.error(f"Upload verification failed: {file_name}")
            except Exception as e:
                logging.error(f"Failed to upload {file_name} to HDFS due to {e}")

    except Exception as e:
        logging.error(f"Unexpected error during file upload: {e}")

In [None]:
import os
from datetime import datetime
import pandas as pd

def combine_excel_files_parquet(folder_path, output_directory, output_filename="combined_data.parquet"):
    """Combine the Excel workbooks of a folder into one de-duplicated Parquet file.

    Every ``.xls`` / ``.xlsx`` file directly inside ``folder_path`` is read
    with the ``OPE ID`` column forced to string, the last two characters of
    each ``OPE ID`` are removed, and a ``Timestamp`` column holding the time
    of this run is added. The frames are concatenated, exact duplicate rows
    are dropped, and the result is written to
    ``output_directory/output_filename``.

    Workbooks that cannot be read are reported and skipped. If no workbook
    could be read at all, nothing is written: the output directory is not
    created and no empty Parquet file is produced.

    Args:
        folder_path: Directory containing the Excel files to combine.
        output_directory: Directory for the Parquet file; created if missing.
        output_filename: Name of the Parquet file to write.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # Sorted so rows are combined in a deterministic file order.
    excel_files = sorted(
        f for f in os.listdir(folder_path) if f.endswith(('.xls', '.xlsx'))
    )

    if not excel_files:
        print(f"Aucun fichier Excel trouvé dans le dossier : {folder_path}")
        return

    # One timestamp for the whole run, so every row of this batch carries the same value.
    current_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Collect frames and concatenate once, instead of re-copying a growing
    # DataFrame on every iteration.
    frames = []
    for file_name in excel_files:
        file_path = os.path.join(folder_path, file_name)
        try:
            df = pd.read_excel(file_path, dtype={'OPE ID': str})
            # NOTE(review): unconditionally drops the last two characters of
            # every OPE ID — presumably a fixed suffix; confirm against the data.
            df['OPE ID'] = df['OPE ID'].str[:-2]
            df['Timestamp'] = current_timestamp
            frames.append(df)
            print(f"Fichier lu et combiné : {file_name}")
        except Exception as e:
            print(f"Échec de la lecture du fichier {file_name} : {e}")

    if not frames:
        # Every workbook failed to load: do not create an empty Parquet file.
        print("Aucune donnée n'a pu être lue ; aucun fichier Parquet n'a été écrit.")
        return

    combined_df = pd.concat(frames, ignore_index=True).drop_duplicates()

    os.makedirs(output_directory, exist_ok=True)
    output_file_path = os.path.join(output_directory, output_filename)

    try:
        combined_df.to_parquet(output_file_path, index=False)
        print(f"Données combinées enregistrées dans le fichier Parquet : {output_file_path}")
    except Exception as e:
        print(f"Échec de l'enregistrement des données combinées dans le fichier {output_file_path} : {e}")


In [None]:
# Input folder scanned for Excel workbooks, and the folder that receives the combined Parquet file.
folder_path = "/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/cleaned"
output_directory_parquet = "/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/parquet"

# The output file name is left at the function's default.
combine_excel_files_parquet(
    folder_path=folder_path,
    output_directory=output_directory_parquet,
)

In [None]:
# Local directory whose files are uploaded, and the HDFS directory that receives them.
local_parquet = "/Users/anthonycormeaux/Documents/Projet_data_integration/Nouvelle version/data_integration_student_loans/data/parquet"
hdfs_path = "/user/anthonycormeaux/data/dfparquet"

try:
    # The client address is "host:port", built from the configured constants.
    hdfs_client = pyhdfs.HdfsClient(
        hosts=f"{HDFS_HOST}:{HDFS_PORT}",
        user_name='anthonycormeaux',
    )
    logging.info("HDFS client initialized")

    # upload_files_to_hdfs logs its own failures instead of raising them, so
    # the handlers below mainly cover errors raised while building the client.
    upload_files_to_hdfs(
        local_path=local_parquet,
        hdfs_path=hdfs_path,
        hdfs_client=hdfs_client,
    )

except pyhdfs.HdfsException as hdfs_error:
    logging.error(f"HDFS error: Failed to initialize HDFS client or perform operations due to {hdfs_error}")
except Exception as unexpected_error:
    logging.error(f"Unexpected error: {unexpected_error}")