In [4]:
from config import hdfs_config
print()





In [5]:
HDFS_HOST = hdfs_config.HDFSConfig.HOST
HDFS_PORT = hdfs_config.HDFSConfig.PORT
LOCAL_RAW_DATA_PATH = hdfs_config.HDFSConfig.LOCAL_RAW_DATA_PATH
HDFS_RAW_DEST_PATH = hdfs_config.HDFSConfig.RAW_DEST_PATH

In [4]:
import os
import pyhdfs
import logging
from config import hdfs_config

HDFS_HOST = 'localhost'  # Replace with your HDFS host
HDFS_PORT = 9870         # Default WebHDFS port
HDFS_RAW_DEST_PATH = '/user/hadoop/data/raw'  # Destination path in HDFS
HDFS_CLEAN_DEST_PATH = '/user/hadoop/data/cleaned'  # Destination path in HDFS
LOCAL_RAW_DATA_PATH = '/home/freddy/Documents/Cours_efrei/Data_integration/Projets_data_integration/data_integration_student_loans/data/raw'


def upload_files_to_hdfs(local_path, hdfs_path, hdfs_client):
    """Uploads files from a local directory to HDFS with detailed error handling."""
    try:
        # Ensure the HDFS directory exists
        if not hdfs_client.exists(hdfs_path):
            hdfs_client.mkdirs(hdfs_path)
            logging.info(f"Created HDFS directory: {hdfs_path}")
        
        # Check if the local directory exists
        if not os.path.exists(local_path):
            logging.error(f"Local directory {local_path} does not exist.")
            return

        # Iterate through files in the local raw data directory
        for file_name in os.listdir(local_path):
            local_file_path = os.path.join(local_path, file_name)

            # Only process files
            if os.path.isfile(local_file_path):
                hdfs_file_path = f"{hdfs_path}/{file_name}"

                # Check if the file already exists in HDFS
                if hdfs_client.exists(hdfs_file_path):
                    logging.info(f"File {hdfs_file_path} already exists in HDFS. Skipping upload.")
                    continue

                # Attempt to upload file to HDFS
                try:
                    with open(local_file_path, 'rb') as file_data:
                        hdfs_client.create(hdfs_file_path, file_data)
                    logging.info(f"Uploaded {file_name} to HDFS at {hdfs_file_path}")

                    # Verify upload success
                    if hdfs_client.exists(hdfs_file_path):
                        logging.info(f"Successfully uploaded: {file_name}")
                    else:
                        logging.error(f"Upload verification failed: {file_name}")
                except Exception as e:
                    logging.error(f"Failed to upload {file_name} to HDFS due to {e}")

            else:
                logging.warning(f"{local_file_path} is not a file. Skipping.")

    except Exception as e:
        logging.error(f"Unexpected error during file upload: {e}")

In [5]:
try:
    # Initialize HDFS client
    hdfs_client = pyhdfs.HdfsClient(hosts=f"{HDFS_HOST}:{HDFS_PORT}")
    logging.info("HDFS client initialized")
    
    # Start uploading files
    upload_files_to_hdfs(LOCAL_RAW_DATA_PATH, HDFS_RAW_DEST_PATH, hdfs_client)
    
except pyhdfs.HdfsException as he:
    logging.error(f"HDFS error: Failed to initialize HDFS client or perform operations due to {he}")
except Exception as e:
    logging.error(f"Unexpected error: {e}")


In [11]:
from hdfs import InsecureClient
import pandas as pd
from io import BytesIO

def read_files_from_hdfs(hdfs_directory_path, hdfs_host, hdfs_port, nrows=None):
    """
    Reads each file in the specified HDFS directory into a separate DataFrame.
    
    Parameters:
        hdfs_directory_path (str): HDFS directory path containing files to read.
        hdfs_host (str): HDFS host address.
        hdfs_port (int): HDFS port number.
        nrows (int, optional): Number of rows to read from each file (for memory efficiency).
    
    Returns:
        dict: A dictionary with file names as keys and DataFrames as values.
    """
    client = InsecureClient(f'http://{hdfs_host}:{hdfs_port}')
    data_frames = {}
    
    try:
        # List all files in the specified HDFS directory
        files = client.list(hdfs_directory_path)
        
        for file_name in files:
            file_path = f"{hdfs_directory_path}/{file_name}"
            
            # Read file content from HDFS
            with client.read(file_path) as f:
                file_content = f.read()
            
            # Use BytesIO for compatibility with pandas
            file_data = BytesIO(file_content)
            
            # Load the data into a DataFrame and store it in the dictionary
            data_frames[file_name] = pd.read_excel(file_data, nrows=nrows)
            print(f"Loaded file '{file_name}' into a DataFrame.")
        
        return data_frames
    
    except Exception as e:
        print(f"Error reading files from HDFS: {e}")
        return None

# Example usage
data_frames = read_files_from_hdfs(HDFS_RAW_DEST_PATH, HDFS_HOST, HDFS_PORT, nrows=None)


Loaded file 'FL_Dashboard_AY2009_2010_Q1.xlsx' into a DataFrame.
Loaded file 'FL_Dashboard_AY2009_2010_Q2.xlsx' into a DataFrame.
Loaded file 'FL_Dashboard_AY2009_2010_Q3.xls' into a DataFrame.
Loaded file 'FL_Dashboard_AY2009_2010_Q4.xls' into a DataFrame.


In [13]:
for file_name, df in data_frames.items():
    print(f"Cleaning DataFrame for file: {file_name}")
    # Perform individual cleaning steps for each DataFrame
    # Example: df = df.dropna() or any other cleaning function you wish to apply
    # After cleaning, you can save or process each df as needed

    # Example: Show the first few rows of each DataFrame
    print(df)

Cleaning DataFrame for file: FL_Dashboard_AY2009_2010_Q1.xlsx
             2009-2010 Award Year FFEL Volume by School  \
0     Award Year Quarterly Activity  (07/01/2009-09/...   
1                                    Data Run: 4/5/2012   
2                                                         
3                                                   NaN   
4                                                OPE ID   
...                                                 ...   
3820                                           00393200   
3821                                           00393300   
3822                                           00728900   
3823                                           00915700   
3824                                           00925900   

                             Unnamed: 1 Unnamed: 2 Unnamed: 3   Unnamed: 4  \
0                                   NaN        NaN        NaN          NaN   
1                                   NaN        NaN        NaN          Na

In [2]:
from config import hdfs_config
from hdfs import InsecureClient
import os
import logging
from io import BytesIO
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col
import re

# HDFS Configuration
HDFS_HOST = hdfs_config.HDFSConfig.HOST
HDFS_PORT = hdfs_config.HDFSConfig.PORT
HDFS_RAW_DEST_PATH = hdfs_config.HDFSConfig.RAW_DEST_PATH
HDFS_CLEAN_DEST_PATH = hdfs_config.HDFSConfig.CLEAN_DEST_PATH

# Initialize Spark session
def initialize_spark():
    return SparkSession.builder.appName("Data Cleaning").getOrCreate()

# Extract quarter dates from the Quarterly Activity sheet
def extract_quarter_dates(file_data):
    metadata_df = pd.read_excel(file_data, sheet_name='Quarterly Activity', nrows=5)
    metadata_text = " ".join(metadata_df.astype(str).values.flatten())
    quarter_dates_match = re.search(r"\((\d{2}/\d{2}/\d{4})-(\d{2}/\d{2}/\d{4})\)", metadata_text)
    return quarter_dates_match.group(1) if quarter_dates_match else None, quarter_dates_match.group(2) if quarter_dates_match else None

# Load and convert sheets into Spark DataFrames
def load_ffel_data(file_data, spark):
    quarterly_activity_pd = pd.read_excel(file_data, sheet_name='Quarterly Activity', skiprows=5, dtype={'OPE ID': str})
    award_year_summary_pd = pd.read_excel(file_data, sheet_name='Award Year Summary', skiprows=5, dtype={'OPE ID': str})
    quarterly_activity_spark = spark.createDataFrame(quarterly_activity_pd)
    award_year_summary_spark = spark.createDataFrame(award_year_summary_pd)
    return quarterly_activity_spark, award_year_summary_spark

# Add quarter dates to the FFEL DataFrames
def add_quarter_dates(quarterly_df, award_year_df, quarter_start, quarter_end):
    quarterly_df = quarterly_df.withColumn("Quarter_Start", lit(quarter_start)).withColumn("Quarter_End", lit(quarter_end))
    award_year_df = award_year_df.withColumn("Quarter_Start", lit(None)).withColumn("Quarter_End", lit(None))
    return quarterly_df, award_year_df

def combine_dataframes(*dfs):
    combined_df = dfs[0]
    for df in dfs[1:]:
        combined_df = combined_df.union(df)
    return combined_df.dropDuplicates()

def rename_columns(df, column_mapping):
    for old_name, new_name in column_mapping.items():
        if old_name in df.columns:
            df = df.withColumnRenamed(old_name, new_name)
    return df

def cast_columns_to_double(df, columns_to_cast):
    for col_name in columns_to_cast:
        if col_name in df.columns:
            df = df.withColumn(col_name, col(col_name).cast("double"))
    return df

# Combine, clean, and process the data, then save to a local Excel file
def process_ffel_data(file_data, output_path, spark):
    quarter_start, quarter_end = extract_quarter_dates(file_data)
    quarterly_activity_spark, award_year_summary_spark = load_ffel_data(file_data, spark)
    quarterly_activity_spark, award_year_summary_spark = add_quarter_dates(quarterly_activity_spark, award_year_summary_spark, quarter_start, quarter_end)
    
    # Combine the two sheets into one DataFrame
    combined_df = combine_dataframes(quarterly_activity_spark, award_year_summary_spark)
    
    # Define column mappings for renaming
    column_mapping = {
        "# of Loans Originated": "ffel_subsidized_number_of_loans_originated",
        "$ of Loans Originated": "ffel_subsidized_amount_of_loans_originated",
        # Add other mappings here...
    }
    combined_df = rename_columns(combined_df, column_mapping)
    
    # Cast columns to double where necessary
    columns_to_cast = [
        "ffel_subsidized_amount_of_loans_originated", "ffel_unsubsidized_amount_of_loans_originated"
        # Add other columns as needed
    ]
    combined_df = cast_columns_to_double(combined_df, columns_to_cast)
    
    # Save cleaned DataFrame to local Excel
    save_to_excel(combined_df, output_path)

# Save the DataFrame to Excel for later upload
def save_to_excel(df, output_path):
    df_pd = df.toPandas()
    df_pd.to_excel(output_path, index=False, float_format="%.2f")

# Main processing function for each file type
def read_and_clean_files(hdfs_directory_path, hdfs_host, hdfs_port, local_cleaned_path):
    client = InsecureClient(f'http://{hdfs_host}:{hdfs_port}')
    spark = initialize_spark()
    try:
        files = client.list(hdfs_directory_path)
        
        for file_name in files:
            file_path = f"{hdfs_directory_path}/{file_name}"
            print(f"Processing file: {file_name}")
            
            with client.read(file_path) as f:
                file_content = f.read()
            file_data = BytesIO(file_content)
            
            if "FL_Dashboard" in file_name:
                output_path = os.path.join(local_cleaned_path, f"cleaned_{file_name}")
                process_ffel_data(file_data, output_path, spark)
                
    finally:
        spark.stop()

# Upload cleaned files to HDFS
def upload_files_to_hdfs(local_path, hdfs_path, hdfs_client):
    try:
        if not hdfs_client.exists(hdfs_path):
            hdfs_client.mkdirs(hdfs_path)
        
        for file_name in os.listdir(local_path):
            local_file_path = os.path.join(local_path, file_name)
            if os.path.isfile(local_file_path):
                hdfs_file_path = f"{hdfs_path}/{file_name}"
                with open(local_file_path, 'rb') as file_data:
                    hdfs_client.create(hdfs_file_path, file_data)
                logging.info(f"Uploaded {file_name} to HDFS at {hdfs_file_path}")
    
    except Exception as e:
        logging.error(f"Unexpected error during file upload: {e}")

# Main execution
local_cleaned_path = "/home/freddy/Documents/Cours_efrei/Data_integration/Projets_data_integration/data_integration_student_loans/data/cleaned"
read_and_clean_files(HDFS_RAW_DEST_PATH, HDFS_HOST, HDFS_PORT, local_cleaned_path)
hdfs_client = InsecureClient(f'http://{HDFS_HOST}:{HDFS_PORT}')
upload_files_to_hdfs(local_cleaned_path, HDFS_CLEAN_DEST_PATH, hdfs_client)




Processing file: FL_Dashboard_AY2009_2010_Q1.xlsx


                                                                                

Processing file: FL_Dashboard_AY2009_2010_Q2.xlsx


24/11/16 14:29:34 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

Processing file: FL_Dashboard_AY2009_2010_Q3.xls


                                                                                

ValueError: No engine for filetype: 'xls'

In [18]:
import os
import time
import json
import pandas as pd
from kafka import KafkaProducer
from config.kafka_config import KAFKA_BROKER_URL, TOPIC_NAME_FL_DASHBOARD, BATCH_SIZE, POLL_INTERVAL

# Initialize Kafka producer
producer = KafkaProducer(
    bootstrap_servers=KAFKA_BROKER_URL,
    value_serializer=lambda v: json.dumps(v).encode('utf-8')  # Serialize data to JSON
)

def read_data_in_batches(file_path, batch_size):
    """Reads data from file in batches."""
    """Reads data from an Excel file in batches."""
    df = pd.read_excel(file_path)
    for start in range(0, len(df), batch_size):
        yield df[start:start + batch_size].to_dict(orient='records')

def send_data_to_kafka(topic, file_path):
    """Sends data from the file to Kafka in batches."""
    for batch in read_data_in_batches(file_path, BATCH_SIZE):
        for record in batch:
            producer.send(topic, value=record)  # Send each record to Kafka
        producer.flush()  # Ensure data is sent before sleeping
        print(f"Sent batch of {len(batch)} records to Kafka topic '{topic}'")
        time.sleep(POLL_INTERVAL)

def main():
    # Path to raw data files
    raw_data_dir = os.path.join("data", "raw")
    
    # Send FL_Dashboard files
    for filename in os.listdir(raw_data_dir):
        if filename.startswith("FL_Dashboard"):
            file_path = os.path.join(raw_data_dir, filename)
            print(f"Streaming data from file: {file_path}")
            send_data_to_kafka(TOPIC_NAME_FL_DASHBOARD, file_path)

In [None]:
main()
producer.close()

Streaming data from file: data/raw/FL_Dashboard_AY2009_2010_Q1.xlsx
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Se

In [None]:
# Kafka consumer to pull data from Kafka

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from config.kafka_config import KAFKA_BROKER_URL, TOPIC_NAME_FL_DASHBOARD, TOPIC_NAME_SCHOOL_CODELIST
from config.spark_config import APP_NAME, MASTER, BATCH_DURATION
import json

def start_spark_streaming():
    # Initialize Spark session
    spark = SparkSession.builder \
        .appName(APP_NAME) \
        .master(MASTER) \
        .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")

    # Read from Kafka
    df_kafka = spark.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", KAFKA_BROKER_URL) \
        .option("subscribe", f"{TOPIC_NAME_FL_DASHBOARD},{TOPIC_NAME_SCHOOL_CODELIST}") \
        .option("startingOffsets", "earliest") \
        .load()

    # Convert Kafka's binary data to string
    df_kafka = df_kafka.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

    # Define schema or processing functions as needed for incoming data
    # This example assumes JSON strings; adjust according to your data format
    schema = "your_schema_here"  # Define your schema here

    df_parsed = df_kafka.withColumn("value", from_json(col("value"), schema))

    # Write the parsed data to the console (for debugging)
    query = df_parsed.writeStream \
        .outputMode("append") \
        .format("console") \
        .start()

    # Wait for the termination signal
    query.awaitTermination()


start_spark_streaming()

localhost
