In [None]:
import pandas as pd


In [6]:
import os
import pyhdfs
import logging
from config import hdfs_config

# Connection and path settings for the HDFS upload step.
# Every value can be overridden with an environment variable of the same name;
# the defaults are the original hard-coded values, so nothing changes when no
# variable is set. This keeps machine-specific paths out of the notebook code.
HDFS_HOST = os.environ.get('HDFS_HOST', 'localhost')  # Replace with your HDFS host
HDFS_PORT = int(os.environ.get('HDFS_PORT', '9870'))  # Default WebHDFS port
HDFS_RAW_DEST_PATH = os.environ.get('HDFS_RAW_DEST_PATH', '/user/hadoop/data/raw')  # Destination path in HDFS (raw data)
HDFS_CLEAN_DEST_PATH = os.environ.get('HDFS_CLEAN_DEST_PATH', '/user/hadoop/data/cleaned')  # Destination path in HDFS (cleaned data)
LOCAL_RAW_DATA_PATH = os.environ.get(
    'LOCAL_RAW_DATA_PATH',
    '/home/freddy/Documents/Cours_efrei/Data_integration/Projets_data_integration/data_integration_student_loans/data/raw',
)  # Local directory holding the raw files to upload


def upload_files_to_hdfs(local_path, hdfs_path, hdfs_client):
    """Upload every regular file of a local directory into an HDFS directory.

    Only the immediate children of ``local_path`` are considered; entries that
    are not regular files are skipped with a warning. Files that already exist
    at the destination are left untouched. All failures are logged rather than
    raised, so one bad file does not abort the remaining uploads.

    Args:
        local_path: Local directory containing the files to upload.
        hdfs_path: Destination directory in HDFS; created when missing.
        hdfs_client: Client object providing ``exists(path)``,
            ``mkdirs(path)`` and ``create(path, data)``.

    Returns:
        None.
    """
    try:
        # Validate the local source first, so that a wrong local path never
        # leaves an empty destination directory behind in HDFS. ``isdir`` also
        # rejects a path that points at a regular file.
        if not os.path.isdir(local_path):
            logging.error(f"Local directory {local_path} does not exist.")
            return

        # Ensure the HDFS directory exists
        if not hdfs_client.exists(hdfs_path):
            hdfs_client.mkdirs(hdfs_path)
            logging.info(f"Created HDFS directory: {hdfs_path}")

        # Strip a trailing slash so destinations never contain '//'.
        hdfs_dir = hdfs_path.rstrip('/')

        # os.listdir gives no ordering guarantee; sort for reproducible runs.
        for file_name in sorted(os.listdir(local_path)):
            local_file_path = os.path.join(local_path, file_name)

            # Only process regular files
            if not os.path.isfile(local_file_path):
                logging.warning(f"{local_file_path} is not a file. Skipping.")
                continue

            hdfs_file_path = f"{hdfs_dir}/{file_name}"

            # Never overwrite a file that is already in HDFS
            if hdfs_client.exists(hdfs_file_path):
                logging.info(f"File {hdfs_file_path} already exists in HDFS. Skipping upload.")
                continue

            # A failure on one file is logged and the loop moves on to the next
            try:
                with open(local_file_path, 'rb') as file_data:
                    hdfs_client.create(hdfs_file_path, file_data)
                logging.info(f"Uploaded {file_name} to HDFS at {hdfs_file_path}")

                # Verify upload success
                if hdfs_client.exists(hdfs_file_path):
                    logging.info(f"Successfully uploaded: {file_name}")
                else:
                    logging.error(f"Upload verification failed: {file_name}")
            except Exception as e:
                logging.error(f"Failed to upload {file_name} to HDFS due to {e}")

    except Exception as e:
        logging.error(f"Unexpected error during file upload: {e}")

In [None]:
# Connect to WebHDFS and push the local raw files to the raw-data directory.
# Any failure is reported through the logging module instead of being raised.
try:
    hdfs_client = pyhdfs.HdfsClient(hosts="{}:{}".format(HDFS_HOST, HDFS_PORT))
    logging.info("HDFS client initialized")

    upload_files_to_hdfs(
        local_path=LOCAL_RAW_DATA_PATH,
        hdfs_path=HDFS_RAW_DEST_PATH,
        hdfs_client=hdfs_client,
    )
except pyhdfs.HdfsException as hdfs_error:
    # Errors reported by the pyhdfs library itself
    logging.error(f"HDFS error: Failed to initialize HDFS client or perform operations due to {hdfs_error}")
except Exception as unexpected_error:
    # Anything else
    logging.error(f"Unexpected error: {unexpected_error}")


In [18]:
import os
import time
import json
import pandas as pd
from kafka import KafkaProducer
from config.kafka_config import KAFKA_BROKER_URL, TOPIC_NAME_FL_DASHBOARD, BATCH_SIZE, POLL_INTERVAL

# Initialize the Kafka producer used by the sending helpers in this notebook.
producer = KafkaProducer(
    bootstrap_servers=KAFKA_BROKER_URL,
    # Serialize each record to UTF-8 encoded JSON. `default=str` makes values
    # the json module cannot encode natively (for example datetime-like cells
    # read from a spreadsheet) go out as their string form instead of raising
    # TypeError; values that were already encodable are serialized as before.
    value_serializer=lambda v: json.dumps(v, default=str).encode('utf-8'),
)

def read_data_in_batches(file_path, batch_size):
    """Yield the rows of an Excel file as successive lists of record dicts.

    The whole workbook is loaded with ``pandas.read_excel`` and then sliced
    into consecutive chunks of at most ``batch_size`` rows.

    Args:
        file_path: Path of the Excel file to read.
        batch_size: Maximum number of rows per batch; must be positive.

    Yields:
        list[dict]: One dict per row, mapping column name to cell value.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration, before the
            file is read.
    """
    # A negative step would make range() empty and silently drop all data,
    # and a zero step would only fail after the file had been loaded.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    df = pd.read_excel(file_path)
    for start in range(0, len(df), batch_size):
        # .iloc makes the positional row slicing explicit
        yield df.iloc[start:start + batch_size].to_dict(orient='records')

def send_data_to_kafka(topic, file_path):
    """Stream the records of a file to a Kafka topic, one batch at a time.

    Batches of ``BATCH_SIZE`` records come from ``read_data_in_batches``.
    Every record of a batch is handed to the module-level ``producer``, the
    producer is flushed, a progress line is printed, and the function then
    sleeps for ``POLL_INTERVAL`` before continuing with the next batch.

    Args:
        topic: Name of the Kafka topic that receives the records.
        file_path: Path of the file whose records are streamed.
    """
    batches = read_data_in_batches(file_path, BATCH_SIZE)
    for records in batches:
        for row in records:
            producer.send(topic, value=row)

        # Push out everything buffered for this batch before pausing
        producer.flush()

        sent_count = len(records)
        print(f"Sent batch of {sent_count} records to Kafka topic '{topic}'")
        time.sleep(POLL_INTERVAL)

def main():
    """Stream every ``FL_Dashboard*`` file found in ``data/raw`` to Kafka.

    The directory is resolved relative to the current working directory.
    Matching files are sent to ``TOPIC_NAME_FL_DASHBOARD`` through
    ``send_data_to_kafka``, one file after the other.
    """
    # Path to raw data files
    raw_data_dir = os.path.join("data", "raw")

    # os.listdir returns names in arbitrary order; sorting makes the order in
    # which files are streamed deterministic from one run to the next.
    for filename in sorted(os.listdir(raw_data_dir)):
        if not filename.startswith("FL_Dashboard"):
            continue
        file_path = os.path.join(raw_data_dir, filename)
        print(f"Streaming data from file: {file_path}")
        send_data_to_kafka(TOPIC_NAME_FL_DASHBOARD, file_path)

In [None]:
# Stream the files, then always close the producer, even when streaming
# fails part-way through, so its connections are not left open in the kernel.
try:
    main()
finally:
    producer.close()

Streaming data from file: data/raw/FL_Dashboard_AY2009_2010_Q1.xlsx
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Sent batch of 100 records to Kafka topic 'fl_dashboard_topic'
Se

In [None]:
# Spark Structured Streaming job that consumes the records published to Kafka

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from config.kafka_config import KAFKA_BROKER_URL, TOPIC_NAME_FL_DASHBOARD, TOPIC_NAME_SCHOOL_CODELIST
from config.spark_config import APP_NAME, MASTER, BATCH_DURATION
import json

def start_spark_streaming(schema=None):
    """Read the Kafka topics as a stream and print the records to the console.

    Subscribes to ``TOPIC_NAME_FL_DASHBOARD`` and ``TOPIC_NAME_SCHOOL_CODELIST``
    from the earliest offsets, casts the Kafka key and value to strings,
    optionally parses the value as JSON, and writes the result to the console
    sink. The call blocks until the streaming query terminates.

    Args:
        schema: Schema of the JSON payload, in a form accepted by
            ``pyspark.sql.functions.from_json`` (for example a ``StructType``
            or a DDL string). When ``None`` (the default) the value column is
            left as the raw JSON string, because no real schema is defined in
            this notebook yet.
    """
    # Initialize Spark session
    spark = (
        SparkSession.builder
        .appName(APP_NAME)
        .master(MASTER)
        .getOrCreate()
    )

    spark.sparkContext.setLogLevel("WARN")

    # Read from Kafka
    # NOTE(review): the "kafka" source presumably needs the Spark Kafka
    # connector package to be available to the session - confirm it is
    # configured outside this cell.
    df_kafka = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", KAFKA_BROKER_URL)
        .option("subscribe", f"{TOPIC_NAME_FL_DASHBOARD},{TOPIC_NAME_SCHOOL_CODELIST}")
        .option("startingOffsets", "earliest")
        .load()
    )

    # Convert Kafka's binary data to string
    df_strings = df_kafka.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

    if schema is None:
        # The previous placeholder string "your_schema_here" is not a usable
        # schema, so without a real one the payload is shown unparsed.
        print("No schema provided; streaming raw JSON strings without parsing.")
        df_parsed = df_strings
    else:
        df_parsed = df_strings.withColumn("value", from_json(col("value"), schema))

    # Write the data to the console (for debugging)
    query = (
        df_parsed.writeStream
        .outputMode("append")
        .format("console")
        .start()
    )

    # Wait for the termination signal
    query.awaitTermination()


# Launch the streaming job; this call returns only once the streaming query terminates.
start_spark_streaming()