# 

# START ZOOKEPER SERVICE
### RUN THIS COMMAND IN A TERMINAL
```
/usr/local/kafka/bin/zookeeper-server-start.sh /usr/local/kafka/config/zookeeper.properties

```
# START KAFKA BROKERS
### RUN THIS COMMAND ON A DIFFERENT TERMINAL FOR EACH LINE
```
/usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server1.properties
/usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server2.properties
/usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server3.properties
/usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server4.properties
```
# LIST AVALABLE BROKERS
```
/usr/local/kafka/bin/zookeeper-shell.sh localhost:2181 ls /brokers/ids
```
# DELETE TOPICS
```
/usr/local/kafka/bin/kafka-topics.sh --bootstrap-server localhost:9092 --delete --topic animals-topic-streaming
```

# LISTS TOPICS
```
/usr/local/kafka/bin/kafka-topics.sh --list --bootstrap-server localhost:9092
```

# DEPENDENCIES

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

# Directory where JARs are located
jars_directory = "/usr/local/spark/jars/"

# List of JAR filenames
jar_files = [
    "commons-pool2-2.11.1.jar",
    "kafka-clients-3.3.2.jar",
    "spark-sql-kafka-0-10_2.12-3.4.1.jar",
    "spark-token-provider-kafka-0-10_2.12-3.4.1.jar"
]

dependencies = ",".join([os.path.join(jars_directory, jar) for jar in jar_files])

# Configure Kafka connection
kafka_bootstrap_servers = "localhost:9092"
topic = "animals-topic-batch"

# Create Spark session and add JARs
spark_session = SparkSession.builder \
    .appName("WriteKafkaAnimals") \
    .config("spark.jars", dependencies) \
    .getOrCreate()

24/01/04 03:28:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# BATCH WRITING

In [2]:
def save_batch_data(spark_session, kafka_bootstrap_servers, topic, iterations=1, empty=0):

    # Create a sample DataFrame with animal data
    data = [("zebra", "mammal"), ("koala", "marsupial"), ("cheetah", "feline"),("dolphin", "mammal"),
            ("parrot", "bird"), ("rhino", "mammal"), ("panda", "mammal"), ("kangaroo", "marsupial"), 
            ("panther", "feline"), ("chimpanzee", "primate"), ("hippo", "mammal"), ("eagle", "bird"), 
            ("orangutan", "primate"), ("bear", "mammal"), ("owl", "bird"), ("polar bear", "mammal"), 
            ("snake", "reptile"), ("hawk", "bird"), ("fox", "mammal"), ("turtle", "reptile"), 
            ("swan", "bird"), ("jaguar", "feline"), ("seagull", "bird"), ("gazelle", "mammal")]

    values =  data if empty == 0 else [('', '')]
    
    columns = ["name", "type"]
    
    df_animals = spark_session.createDataFrame(values, columns)

    for iteration in range(iterations):
        # Write the DataFrame to Kafka topic
        df_animals.selectExpr("name as key", "type as value") \
            .write \
            .format("kafka") \
            .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
            .option("topic", topic) \
            .save()
    
    # Print a message indicating that the data has been written to the Kafka topic
    print(f"{iterations} Iterations, Data written to Kafka topic ({topic}).")

# Configure Kafka connection
kafka_bootstrap_servers = "localhost:9092"
topic = "animals-topic-batch"
save_batch_data(spark_session, kafka_bootstrap_servers, topic)

topic = "animals-topic-streaming"
save_batch_data(spark_session=spark_session, kafka_bootstrap_servers=kafka_bootstrap_servers, topic=topic, empty=1)

                                                                                

1 Iterations, Data written to Kafka topic (animals-topic-batch).
1 Iterations, Data written to Kafka topic (animals-topic-streaming).


# BATCH READING

In [3]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Define the hexadecimal decoding function
@udf(returnType=StringType())
def decode_hex(value):
    try:
        if isinstance(value, str):
            return bytes.fromhex(value).decode('utf-8')
        elif isinstance(value, bytearray):
            return bytes(value).decode('utf-8')
        else:
            return str(value)
    except (ValueError, UnicodeDecodeError):
        return str(value)
            
def read_batch_data(spark_session, kafka_bootstrap_servers, topic):

    # Try to read data from Kafka
    try:
        # Read data from Kafka
        df_kafka = spark_session \
            .read \
            .format("kafka") \
            .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
            .option("subscribe", topic) \
            .load()
    
        # Decode hexadecimal values
        df_decoded = df_kafka \
            .withColumn('key', decode_hex('key')) \
            .withColumn('value', decode_hex('value'))
    
        # Show the DataFrame with decoded data
        df_decoded.show(truncate=False)
    
    except Exception as e:
        if "UnknownTopicOrPartitionException" in str(e):
            print(f"The topic '{topic}' does not exist in the Kafka cluster.")
        else:
            print(f"Unexpected error: {e}")
    
    finally:
        # Stop the Spark session
        None

topic = "animals-topic-batch"
read_batch_data(spark_session, kafka_bootstrap_servers, topic)

24/01/04 03:28:20 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
24/01/04 03:28:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/01/04 03:28:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/01/04 03:28:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/01/04 03:28:21 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
24/01/04 03:28:21 WARN KafkaDataConsumer: KafkaDataConsumer is not 

+----------+---------+-------------------+---------+------+-----------------------+-------------+
|key       |value    |topic              |partition|offset|timestamp              |timestampType|
+----------+---------+-------------------+---------+------+-----------------------+-------------+
|snake     |reptile  |animals-topic-batch|0        |0     |2024-01-04 01:26:40.099|0            |
|jaguar    |feline   |animals-topic-batch|0        |1     |2024-01-04 01:26:40.099|0            |
|dolphin   |mammal   |animals-topic-batch|0        |2     |2024-01-04 01:26:40.099|0            |
|kangaroo  |marsupial|animals-topic-batch|0        |3     |2024-01-04 01:26:40.099|0            |
|bear      |mammal   |animals-topic-batch|0        |4     |2024-01-04 01:26:40.099|0            |
|polar bear|mammal   |animals-topic-batch|0        |5     |2024-01-04 01:26:40.099|0            |
|fox       |mammal   |animals-topic-batch|0        |6     |2024-01-04 01:26:40.099|0            |
|koala     |marsupia

# STREAMING WRITING
## IT WILL BE LISTENING TO THE "animals-topic-batch" TOPIC
## WHEN NEW DATA ARRIVES, IT READS AND STORES THEM IN THE "animals-topic-streaming" TOPIC
## MAKE THE TRANSFORMATIONS

In [4]:
from pyspark.sql.functions import expr

def read_streaming_data(spark_session, kafka_bootstrap_servers, input_topic, output_topic, checkpoint_location, files_directory):

    #spark_session.sparkContext.setLogLevel("DEBUG")    
    # Read from Kafka in streaming mode
    kafkaStream = spark_session \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
        .option("subscribe", input_topic) \
        .option("startingOffsets", "earliest") \
        .load()
    
    # Perform some transformation on the data (here we are simply renaming the columns and uppercase the values)
    transformedStream = kafkaStream.selectExpr( "CAST(key AS STRING) as key","UPPER(CAST(value AS STRING)) as value")

    # save into parquet file
    query_parquet = transformedStream \
      .writeStream \
      .outputMode("append") \
      .format("parquet") \
      .option("path", files_directory) \
      .option("checkpointLocation", checkpoint_location) \
      .trigger(processingTime="1 minute") \
      .start()

    query = transformedStream \
        .writeStream \
        .option("failOnDataLoss", "false") \
        .outputMode("append") \
        .format("kafka") \
        .option("truncate", "false") \
        .option("checkpointLocation", checkpoint_location) \
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
        .option("topic", output_topic) \
        .start()

    print('kafkaStream', type(kafkaStream))
    print('transformedStream', type(transformedStream))
    print('query', type(query))

# Configure Kafka connection
kafka_bootstrap_servers = "localhost:9092"
input_topic = "animals-topic-batch"
output_topic = "animals-topic-streaming"
# Checkpoint directory within the Kafka directory
checkpoint_location = "/usr/local/kafka/data/checkpoint"
files_directory = "/usr/local/kafka/data/files"

read_streaming_data(spark_session, kafka_bootstrap_servers, input_topic, output_topic, checkpoint_location, files_directory)

kafkaStream <class 'pyspark.sql.dataframe.DataFrame'>
transformedStream <class 'pyspark.sql.dataframe.DataFrame'>
query <class 'pyspark.sql.streaming.query.StreamingQuery'>


24/01/04 03:28:21 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/01/04 03:28:21 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/01/04 03:28:21 WARN StreamingQueryManager: Stopping existing streaming query [id=ae0c5d8c-a9e8-48d5-8a25-09d7ba314974, runId=c05f85b9-a9d9-4b8d-8edf-f4c7f80ccfbd], as a new run is being started.


## MASSIVE DATA INSERTION TO KAFKA FOR STREAMING READ
### COPY, PASTE, AND RUN THE FOLLOWING CODE IN ANOTHER NOTEBOOK TO OBSERVE STREAMING REDIN


```python
import os
from pyspark.sql import SparkSession
from time import sleep

# Directory where JARs are located
jars_directory = "/usr/local/spark/jars/"

# List of JAR filenames
jar_files = [
    "commons-pool2-2.11.1.jar",
    "kafka-clients-3.3.2.jar",
    "spark-sql-kafka-0-10_2.12-3.4.1.jar",
    "spark-token-provider-kafka-0-10_2.12-3.4.1.jar"
]

dependencies = ",".join([os.path.join(jars_directory, jar) for jar in jar_files])

# Configure Kafka connection
kafka_bootstrap_servers = "localhost:9092"
topic = "animals-topic-batch"


# Create Spark session and add JARs
spark_session = SparkSession.builder \
    .appName("WriteKafkaAnimals") \
    .config("spark.jars", dependencies) \
    .getOrCreate()


def save_batch_data(spark_session, kafka_bootstrap_servers, topic, iterations=1):

    # Create a sample DataFrame with animal data
    data = [("zebra", "mammal"), ("koala", "marsupial"), ("cheetah", "feline"),("dolphin", "mammal"),
            ("parrot", "bird"), ("rhino", "mammal"), ("panda", "mammal"), ("kangaroo", "marsupial"), 
            ("panther", "feline"), ("chimpanzee", "primate"), ("hippo", "mammal"), ("eagle", "bird"), 
            ("orangutan", "primate"), ("bear", "mammal"), ("owl", "bird"), ("polar bear", "mammal"), 
            ("snake", "reptile"), ("hawk", "bird"), ("fox", "mammal"), ("turtle", "reptile"), 
            ("swan", "bird"), ("jaguar", "feline"), ("seagull", "bird"), ("gazelle", "mammal")]
    
    columns = ["name", "type"]
    
    df_animals = spark_session.createDataFrame(data, columns)

    for iteration in range(iterations):
        sleep(0.2)
        # Write the DataFrame to Kafka topic
        df_animals.selectExpr("name as key", "type as value") \
            .write \
            .format("kafka") \
            .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
            .option("topic", topic) \
            .save()
        print (f'Iteration {iteration}, completed!!!')
    
    # Print a message indicating that the data has been written to the Kafka topic
    print(f"{iterations} Iterations, Data written to Kafka topic ({topic}).")

     # Finally, stop the Spark session
    spark_session.stop()

save_batch_data(spark_session, kafka_bootstrap_servers, topic, iterations=13)
```

# STREAMING READING

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

# Configure Kafka connection
kafka_bootstrap_servers = "localhost:9092"
topic = "animals-topic-streaming"

def read_streaming_data(spark_session,kafka_bootstrap_servers, topic):
   
    # Read from Kafka in streaming mode
    kafkaStream = spark_session.readStream.format("kafka") \
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
        .option("subscribe", topic) \
        .option("startingOffsets", "earliest") \
        .load()
    
    # Show the read content
    query = kafkaStream.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
        .writeStream \
        .outputMode("append") \
        .format("console") \
        .start() \
    
    # Wait for the stream to finish (adjust as needed)
    #query.awaitTermination()
    
    # Finally, stop the Spark session (you can stop it after the stream finishes)
    #spark_session.stop()

read_streaming_data(spark_session, kafka_bootstrap_servers, topic)

24/01/04 03:28:21 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
24/01/04 03:28:21 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-4c0af553-e8a0-4683-9093-44353aa1711b. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/01/04 03:28:21 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/01/04 03:28:21 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


-------------------------------------------
Batch: 0
-------------------------------------------
+----------+---------+
|       key|    value|
+----------+---------+
|          |         |
|          |         |
|          |         |
|     snake|  REPTILE|
|    jaguar|   FELINE|
|   dolphin|   MAMMAL|
|  kangaroo|MARSUPIAL|
|      bear|   MAMMAL|
|polar bear|   MAMMAL|
|       fox|   MAMMAL|
|     koala|MARSUPIAL|
|     zebra|   MAMMAL|
|     hippo|   MAMMAL|
|   cheetah|   FELINE|
|chimpanzee|  PRIMATE|
|     eagle|     BIRD|
|    parrot|     BIRD|
| orangutan|  PRIMATE|
|    turtle|  REPTILE|
|     panda|   MAMMAL|
+----------+---------+
only showing top 20 rows

-------------------------------------------
Batch: 1
-------------------------------------------
+----------+---------+
|       key|    value|
+----------+---------+
|   seagull|     BIRD|
|   gazelle|   MAMMAL|
|chimpanzee|  PRIMATE|
|    jaguar|   FELINE|
|     snake|  REPTILE|
|      hawk|     BIRD|
|    turtle|  REPTILE

# STREAMING FILES USING FROM DIRECTORIES

In [6]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import expr
import shutil
from pyspark.sql import functions as F
from datetime import datetime
from pyspark.sql.functions import lit

def process_batch(directory_to_save_files, df, epoch_id):
    
    # Generate a timestamp
    timestamp_str = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

    # Add epoch_id and timestamp_str as new columns
    df = df.withColumn("batch_id", lit(epoch_id)) \
           .withColumn("timestamp", lit(timestamp_str))

    # Print information about the new batch
    print(f"Processing microbatch {epoch_id} at {timestamp_str}")

    # Group by specified columns and calculate the average ticket price
    grouped_df = df.groupBy(
        "passenger_nationality",
        "passenger_gender",
        "passenger_age",
        "batch_id",
        "timestamp" 
    ).agg(
        F.avg("ticket_price").alias("avg_ticket_price"),
        F.sum("ticket_price").alias("total_ticket_price"),
        F.count("passenger_name").alias("total_passengers")
    )

    # Filter out groups where the total number of passengers is greater than 1
    filtered_df = grouped_df.filter(grouped_df.total_passengers > 1)

    # Sort the DataFrame by passenger_nationality and passenger_age
    sorted_df = filtered_df.orderBy("passenger_nationality", "passenger_gender")

    # Show the sorted DataFrame
    sorted_df.show(truncate=False)

    # Coalesce to a single partition and write the DataFrame as Parquet with timestamp
    maximum_parquet_files_per_batch  = 3
    sorted_df.coalesce(maximum_parquet_files_per_batch ).write.parquet(f"{directory_to_save_files}/microbatch_{epoch_id}_{timestamp_str}")

    # Show the original DataFrame
    df.show()

def read_file_like_streaming(spark_session, customSchema, format, checkpoint_location, directory_to_save_files):
    
    # Read the DataFrame as a continuous stream
    streaming_df = spark_session \
                    .readStream \
                    .schema(customSchema) \
                    .format(format) \
                    .option("header", "true") \
                    .load(folder_files_path)

    # Display the DataFrame in the console and count the records per microbatch
    # .trigger(processingTime='5 seconds') specifies that Spark Structured Streaming should process micro-batches of data from the specified directory every 5 seconds.
    query = streaming_df \
        .writeStream \
        .outputMode("append") \
        .trigger(processingTime='5 seconds') \
        .option("checkpointLocation", checkpoint_location) \
        .option("basePath", directory_to_save_files) \
        .foreachBatch(lambda df, epoch_id: process_batch(directory_to_save_files, df, epoch_id)) \
        .start()
    
    # Wait for the streaming to complete
    #query.awaitTermination()

format = 'csv'
folder_files_path = "/notebooks"
base_path = "/usr/local/kafka/data/files/batch"
shutil.rmtree(base_path, ignore_errors=True)

# Delete the checkpoint directory
checkpoint_location = "/usr/local/kafka/data/batch"
shutil.rmtree(checkpoint_location, ignore_errors=True)

# Define the schema
customSchema = StructType([
    StructField("id", IntegerType(), True),
    StructField("secure_code", StringType(), True),
    StructField("airline", StringType(), True),
    StructField("departure_city", StringType(), True),
    StructField("departure_date", StringType(), True),
    StructField("arrival_airport", StringType(), True),
    StructField("arrival_city", StringType(), True),
    StructField("arrival_time", StringType(), True),
    StructField("passenger_name", StringType(), True),
    StructField("passenger_gender", StringType(), True),
    StructField("seat_number", StringType(), True),
    StructField("currency", StringType(), True),
    StructField("departure_gate", StringType(), True),
    StructField("flight_status", StringType(), True),
    StructField("co_pilot_name", StringType(), True),
    StructField("aircraft_type", StringType(), True),
    StructField("fuel_consumption", DoubleType(), True),
    StructField("flight_id", IntegerType(), True),
    StructField("flight_number", IntegerType(), True),
    StructField("departure_airport", StringType(), True),
    StructField("departure_country", StringType(), True),
    StructField("departure_time", StringType(), True),
    StructField("arrival_country", StringType(), True),
    StructField("arrival_date", StringType(), True),
    StructField("flight_duration", DoubleType(), True),
    StructField("passenger_age", IntegerType(), True),
    StructField("passenger_nationality", StringType(), True),
    StructField("ticket_price", DoubleType(), True),
    StructField("baggage_weight", DoubleType(), True),
    StructField("arrival_gate", StringType(), True),
    StructField("pilot_name", StringType(), True),
    StructField("cabin_crew_count", IntegerType(), True),
    StructField("aircraft_registration", StringType(), True),
    StructField("flight_distance", DoubleType(), True)
])


directory_to_save_files = "/notebooks"

read_file_like_streaming(spark_session=spark_session, customSchema=customSchema, format=format, checkpoint_location=checkpoint_location, directory_to_save_files=directory_to_save_files)

24/01/04 03:28:49 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/01/04 03:28:49 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Processing microbatch 0 at 2024_01_04_03_28_49


24/01/04 03:28:50 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 1, schema size: 34
CSV file: file:///notebooks/Spark-Kafka-Batch-Streaming.ipynb
24/01/04 03:28:50 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 1, schema size: 34
CSV file: file:///notebooks/Untitled.ipynb


+---------------------+----------------+-------------+--------+-------------------+----------------+------------------+----------------+
|passenger_nationality|passenger_gender|passenger_age|batch_id|timestamp          |avg_ticket_price|total_ticket_price|total_passengers|
+---------------------+----------------+-------------+--------+-------------------+----------------+------------------+----------------+
|null                 |null            |null         |0       |2024_01_04_03_28_49|null            |null              |10              |
+---------------------+----------------+-------------+--------+-------------------+----------------+------------------+----------------+



24/01/04 03:28:51 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 1, schema size: 34
CSV file: file:///notebooks/Untitled.ipynb
24/01/04 03:28:51 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 1, schema size: 34
CSV file: file:///notebooks/Spark-Kafka-Batch-Streaming.ipynb
24/01/04 03:28:52 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 1, schema size: 34
CSV file: file:///notebooks/Spark-Kafka-Batch-Streaming.ipynb


+----+-----------+-------+--------------+--------------+---------------+------------+------------+--------------+----------------+-----------+--------+--------------+-------------+-------------+-------------+----------------+---------+-------------+-----------------+-----------------+--------------+---------------+------------+---------------+-------------+---------------------+------------+--------------+------------+----------+----------------+---------------------+---------------+--------+-------------------+
|  id|secure_code|airline|departure_city|departure_date|arrival_airport|arrival_city|arrival_time|passenger_name|passenger_gender|seat_number|currency|departure_gate|flight_status|co_pilot_name|aircraft_type|fuel_consumption|flight_id|flight_number|departure_airport|departure_country|departure_time|arrival_country|arrival_date|flight_duration|passenger_age|passenger_nationality|ticket_price|baggage_weight|arrival_gate|pilot_name|cabin_crew_count|aircraft_registration|flight_dis

# COPY, PASTE, AND RUN THE FOLLOWING CODE IN ANOTHER NOTEBOOK TO OBSERVE STREAMING REDING

```python
import pandas as pd
import numpy as np
from time import sleep

for index in range(1,6):

    part_1 = index
    part_2 = index + 5

    file_name = 'flight_logs'
    final_file = f'{file_name}_{index}.csv'
    
    file_1 = f'{file_name}_part_1_{index}.csv'
    base_url1 = f'https://raw.githubusercontent.com/JorgeCardona/recursos/main/datasets/{file_1}'
    df1 = pd.read_csv(base_url1)
    
    file_2 = f'{file_name}_part_2_{index}.csv'
    base_url2 = f'https://raw.githubusercontent.com/JorgeCardona/recursos/main/datasets/{file_2}'
    df2 = pd.read_csv(base_url2)

    df3 = pd.merge(df1, df2, left_on='id', right_on='flight_id', how='inner')
    df3.to_csv(f'{final_file}',index=False)

    print(f'{final_file} saved Successfully!!')
    sleep(10)
```

# LOAD PARQUET FILES DIRECTORY ON DATAFRAME

In [7]:
import pandas as pd
import pyarrow.parquet as pq
import glob

def read_parquet_directory_into_dataframe(directory_path):
    """
    Reads all Parquet files from a directory into a single pandas DataFrame.

    Parameters:
    - directory_path (str): Path to the directory containing Parquet files.

    Returns:
    - pd.DataFrame: Combined DataFrame containing data from all Parquet files.
    """
    # Use glob to get a list of all Parquet file paths in the specified directory
    parquet_files = glob.glob(f'{directory_path}/*.parquet')
    
    # Initialize an empty list to store individual DataFrames
    dfs = []
    
    # Loop through each Parquet file and read it into a DataFrame
    for parquet_file in parquet_files:
        # Read Parquet file into a DataFrame
        df = pq.read_table(parquet_file).to_pandas()
        
        # Append the DataFrame to the list
        dfs.append(df)
    
    # Concatenate all DataFrames into a single DataFrame
    all_data = pd.concat(dfs, ignore_index=True)
    
    return all_data

# Specify the directory where Parquet files are located
directory_path = 'microbatch_5_2024_01_04_03_29_45/'

read_parquet_directory_into_dataframe(directory_path)

Unnamed: 0,passenger_nationality,passenger_gender,passenger_age,batch_id,timestamp,avg_ticket_price,total_ticket_price,total_passengers
0,Poland,Female,21,5,2024_01_04_03_29_45,202.870,405.74,2
1,Poland,Male,90,5,2024_01_04_03_29_45,438.330,876.66,2
2,Poland,Male,80,5,2024_01_04_03_29_45,741.635,1483.27,2
3,Portugal,Female,47,5,2024_01_04_03_29_45,428.000,856.00,2
4,Portugal,Female,79,5,2024_01_04_03_29_45,740.210,1480.42,2
...,...,...,...,...,...,...,...,...
76,China,Male,95,5,2024_01_04_03_29_45,402.975,805.95,2
77,China,Male,76,5,2024_01_04_03_29_45,475.855,951.71,2
78,Colombia,Female,83,5,2024_01_04_03_29_45,679.375,1358.75,2
79,France,Male,45,5,2024_01_04_03_29_45,710.710,1421.42,2


# CLOSE SPARK SESSION

In [8]:
spark_session.stop()