In [1]:
import time
import requests
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType


class RealTimeDataExtract:
    """
    Fetch the current weather for one city from the OpenWeatherMap API.

    Parameters
    ----------
    api_key : str
        OpenWeatherMap API key, sent as the ``appid`` query parameter.
    city : str
        City name, sent as the ``q`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 10).
        Without a timeout a stalled connection would block the caller forever.
    session : object, optional
        Any object exposing ``get(url, params=..., timeout=...)`` (for example
        a ``requests.Session``). When omitted, the ``requests`` module itself
        is used, which matches the previous behaviour.
    """

    def __init__(self, api_key, city, timeout=10, session=None):
        self.api_key = api_key
        self.city = city
        self.timeout = timeout
        self.session = session
        # HTTPS so the API key in the query string is not sent in clear text.
        self.base_url = "https://api.openweathermap.org/data/2.5/weather"

    def fetch_data(self):
        """
        Request the current weather and return the decoded JSON payload.

        Returns
        -------
        dict or None
            The decoded response when its ``cod`` field is 200; ``None`` when
            the request fails, the body is not a JSON object, or the API
            reports an error (the reason is printed in each case).
        """
        params = {
            'q': self.city,
            'appid': self.api_key,
            'units': 'metric',  # Temperature in Celsius
        }
        # Resolved at call time so an injected session always takes precedence.
        http = self.session if self.session is not None else requests
        try:
            response = http.get(self.base_url, params=params, timeout=self.timeout)
            data = response.json()
        except ValueError as exc:
            # Body was not valid JSON (e.g. an HTML error page from a proxy).
            print("Error fetching data from API:", exc)
            return None
        except requests.RequestException as exc:
            # Connection failure, DNS error, timeout, ...
            print("Error fetching data from API:", exc)
            return None

        if not isinstance(data, dict):
            print("Error fetching data from API:", "unexpected response format")
            return None

        # Compare as text so both 200 and "200" count as success; error
        # payloads may carry the code as a string.
        if str(data.get("cod")) != "200":
            print("Error fetching data from API:", data.get("message"))
            return None
        return data


class DataTransform:
    """
    Turn one raw OpenWeatherMap payload into a single-row PySpark DataFrame.

    Creating an instance obtains (or reuses) a local SparkSession and prints
    the Spark master it is attached to.
    """

    def __init__(self):
        # Initialize Spark Session
        self.spark = SparkSession.builder \
            .master("local[*]")\
            .appName("RealTimeWeatherPipeline") \
            .getOrCreate()
        print("Spark Master:", self.spark.sparkContext.master)

    @staticmethod
    def _extract_record(raw_data):
        """Pick the output fields from the payload and coerce the numeric ones."""

        def _optional(cast, value):
            # The schema columns are nullable, so keep an explicit null as null.
            return None if value is None else cast(value)

        main = raw_data['main']
        return {
            'city': raw_data['name'],
            # Local wall-clock time at which the record was built.
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            # JSON may deliver a whole-number temperature as an int, which
            # Spark's schema verification rejects for a DoubleType field.
            'temperature': _optional(float, main['temp']),
            'humidity': _optional(int, main['humidity']),
            'pressure': _optional(int, main['pressure']),
            'weather_description': raw_data['weather'][0]['description'],
        }

    def transform(self, raw_data):
        """
        Build a one-row DataFrame from a decoded API response.

        Parameters
        ----------
        raw_data : dict or None
            Decoded API response; must provide ``name``, ``main.temp``,
            ``main.humidity``, ``main.pressure`` and
            ``weather[0].description``.

        Returns
        -------
        pyspark.sql.DataFrame or None
            A DataFrame with columns city, timestamp, temperature, humidity,
            pressure and weather_description; ``None`` when ``raw_data`` is
            ``None`` or lacks the expected fields (the reason is printed).
        """
        if raw_data is None:
            return None

        # Validate the payload before touching Spark so a malformed response
        # is reported as a failed transformation instead of raising.
        try:
            record = self._extract_record(raw_data)
        except (KeyError, IndexError, TypeError, ValueError) as exc:
            print("Unexpected API payload, cannot transform:", repr(exc))
            return None

        # Define schema for PySpark DataFrame
        schema = StructType([
            StructField("city", StringType(), True),
            StructField("timestamp", StringType(), True),
            StructField("temperature", DoubleType(), True),
            StructField("humidity", IntegerType(), True),
            StructField("pressure", IntegerType(), True),
            StructField("weather_description", StringType(), True),
        ])

        # Convert to PySpark DataFrame
        return self.spark.createDataFrame([record], schema=schema)


class RealTimeDataProcess:
    """
    Compute per-city summary metrics from a weather DataFrame.
    """

    def calculate(self, spark_df):
        """
        Average temperature and humidity for each city.

        Parameters
        ----------
        spark_df : pyspark.sql.DataFrame or None
            Input with at least ``city``, ``temperature`` and ``humidity``
            columns.

        Returns
        -------
        pyspark.sql.DataFrame or None
            One row per city with columns ``city``, ``avg_temperature`` and
            ``avg_humidity``; ``None`` when ``spark_df`` is ``None``.

        Notes
        -----
        An earlier version added a ``current_timestamp`` column before the
        aggregation. ``groupBy(...).agg(...)`` keeps only the grouping key and
        the aggregates, so that column never reached the result; the no-op
        step has been removed and the output schema is unchanged.
        """
        if spark_df is None:
            return None

        metrics_df = spark_df \
            .groupBy("city") \
            .agg(
                avg("temperature").alias("avg_temperature"),
                avg("humidity").alias("avg_humidity")
            )

        return metrics_df


class DataStorage:
    """
    Persists a PySpark DataFrame as CSV output at a caller-supplied path.
    """

    def store(self, spark_df, file_path):
        """
        Append ``spark_df`` to ``file_path`` in CSV format with a header.

        Prints a confirmation on success, or a notice when ``spark_df`` is
        ``None`` (in which case nothing is written). Always returns ``None``.
        """
        if spark_df is not None:
            # Same writer chain as before, spelled out one step at a time.
            writer = spark_df.write.mode("append")
            writer = writer.option("header", "true")
            writer.csv(file_path)
            print(f"Data stored successfully at: {file_path}")
        else:
            print("No data to store.")


In [2]:
class RealTimePipeline:
    """
    Main Real-Time Pipeline to orchestrate all components.

    Parameters
    ----------
    api_key : str
        API key handed to ``RealTimeDataExtract``.
    city : str
        City whose weather is fetched on each run.
    file_path : str
        Destination handed to ``DataStorage.store``.
    """

    def __init__(self, api_key, city, file_path):
        self.api_key = api_key
        self.city = city
        self.file_path = file_path
        # Created on the first successful fetch and reused afterwards, so the
        # SparkSession is resolved (and its master printed) once per pipeline
        # instead of once per run.
        self._transformation = None

    def run(self):
        """
        Runs the full pipeline: ingestion, transformation, metrics, and storage.

        Each stage may return ``None`` to signal failure; the run then prints
        a message and stops early without raising. Returns ``None``.
        """
        print(f"Running pipeline for city: {self.city}")

        # Step 1: Real-Time Data Ingestion
        # Built per run so changes to self.api_key / self.city are honoured.
        ingestion = RealTimeDataExtract(self.api_key, self.city)
        raw_data = ingestion.fetch_data()
        if raw_data is None:
            print("No data fetched. Skipping this run.")
            return

        # Step 2: Data Transformation
        # Deferred until data is available so a failed fetch never starts Spark.
        if self._transformation is None:
            self._transformation = DataTransform()
        transformed_data = self._transformation.transform(raw_data)
        if transformed_data is None:
            print("Transformation failed. Skipping this run.")
            return

        transformed_data.show(truncate=False)

        # Step 3: Real-Time Metrics Calculation
        metrics_calculation = RealTimeDataProcess()
        metrics_data = metrics_calculation.calculate(transformed_data)
        if metrics_data is None:
            print("Metrics calculation failed. Skipping this run.")
            return

        print("Metrics Data:")
        metrics_data.show(truncate=False)

        # Step 4: Data Storage
        storage = DataStorage()
        storage.store(metrics_data, self.file_path)

        print(f"Pipeline run completed successfully for city: {self.city}")


if __name__ == "__main__":
    import os  # local import: only this entry-point block needs it

    # API Key for OpenWeather.
    # Read from the OPENWEATHER_API_KEY environment variable so a real key is
    # never saved inside the notebook; the placeholder remains the fallback.
    API_KEY = os.environ.get("OPENWEATHER_API_KEY", "OPenWeatherAPIKey")
    CITY = 'London'
    # NOTE(review): Spark's CSV writer is expected to create a directory of
    # part files at this path rather than a single .csv file - confirm
    # before pointing other tools at it.
    FILE_PATH = 'data/real_time_weatherData_Spark.csv'
    # Seconds to wait between two pipeline runs.
    POLL_INTERVAL_SECONDS = 10

    # Initialize the pipeline once and run it repeatedly until interrupted
    pipeline = RealTimePipeline(API_KEY, CITY, FILE_PATH)

    try:
        while True:
            pipeline.run()
            print(f"Sleeping for {POLL_INTERVAL_SECONDS} seconds...")
            time.sleep(POLL_INTERVAL_SECONDS)  # Fetch new data after the pause
    except KeyboardInterrupt:
        # Ctrl-C / kernel interrupt is the intended way to stop the loop.
        print("Pipeline stopped manually.")


Running pipeline for city: London


24/12/22 20:54:21 WARN Utils: Your hostname, zhangmins-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.0.0.194 instead (on interface en0)
24/12/22 20:54:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/22 20:54:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Master: local[*]


                                                                                

+------+-------------------+-----------+--------+--------+-------------------+
|city  |timestamp          |temperature|humidity|pressure|weather_description|
+------+-------------------+-----------+--------+--------+-------------------+
|London|2024-12-22 20:54:24|7.18       |68      |1012    |few clouds         |
+------+-------------------+-----------+--------+--------+-------------------+

Metrics Data:


                                                                                

+------+---------------+------------+
|city  |avg_temperature|avg_humidity|
+------+---------------+------------+
|London|7.18           |68.0        |
+------+---------------+------------+



                                                                                

Data stored successfully at: data/real_time_weatherData_Spark.csv
Pipeline run completed successfully for city: London
Sleeping for 10 seconds...
Running pipeline for city: London
Spark Master: local[*]
+------+-------------------+-----------+--------+--------+-------------------+
|city  |timestamp          |temperature|humidity|pressure|weather_description|
+------+-------------------+-----------+--------+--------+-------------------+
|London|2024-12-22 20:54:47|7.18       |68      |1012    |few clouds         |
+------+-------------------+-----------+--------+--------+-------------------+

Metrics Data:
+------+---------------+------------+
|city  |avg_temperature|avg_humidity|
+------+---------------+------------+
|London|7.18           |68.0        |
+------+---------------+------------+

Data stored successfully at: data/real_time_weatherData_Spark.csv
Pipeline run completed successfully for city: London
Sleeping for 10 seconds...
Pipeline stopped manually.
