In [16]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.functions import expr
from prophet import Prophet
import pandas as pd

In [17]:
spark = SparkSession.builder.appName('AirQualityAnalysisIndia').getOrCreate()

In [18]:
df = spark.read.parquet('file:///home/karthikeya/Desktop/sem5/MIT_SEM5_BDA/MiniProject/processed/enriched_air_quality.parquet')

In [19]:
pollutants = ["PM25", "PM10", "NO", "NO2", "NOx", "NH3", "CO", "SO2", "O3", "Benzene", "Toluene"]

stack_expr = "stack({}, {}) as (Pollutant, Value)".format(
    len(pollutants),
    ", ".join([f"'{p}', {p}" for p in pollutants])
)

In [20]:
df_long = df.selectExpr("City", "Date", stack_expr)

In [6]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from prophet import Prophet
from pyspark.sql.functions import pandas_udf, PandasUDFType

# Define Pandas UDF with plotting
@pandas_udf("City string, Pollutant string, ds timestamp, yhat double, yhat_lower double, yhat_upper double", PandasUDFType.GROUPED_MAP)
def forecast_city_pollutant(pdf: pd.DataFrame) -> pd.DataFrame:
    city = pdf["City"].iloc[0]
    pollutant = pdf["Pollutant"].iloc[0]

    pdf = pdf.rename(columns={"Date": "ds", "Value": "y"})
    pdf["ds"] = pd.to_datetime(pdf["ds"])

    model = Prophet()
    model.fit(pdf[["ds", "y"]])

    future = model.make_future_dataframe(periods=365)
    forecast = model.predict(future)[["ds", "yhat", "yhat_lower", "yhat_upper"]]

    fig = model.plot(forecast)
    plt.title(f"{city} - {pollutant}", fontsize=14)

    folder_path = f"Graphs/Prophet/{city}"
    os.makedirs(folder_path, exist_ok=True)
    plt.savefig(f"{folder_path}/{pollutant}.jpg", bbox_inches="tight")
    plt.close(fig)

    forecast["City"] = city
    forecast["Pollutant"] = pollutant
    return forecast[["City", "Pollutant", "ds", "yhat", "yhat_lower", "yhat_upper"]]


In [7]:
forecast_df = df_long.groupBy("City", "Pollutant").apply(forecast_city_pollutant)



In [8]:
forecast_df = forecast_df.withColumnRenamed("ds", "Date").withColumnRenamed("yhat", "Value")
wide_df = (forecast_df.groupBy("City", "Date").pivot("Pollutant").agg(F.first("Value")).orderBy("City", "Date"))

# wide_df.show()

09:21:00 - cmdstanpy - INFO - Chain [1] start processing            (0 + 3) / 3]
09:21:00 - cmdstanpy - INFO - Chain [1] start processing
09:21:00 - cmdstanpy - INFO - Chain [1] start processing
09:21:01 - cmdstanpy - INFO - Chain [1] done processing
09:21:01 - cmdstanpy - INFO - Chain [1] done processing
09:21:01 - cmdstanpy - INFO - Chain [1] done processing
09:21:01 - cmdstanpy - INFO - Chain [1] start processing
09:21:02 - cmdstanpy - INFO - Chain [1] start processing
09:21:02 - cmdstanpy - INFO - Chain [1] done processing
09:21:02 - cmdstanpy - INFO - Chain [1] done processing
09:21:02 - cmdstanpy - INFO - Chain [1] start processing
09:21:02 - cmdstanpy - INFO - Chain [1] done processing
09:21:02 - cmdstanpy - INFO - Chain [1] start processing
09:21:02 - cmdstanpy - INFO - Chain [1] start processing
09:21:02 - cmdstanpy - INFO - Chain [1] done processing
09:21:03 - cmdstanpy - INFO - Chain [1] done processing
09:21:03 - cmdstanpy - INFO - Chain [1] start processing
09:21:03 - cmds

+---------+-------------------+------------------+------------------+-----+------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+
|     City|               Date|           Benzene|                CO|  NH3|                NO|               NO2|               NOx|                O3|             PM10|             PM25|               SO2|           Toluene|
+---------+-------------------+------------------+------------------+-----+------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+
|Ahmedabad|2015-01-01 00:00:00| 5.457270290186635| 18.95762964360299|15.85|17.590512007565735|24.712394960284083| 34.78856601162305| 41.82707758377222|97.55185731767993|88.99565153126326| 53.79634005837919|12.359103805117122|
|Ahmedabad|2015-01-02 00:00:00|  5.34752534309662|17.977241500761778|15.85|16.736985562907712|25

                                                                                

In [9]:
wide_df.write.mode("overwrite").parquet("file:///home/karthikeya/Desktop/sem5/MIT_SEM5_BDA/MiniProject/output/forecast_results/")

09:23:32 - cmdstanpy - INFO - Chain [1] start processing
09:23:32 - cmdstanpy - INFO - Chain [1] start processing
09:23:32 - cmdstanpy - INFO - Chain [1] start processing
09:23:32 - cmdstanpy - INFO - Chain [1] done processing             (0 + 3) / 3]
09:23:32 - cmdstanpy - INFO - Chain [1] done processing
09:23:32 - cmdstanpy - INFO - Chain [1] done processing
09:23:33 - cmdstanpy - INFO - Chain [1] start processing
09:23:33 - cmdstanpy - INFO - Chain [1] start processing
09:23:33 - cmdstanpy - INFO - Chain [1] done processing
09:23:33 - cmdstanpy - INFO - Chain [1] done processing
09:23:33 - cmdstanpy - INFO - Chain [1] start processing
09:23:34 - cmdstanpy - INFO - Chain [1] done processing
09:23:34 - cmdstanpy - INFO - Chain [1] start processing
09:23:34 - cmdstanpy - INFO - Chain [1] done processing
09:23:34 - cmdstanpy - INFO - Chain [1] start processing
09:23:35 - cmdstanpy - INFO - Chain [1] done processing
09:23:35 - cmdstanpy - INFO - Chain [1] start processing
09:23:35 - cmd

In [21]:
spark.stop()