##### **Connecting to the storage account and container to access files**

In [40]:
# connecting to storage account
import os

storage_account_name = "harshshahstorage"

# The account key is a credential and must never be stored in the notebook itself.
# It is read from the environment instead (a platform secret store is an equally
# valid source). Any key that was ever committed in this cell should be treated
# as leaked and rotated.
storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
if not storage_account_key:
    raise RuntimeError(
        "AZURE_STORAGE_ACCOUNT_KEY is not set; provide the storage account key "
        "through the environment before running this notebook."
    )

# Register the key with the Spark session so abfss:// paths on this account can be read.
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net",
    storage_account_key,
)

In [42]:
# Access to ADLS container: load the raw (Bronze) JSON folders straight from ADLS Gen2.
container_name = "harshcontainer"
air_pollution_folder_path = "Bronze/AirPollutionData/2.5/air_pollution"  # Folder path
historical_weather_folder_path = "Bronze/WeatherAPIData/2.5/history"

# Both folders live in the same container, so build the abfss:// root only once.
adls_root = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net"

# Pointing the JSON reader at a folder reads every JSON file inside it.
json_reader = spark.read
air_pollution_df = json_reader.json(f"{adls_root}/{air_pollution_folder_path}")
historical_weather_df = json_reader.json(f"{adls_root}/{historical_weather_folder_path}")

In [43]:
# Preview the air pollution and weather DataFrames.
# show() prints explicitly, so both previews appear in the cell output; with two bare
# head() expressions the notebook only renders the last one and the first is lost.
air_pollution_df.show(5)
historical_weather_df.show(5)

In [44]:
# Print the inferred schema of each raw DataFrame (air pollution first, then weather).
for raw_frame in (air_pollution_df, historical_weather_df):
    raw_frame.printSchema()

##### **Flattening the nested JSON into a well-structured tabular format for later analysis**

In [45]:
from pyspark.sql.functions import col, explode, from_unixtime

# Explode the "list" array so that every entry becomes its own row.
air_pollution_exploded = air_pollution_df.withColumn("list", explode(col("list")))

# Source field under list.components -> output column name, in output order.
component_aliases = {
    "co": "CO",
    "nh3": "NH3",
    "no": "NO",
    "no2": "NO2",
    "o3": "O3",
    "pm10": "PM10",
    "pm2_5": "PM2.5",
    "so2": "SO2",
}

# Flat projection: coordinates, reading time, AQI, then one column per pollutant.
air_pollution_flatten = air_pollution_exploded.select(
    col("coord.lat").alias("latitude"),
    col("coord.lon").alias("longitude"),
    from_unixtime(col("list.dt")).alias("date"),
    col("list.main.aqi").alias("AQI"),
    *[
        col(f"list.components.{field}").alias(label)
        for field, label in component_aliases.items()
    ],
)

# Quick look at the flattened result.
air_pollution_flatten.show(5)


In [46]:
# Explode the "list" array so that every entry becomes its own row.
weather_exploded = historical_weather_df.withColumn("list", explode(col("list")))

# Only the first element of the nested "weather" array is used for the condition columns.
first_weather_entry = col("list.weather").getItem(0)

# Flat projection of the fields needed for the analysis.
weather_flatten = weather_exploded.select(
    col("city_id").alias("city_id"),
    from_unixtime(col("list.dt")).alias("date"),
    # list.main.* measurements
    col("list.main.temp").alias("temperature"),
    col("list.main.feels_like").alias("feels_like"),
    col("list.main.temp_min").alias("temp_min"),
    col("list.main.temp_max").alias("temp_max"),
    col("list.main.humidity").alias("humidity"),
    col("list.main.pressure").alias("pressure"),
    # wind and cloud fields
    col("list.wind.speed").alias("wind_speed"),
    col("list.wind.deg").alias("wind_direction"),
    col("list.clouds.all").alias("cloud_coverage"),
    # condition summary taken from the first weather entry
    first_weather_entry.getField("main").alias("weather_condition"),
    first_weather_entry.getField("description").alias("weather_description"),
)

# Quick look at the flattened result.
weather_flatten.show(5)


##### **Converting to Pandas DF and checking if the data is well structured**

In [47]:
# Collect both flattened Spark DataFrames to the driver as pandas DataFrames
# so they can be plotted with matplotlib / seaborn below.
air_pollution_pd_df = air_pollution_flatten.toPandas()
weather_pd_df = weather_flatten.toPandas()

In [49]:
# Preview the first 10 rows of the weather pandas DataFrame to sanity-check the flattened columns.
weather_pd_df.head(10)

In [50]:
# Preview the first 10 rows of the air pollution pandas DataFrame to sanity-check the flattened columns.
air_pollution_pd_df.head(10)

##### **Air Pollution Data Analysis**

In [51]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the Air Quality Index: histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(air_pollution_pd_df["AQI"], bins=20, kde=True, color="red", ax=ax)
ax.set_title("Distribution of AQI Levels")
ax.set_xlabel("AQI")
ax.set_ylabel("Frequency")
plt.show()

In [59]:
# Carbon monoxide concentration against AQI level, one point per reading.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=air_pollution_pd_df,
    x="CO",
    y="AQI",
    alpha=0.6,
    color="purple",
    ax=ax,
)
ax.set_title("Scatter Plot: AQI vs CO Levels")
ax.set_xlabel("CO Concentration (µg/m³)")
ax.set_ylabel("AQI")
plt.show()

In [60]:
# Average concentration of each pollutant across all readings.
pollutants = ["CO", "NO", "NO2", "O3", "PM10", "PM2.5", "SO2", "NH3"]

# Column-wise mean; the resulting Series is indexed by pollutant name.
avg_pollution = air_pollution_pd_df[pollutants].mean()

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=avg_pollution.index,
    y=avg_pollution.values,
    palette="viridis",
    ax=ax,
)
ax.set_title("Average Concentration of Different Pollutants")
ax.set_xlabel("Pollutants")
ax.set_ylabel("Average Concentration (µg/m³)")
plt.show()

##### **Historical Weather Data Analysis**

In [52]:
# Temperature distribution: histogram with a KDE overlay.
# NOTE(review): the axis label says °F — confirm the unit the source data was requested in.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(weather_pd_df["temperature"], bins=30, kde=True, color="royalblue", ax=ax)
ax.set_title("Distribution of Temperature")
ax.set_xlabel("Temperature (°F)")
ax.set_ylabel("Frequency")
plt.show()

In [53]:
# Temperature against humidity, one point per reading.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=weather_pd_df,
    x="temperature",
    y="humidity",
    alpha=0.6,
    color="green",
    ax=ax,
)
ax.set_title("Scatter Plot: Temperature vs Humidity")
ax.set_xlabel("Temperature (°F)")
ax.set_ylabel("Humidity (%)")
plt.show()

In [54]:
# Spread of temperature readings (median, quartiles, outliers) as a box plot.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=weather_pd_df, y="temperature", color="orange", ax=ax)
ax.set_title("Box Plot of Temperature")
ax.set_ylabel("Temperature (°F)")
plt.show()

In [56]:
import pandas as pd

# Monthly weather trends

# Parse the timestamp strings so the frame can be resampled by calendar month.
weather_pd_df["date"] = pd.to_datetime(weather_pd_df["date"])

# Average only the numeric measurements. The frame also carries non-numeric columns
# (e.g. weather_condition / weather_description), and pandas >= 2.0 raises a TypeError
# when .mean() meets them instead of silently dropping them. Selecting numeric columns
# up front behaves the same on old and new pandas.
weather_monthly = (
    weather_pd_df
    .set_index("date")
    .select_dtypes(include="number")
    .resample("M")
    .mean()
)
weather_monthly["month"] = weather_monthly.index.strftime("%b")  # 'Jan', 'Feb', etc.

# Plot the monthly mean temperature, labelling each tick with its month abbreviation.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    weather_monthly.index,
    weather_monthly["temperature"],
    marker="o",
    linestyle="-",
    color="red",
    alpha=0.7,
)
ax.set_xticks(weather_monthly.index)
ax.set_xticklabels(weather_monthly["month"], rotation=45)
ax.set_title("Monthly Average Temperature Trend")
ax.set_xlabel("Month")
ax.set_ylabel("Average Temperature (°F)")
ax.grid(True)
plt.show()

In [57]:
# Correlation heatmap

# Correlation is only defined for numeric columns. The frame also holds non-numeric
# columns (dates, weather condition text), and pandas >= 2.0 raises on them instead of
# silently skipping them, so restrict the input to numeric columns explicitly.
weather_numeric = weather_pd_df.select_dtypes(include="number")

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(weather_numeric.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap of Weather Variables")
plt.show()

In [75]:
# Most common weather conditions, shown as shares of a pie chart.

# Number of readings per weather condition, most frequent first.
weather_counts = weather_pd_df["weather_condition"].value_counts()

# Keep only conditions seen at least 100 times so rare ones do not clutter the chart.
filtered_weather_counts = weather_counts.loc[weather_counts.ge(100)]

fig, ax = plt.subplots(figsize=(8, 5))
pie_style = {
    "labels": filtered_weather_counts.index,
    "autopct": "%1.1f%%",
    "colors": sns.color_palette("pastel"),
    "pctdistance": 0.75,
    "startangle": 140,
    "wedgeprops": {"edgecolor": "black", "linewidth": 1},
}
ax.pie(filtered_weather_counts, **pie_style)
ax.set_title("Most Common Weather Conditions Distribution")
plt.show()
