In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, max, min, countDistinct, round, when, isnan

In [2]:
spark = SparkSession.builder \
    .appName("Air Quality Analysis") \
    .getOrCreate()

In [3]:
file_path = "air_quality_data.csv"  
df = spark.read.csv(file_path, header=True, inferSchema=True)

print("Schema:")
df.printSchema()

print("\nSample data:")
df.show(5, truncate=False)

Schema:
root
 |-- country: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)
 |-- station: string (nullable = true)
 |-- last_update: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- pollutant_id: string (nullable = true)
 |-- pollutant_min: string (nullable = true)
 |-- pollutant_max: string (nullable = true)
 |-- pollutant_avg: string (nullable = true)


Sample data:
+-------+-----+----+--------------------------+-------------------+---------+---------+------------+-------------+-------------+-------------+
|country|state|city|station                   |last_update        |latitude |longitude|pollutant_id|pollutant_min|pollutant_max|pollutant_avg|
+-------+-----+----+--------------------------+-------------------+---------+---------+------------+-------------+-------------+-------------+
|India  |Bihar|Gaya|Collectorate, Gaya - BSPCB|07-10-2025 10:00:00|24.7955  |84.9994  |SO

In [4]:
df_clean = df.replace('NA', None)
df_clean = df_clean.na.drop(subset=["pollutant_avg"])

In [5]:
unique_stations = df_clean.select(countDistinct("station")).collect()[0][0]
print(f"\nNumber of unique monitoring stations: {unique_stations}")


Number of unique monitoring stations: 470


In [6]:
print("\nAverage pollutant levels by state:")
df_clean.groupBy("state").agg(
    round(avg("pollutant_avg"), 2).alias("Avg_Pollutant_Level")
).orderBy(col("Avg_Pollutant_Level").desc()).show(10)


Average pollutant levels by state:
+----------------+-------------------+
|           state|Avg_Pollutant_Level|
+----------------+-------------------+
|Himachal Pradesh|              44.71|
|       Jharkhand|               40.0|
|           Delhi|              33.92|
|     West_Bengal|               31.2|
|   Uttar_Pradesh|              30.49|
|  Andhra_Pradesh|              30.15|
|  Madhya Pradesh|              29.93|
|         Haryana|              28.65|
|          Odisha|              28.36|
|          Kerala|              28.27|
+----------------+-------------------+
only showing top 10 rows


In [7]:
print("\nMost common pollutants monitored:")
df_clean.groupBy("pollutant_id").count().orderBy(col("count").desc()).show()


Most common pollutants monitored:
+------------+-----+
|pollutant_id|count|
+------------+-----+
|          CO|  445|
|       OZONE|  436|
|         NO2|  431|
|         SO2|  419|
|        PM10|  415|
|       PM2.5|  409|
|         NH3|  371|
+------------+-----+



In [8]:
print("\nMin/Max/Average values for each pollutant:")
df_clean.groupBy("pollutant_id").agg(
    round(min("pollutant_min"), 2).alias("Min_Value"),
    round(max("pollutant_max"), 2).alias("Max_Value"),
    round(avg("pollutant_avg"), 2).alias("Avg_Value")
).orderBy(col("Avg_Value").desc()).show(10)


Min/Max/Average values for each pollutant:
+------------+---------+---------+---------+
|pollutant_id|Min_Value|Max_Value|Avg_Value|
+------------+---------+---------+---------+
|        PM10|      1.0|     99.0|    55.75|
|       PM2.5|      1.0|     99.0|    42.61|
|          CO|      1.0|     98.0|    28.93|
|         NO2|      1.0|     97.0|    20.74|
|       OZONE|      1.0|     97.0|    19.58|
|         SO2|      1.0|     96.0|    13.25|
|         NH3|      1.0|      9.0|     4.53|
+------------+---------+---------+---------+



In [9]:
print("\nTop polluted cities (based on average pollutant value):")
df_clean.groupBy("city").agg(
    round(avg("pollutant_avg"), 2).alias("City_Avg_Pollution")
).orderBy(col("City_Avg_Pollution").desc()).show(10)


Top polluted cities (based on average pollutant value):
+--------------+------------------+
|          city|City_Avg_Pollution|
+--------------+------------------+
|Chikkaballapur|              88.5|
| Gummidipoondi|              67.6|
|    Mandikhera|             56.75|
|       Dhanbad|              48.0|
|        Ratlam|              48.0|
| Visakhapatnam|             47.29|
|      Byrnihat|             46.43|
|       Nalbari|             46.43|
|        Maihar|              46.2|
|         Baddi|             44.71|
+--------------+------------------+
only showing top 10 rows


In [10]:
print("\nCritical pollutants (avg level > 50):")
df_clean.filter(col("pollutant_avg") > 50).select(
    "state", "city", "station", "pollutant_id", "pollutant_avg"
).show(10)


Critical pollutants (avg level > 50):
+--------------+------------+--------------------+------------+-------------+
|         state|        city|             station|pollutant_id|pollutant_avg|
+--------------+------------+--------------------+------------+-------------+
|         Bihar|     Bettiah|Kamalnath Nagar, ...|        PM10|           64|
|         Bihar|Bihar Sharif|D M Colony, Bihar...|        PM10|           68|
|Andhra_Pradesh|   Anantapur|Gulzarpet, Ananta...|        PM10|           59|
|         Bihar|       Arrah|New DM Office, Ar...|          CO|           57|
|Andhra_Pradesh|  Vijayawada|Rajiv Gandhi Park...|       PM2.5|           56|
|         Assam|      Nagaon|Christianpatty, N...|       PM2.5|           56|
|         Assam|     Nalbari|Bata Chowk, Nalba...|        PM10|           70|
|         Bihar|  Samastipur|DM Office_Kasipur...|       PM2.5|           83|
|         Bihar|      Rajgir|Dangi Tola, Rajgi...|        PM10|           87|
|         Delhi|       De

In [11]:
df_aqi = df_clean.withColumn(
    "AQI_Category",
    when(col("pollutant_avg") <= 30, "Good")
    .when((col("pollutant_avg") > 30) & (col("pollutant_avg") <= 60), "Moderate")
    .when((col("pollutant_avg") > 60) & (col("pollutant_avg") <= 90), "Poor")
    .otherwise("Severe")
)

In [12]:
print("\nAQI categories for each record:")
df_aqi.select("state", "city", "pollutant_id", "pollutant_avg", "AQI_Category").show(10)


AQI categories for each record:
+-----+-------+------------+-------------+------------+
|state|   city|pollutant_id|pollutant_avg|AQI_Category|
+-----+-------+------------+-------------+------------+
|Bihar|   Gaya|         SO2|           11|        Good|
|Bihar|   Gaya|       OZONE|           20|        Good|
|Bihar|   Gaya|       PM2.5|           31|    Moderate|
|Bihar|   Gaya|         NO2|            5|        Good|
|Bihar|   Gaya|          CO|           40|    Moderate|
|Bihar|   Gaya|       OZONE|            6|        Good|
|Bihar|   Gaya|        PM10|           33|    Moderate|
|Bihar|   Gaya|         NH3|            3|        Good|
|Bihar|Hajipur|         NH3|            5|        Good|
|Bihar|Hajipur|         SO2|            4|        Good|
+-----+-------+------------+-------------+------------+
only showing top 10 rows


In [14]:
spark.stop()