In [54]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, IndexToString
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col, when

In [33]:
spark = SparkSession.builder.appName("AirQualityAnalysisIndia").getOrCreate()

In [34]:
df = spark.read.csv('file:///home/karthikeya/Desktop/sem5/MIT_SEM5_BDA/MiniProject/datasets/air_quality_data/city_day.csv', header=True, inferSchema=True)

In [35]:
df.show(5)

+---------+----------+----+----+----+-----+-----+----+----+-----+------+-------+-------+------+----+----------+
|     City|      Date|PM25|PM10|  NO|  NO2|  NOx| NH3|  CO|  SO2|    O3|Benzene|Toluene|Xylene| AQI|AQI_Bucket|
+---------+----------+----+----+----+-----+-----+----+----+-----+------+-------+-------+------+----+----------+
|Ahmedabad|2015-01-01|NULL|NULL|0.92|18.22|17.15|NULL|0.92|27.64|133.36|    0.0|   0.02|   0.0|NULL|      NULL|
|Ahmedabad|2015-01-02|NULL|NULL|0.97|15.69|16.46|NULL|0.97|24.55| 34.06|   3.68|    5.5|  3.77|NULL|      NULL|
|Ahmedabad|2015-01-03|NULL|NULL|17.4| 19.3| 29.7|NULL|17.4|29.07|  30.7|    6.8|   16.4|  2.25|NULL|      NULL|
|Ahmedabad|2015-01-04|NULL|NULL| 1.7|18.48|17.97|NULL| 1.7|18.59| 36.08|   4.43|  10.14|   1.0|NULL|      NULL|
|Ahmedabad|2015-01-05|NULL|NULL|22.1|21.42|37.76|NULL|22.1|39.33| 39.31|   7.01|  18.89|  2.78|NULL|      NULL|
+---------+----------+----+----+----+-----+-----+----+----+-----+------+-------+-------+------+----+----

In [36]:
df.groupBy('AQI_Bucket').count().show()

+------------+-----+
|  AQI_Bucket|count|
+------------+-----+
|        NULL| 4681|
|        Good| 1341|
|      Severe| 1338|
|   Very Poor| 2337|
|Satisfactory| 8224|
|        Poor| 2781|
|    Moderate| 8829|
+------------+-----+



In [37]:
df = df.na.drop(how='any', subset=['AQI'])

In [38]:
df = df.drop('City', 'Date', 'AQI')

In [39]:
indexer = StringIndexer(inputCol="AQI_Bucket", outputCol="label")
indexer_model = indexer.fit(df)
df = indexer_model.transform(df)

In [40]:
feature_columns = [col for col in df.columns if col not in ['AQI_Bucket', 'label', 'Xylene']]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features", handleInvalid='skip')
df = assembler.transform(df)

In [64]:
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

In [65]:
rf = RandomForestClassifier(featuresCol="features", labelCol="label")

In [66]:
paramGrid = (ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 50, 100])
    .addGrid(rf.maxDepth, [5, 10, 15])
    .addGrid(rf.maxBins, [32, 64])
    .addGrid(rf.impurity, ['gini', 'entropy'])
    .build())

evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy"
)

cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3  # or 5 for more reliability
)

In [67]:
cvModel = cv.fit(train_data)

# Best model
best_model = cvModel.bestModel
print("Best numTrees:", best_model.getNumTrees)
print("Best maxDepth:", best_model.getOrDefault("maxDepth"))
print("Best impurity:", best_model.getOrDefault("impurity"))

25/10/25 15:21:12 WARN DAGScheduler: Broadcasting large task binary with size 1040.0 KiB
25/10/25 15:21:14 WARN DAGScheduler: Broadcasting large task binary with size 1123.3 KiB
25/10/25 15:21:17 WARN DAGScheduler: Broadcasting large task binary with size 1073.1 KiB
25/10/25 15:21:18 WARN DAGScheduler: Broadcasting large task binary with size 1040.0 KiB
25/10/25 15:21:18 WARN DAGScheduler: Broadcasting large task binary with size 1373.7 KiB
25/10/25 15:21:19 WARN DAGScheduler: Broadcasting large task binary with size 1699.3 KiB
25/10/25 15:21:19 WARN DAGScheduler: Broadcasting large task binary with size 1987.8 KiB
25/10/25 15:21:19 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
25/10/25 15:21:19 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
25/10/25 15:21:20 WARN DAGScheduler: Broadcasting large task binary with size 1601.9 KiB
25/10/25 15:21:21 WARN DAGScheduler: Broadcasting large task binary with size 1123.3 KiB
25/10/25 15:21:21 WARN DAGS

Best numTrees: 100
Best maxDepth: 15
Best impurity: gini


In [68]:
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100, maxDepth=15, impurity='gini', weightCol='classWeightCol', seed=819)
best_model = rf.fit(train_data)

IllegalArgumentException: [FIELD_NOT_FOUND] No such struct field `classWeightCol` in `PM25`, `PM10`, `NO`, `NO2`, `NOx`, `NH3`, `CO`, `SO2`, `O3`, `Benzene`, `Toluene`, `Xylene`, `AQI_Bucket`, `label`, `features`. SQLSTATE: 42704

In [57]:
predictions = best_model.transform(test_data)
accuracy = evaluator.evaluate(predictions)
print("Test Accuracy:", accuracy)

25/10/25 15:20:26 WARN DAGScheduler: Broadcasting large task binary with size 18.4 MiB


Test Accuracy: 0.8280718795531812


                                                                                

In [58]:
label_converter = IndexToString(inputCol="prediction", outputCol="predicted_AQI_Bucket", labels=indexer_model.labels)
predictions = label_converter.transform(predictions)

predictions.select("predicted_AQI_Bucket", "AQI_Bucket", "features").show()

25/10/25 15:20:30 WARN DAGScheduler: Broadcasting large task binary with size 18.4 MiB


+--------------------+------------+--------------------+
|predicted_AQI_Bucket|  AQI_Bucket|            features|
+--------------------+------------+--------------------+
|                Good|        Good|[1.25,9.1,11.7,0....|
|                Good|        Good|[2.0,17.99,1.29,1...|
|                Good|        Good|[2.33,6.33,11.68,...|
|                Good|        Good|[3.43,7.24,11.61,...|
|                Good|        Good|[3.93,6.85,11.51,...|
|                Good|        Good|[4.49,8.9,11.7,0....|
|        Satisfactory|Satisfactory|[5.08,8.16,13.55,...|
|                Good|        Good|[5.48,9.88,11.53,...|
|                Good|        Good|[6.15,8.99,13.16,...|
|                Good|        Good|[6.24,12.61,2.92,...|
|                Good|        Good|[6.3,17.76,1.09,1...|
|                Good|        Good|[6.32,11.92,11.56...|
|        Satisfactory|Satisfactory|[6.48,22.89,7.13,...|
|                Good|        Good|[6.56,8.7,0.29,0....|
|                Good|Satisfact

In [59]:
df_pred_year = spark.read.parquet('file:///home/karthikeya/Desktop/sem5/MIT_SEM5_BDA/MiniProject/output/forecast_results/')

In [60]:
df_pred_year.show(5)

+---------+-------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+
|     City|               Date|           Benzene|                CO|               NH3|               NO|               NO2|               NOx|                O3|              PM10|             PM25|              SO2|          Toluene|
+---------+-------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+
|Hyderabad|2016-01-28 00:00:00|1.0161405821377438|1.0011925791875236|15.587850311767323|8.207206423064044| 19.73004805057345|21.038202849003085| 32.83710826900987| 110.6911918463562|72.03084284137776| 6.18497614854868| 5.74130553027497|
|Hyderabad|2016-01-29 00:00:00|0.9669563252322121|0.

In [61]:
feature_columns = [col for col in df_pred_year.columns if col not in ['City','Date']]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features", handleInvalid='skip')
df_pred_year = assembler.transform(df_pred_year)

In [62]:
predictions = best_model.transform(df_pred_year)
label_converter = IndexToString(inputCol="prediction", outputCol="predicted_AQI_Bucket", labels=indexer_model.labels)
predictions = label_converter.transform(predictions)

In [63]:
predictions.groupBy('predicted_AQI_Bucket').count().show()

25/10/25 15:20:35 WARN DAGScheduler: Broadcasting large task binary with size 18.4 MiB

+--------------------+-----+
|predicted_AQI_Bucket|count|
+--------------------+-----+
|                Good|  908|
|        Satisfactory| 5409|
|            Moderate|32704|
+--------------------+-----+



25/10/25 15:20:37 WARN DAGScheduler: Broadcasting large task binary with size 18.4 MiB
                                                                                

In [30]:
predictions.write.mode("overwrite").parquet("file:///home/karthikeya/Desktop/sem5/MIT_SEM5_BDA/MiniProject/output/aqi_forecast/")

25/10/25 15:11:02 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

In [31]:
spark.stop()