In [15]:
from google.colab import files
files.upload()   # Upload kaggle.json


Saving kaggle.json to kaggle (4).json


{'kaggle (4).json': b'{"username":"guru1613","key":"0ee88e15b9e2383756f6068cd9c0104f"}'}

In [16]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d rohanrao/air-quality-data-in-india
!unzip air-quality-data-in-india.zip


Dataset URL: https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india
License(s): CC0-1.0
air-quality-data-in-india.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  air-quality-data-in-india.zip
replace city_day.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: no
replace city_hour.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: no
replace station_day.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: no
replace station_hour.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: no
replace stations.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: no


In [17]:
import pandas as pd
import numpy as np

df = pd.read_csv("city_day.csv")

# Clean NaN values
df = df.dropna(subset=['AQI', 'PM2.5', 'PM10'])

# Select important features
features = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3']
df = df.dropna(subset=features)

X = df[features]
y = df['AQI']


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import joblib

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=300, random_state=42)
model.fit(X_train, y_train)

preds = model.predict(X_test)
print("R2 Score:", r2_score(y_test, preds))


In [None]:
joblib.dump(model, "aqi_model.pkl")
files.download("aqi_model.pkl")


In [None]:
import numpy as np
import matplotlib.pyplot as plt

importance = model.feature_importances_
feature_labels = X.columns

plt.figure(figsize=(8,5))
plt.bar(feature_labels, importance)
plt.title("Pollution Source Contribution")
plt.ylabel("Importance Score")
plt.show()


In [None]:
import pandas as pd

df = pd.read_csv("city_day.csv")

# Select only Delhi records
delhi = df[df['City'] == 'Delhi'].copy()

# Convert date to proper datetime
delhi['Date'] = pd.to_datetime(delhi['Date'])

# Keep columns needed for time-series forecast
delhi = delhi[['Date', 'AQI']].dropna()

delhi = delhi.sort_values('Date')
delhi.head()


In [26]:
delhi = delhi.set_index('Date')


In [27]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(delhi[['AQI']])


In [28]:
import numpy as np

def create_sequences(data, seq_length):
    X = []
    y = []
    for i in range(seq_length, len(data)):
        X.append(data[i-seq_length:i])
        y.append(data[i])
    return np.array(X), np.array(y)

SEQ_LEN = 30  # Use last 30 days to predict next day
X, y = create_sequences(scaled_data, SEQ_LEN)

# Reshape for LSTM: (samples, time_steps, features)
X = X.reshape(X.shape[0], X.shape[1], 1)


In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

model_lstm = Sequential([
    LSTM(64, return_sequences=True, input_shape=(SEQ_LEN, 1)),
    LSTM(32),
    Dense(1)
])

model_lstm.compile(optimizer='adam', loss='mse')
history = model_lstm.fit(X, y, epochs=15, batch_size=32, verbose=1)


  super().__init__(**kwargs)


Epoch 1/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - loss: 0.0393
Epoch 2/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.0110
Epoch 3/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.0111
Epoch 4/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.0100
Epoch 5/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 0.0096
Epoch 6/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 0.0093
Epoch 7/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.0079
Epoch 8/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.0077
Epoch 9/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.0069
Epoch 10/15
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.0064
Epoch

In [30]:
last_30 = scaled_data[-SEQ_LEN:].reshape(1, SEQ_LEN, 1)
preds_scaled = model_lstm.predict(last_30)

predicted_aqi = scaler.inverse_transform(preds_scaled)[0][0]
print("Predicted AQI for tomorrow:", predicted_aqi)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190ms/step
Predicted AQI for tomorrow: 117.962166


In [None]:
import joblib
model_lstm.save("delhi_lstm_forecast.h5")
joblib.dump(scaler, "aqi_scaler.pkl")

from google.colab import files
files.download("delhi_lstm_forecast.h5")
files.download("aqi_scaler.pkl")


In [31]:
!pip install statsmodels




In [32]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX

df = pd.read_csv("city_day.csv")
delhi = df[df['City']=="Delhi"].dropna(subset=["AQI"]).sort_values("Date")
delhi['Date'] = pd.to_datetime(delhi['Date'])
delhi.set_index('Date', inplace=True)

aqi_series = delhi['AQI']


In [33]:
model_sarimax = SARIMAX(aqi_series, order=(2,1,2), seasonal_order=(1,1,1,7))
model_sarimax_fit = model_sarimax.fit(disp=False)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [34]:
forecast = model_sarimax_fit.forecast(steps=3)
print("Next 3 days AQI forecast:")
print(forecast)


Next 3 days AQI forecast:
1999    115.687520
2000    118.638450
2001    117.849051
Name: predicted_mean, dtype: float64


  return get_prediction_index(
  return get_prediction_index(


In [35]:
import joblib
joblib.dump(model_sarimax_fit, "delhi_aqi_forecast_sarimax.pkl")
files.download("delhi_aqi_forecast_sarimax.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>