In [1]:
import os
# Show the kernel's current working directory
working_dir = os.getcwd()
print(working_dir)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Expanded dataset with major Indian cities (one reading per city, all dated 2024-01-01)
def aqi_bucket(aqi):
    """Return the AQI category label for a numeric AQI value.

    Bands follow India's National Air Quality Index scale:
    0-50 Good, 51-100 Satisfactory, 101-200 Moderate, 201-300 Poor,
    301-400 Very Poor, anything above 400 Severe.

    Parameters
    ----------
    aqi : int or float
        AQI reading to classify.

    Returns
    -------
    str
        One of 'Good', 'Satisfactory', 'Moderate', 'Poor', 'Very Poor', 'Severe'.
    """
    # (inclusive upper bound, label) pairs in ascending order
    upper_bounds = (
        (50, 'Good'),
        (100, 'Satisfactory'),
        (200, 'Moderate'),
        (300, 'Poor'),
        (400, 'Very Poor'),
    )
    for upper, label in upper_bounds:
        if aqi <= upper:
            return label
    return 'Severe'


data = {
    'City': [
        'Delhi','Mumbai','Chennai','Kolkata','Bangalore','Hyderabad','Pune','Ahmedabad',
        'Jaipur','Lucknow','Kanpur','Nagpur','Visakhapatnam','Indore','Patna','Bhopal',
        'Agra','Varanasi','Amritsar','Chandigarh','Coimbatore','Guwahati','Ranchi',
        'Raipur','Surat','Vadodara','Ludhiana','Nashik','Mangalore','Mysore'
    ],
    'Date': ['2024-01-01']*30,
    'PM2.5': [153, 70, 174, 477, 171, 132, 88, 119, 140, 160, 172, 130, 98, 122, 190, 111,
              175, 210, 199, 105, 82, 144, 155, 166, 100, 115, 123, 90, 78, 65],
    'PM10': [242, 313, 275, 544, 118, 221, 180, 260, 245, 280, 299, 240, 205, 230, 320, 210,
              275, 350, 300, 190, 155, 245, 260, 275, 200, 225, 210, 175, 165, 145],
    'NO2': [33, 42, 69, 76, 12, 30, 25, 40, 38, 44, 50, 29, 33, 39, 60, 27,
            58, 72, 66, 22, 19, 40, 42, 44, 31, 35, 38, 28, 20, 18],
    'CO': [1.8, 7.2, 8.5, 2.4, 1.2, 3.0, 2.8, 4.0, 3.6, 3.9, 4.1, 2.5, 2.7, 3.2, 5.5, 2.2,
           5.0, 6.0, 5.8, 2.1, 1.9, 3.8, 4.0, 4.2, 2.9, 3.3, 3.5, 2.4, 2.0, 1.7],
    'AQI': [205, 61, 486, 174, 490, 301, 195, 220, 245, 280, 310, 230, 200, 225, 390, 240,
            310, 420, 400, 190, 160, 250, 270, 290, 210, 235, 240, 180, 170, 150],
}

# Derive the category from the AQI value instead of typing labels by hand,
# so the 'AQI' and 'AQI_Bucket' columns can never disagree with each other.
# Added last so it stays the final column of the frame.
data['AQI_Bucket'] = [aqi_bucket(value) for value in data['AQI']]

df = pd.DataFrame(data)
print(df.head())
print("\nTotal cities in dataset:", len(df))

In [4]:
# Quick summary: column dtypes, missing-value counts, and descriptive statistics
summary_sections = [
    ("\nData Types:\n", df.dtypes),
    ("\nMissing Values:\n", df.isnull().sum()),
    ("\nBasic Statistics:\n", df.describe()),
]
for heading, section in summary_sections:
    print(heading, section)

In [5]:
# Visualization: bar chart of PM2.5 for every city
pm25_ax = df.plot.bar(
    x='City',
    y='PM2.5',
    title='PM2.5 Levels by City',
    color='skyblue',
    figsize=(12,6),
)
pm25_ax.set_ylabel('PM2.5 Level')
plt.xticks(rotation=90)  # city names are long; keep them vertical
plt.show()

# Correlation heatmap across the pollutant readings and AQI
numeric_cols = ['PM2.5','PM10','NO2','CO','AQI']
pollutant_corr = df[numeric_cols].corr()

plt.figure(figsize=(6,4))
sns.heatmap(pollutant_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [6]:
# Model Training: fit a linear model that maps pollutant readings to AQI
feature_cols = ['PM2.5','PM10','NO2','CO']
X = df[feature_cols]
y = df['AQI']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions on the held-out rows
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)
print("MSE:", test_mse)
print("R^2 Score:", test_r2)

In [7]:
# Prediction Example: score two unseen pollutant readings with the fitted model.
# One record per sample; keys are listed in the same order as the training features.
new_samples = [
    {'PM2.5': 120, 'PM10': 200, 'NO2': 40, 'CO': 2.0},
    {'PM2.5': 300, 'PM10': 500, 'NO2': 80, 'CO': 5.0},
]
new_data = pd.DataFrame(new_samples)

predictions = model.predict(new_data)
print("Predicted AQI for new samples:", predictions)