# 🌍 Real-Time Air Quality Prediction System
This notebook loads real-time air quality data, cleans and explores it, and trains a random forest classifier to predict the pollution level.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle

## 1. Load Dataset

In [2]:
# Location of the source CSV (a relative path, so it is resolved against the
# current working directory).
file_path = "csv_result-air_quality_realtime_large (1).csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows (head()'s default count, spelled out).
df.head(n=5)

## 2. Data Overview

In [3]:
# Quick structural summary: dimensions, column names, and per-column dtypes.
overview_items = [
    ("Shape of dataset:", df.shape),
    ("\nColumns:\n", df.columns.tolist()),
    ("\nData types:\n", df.dtypes),
]
for heading, value in overview_items:
    print(heading, value)

# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

## 3. Data Cleaning

In [4]:
# Drop every row that has at least one missing value. The explicit .copy()
# yields an independent frame, so the column assignments below cannot trigger
# pandas' SettingWithCopyWarning.
df = df.dropna().copy()

# Encode each text (object-dtype) column as integer codes. Every fitted encoder
# is kept, keyed by column name, so codes can be mapped back to the original
# labels later via label_encoders[col].inverse_transform(...).
# NOTE: re-running this cell on its own finds no text columns left and resets
# label_encoders to an empty dict; re-run from the data-loading cell instead.
label_encoders = {}
for col in df.select_dtypes(include=["object"]).columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

df.head()

## 4. Exploratory Data Analysis (EDA)

In [5]:
# Heatmap of the pairwise correlations between all columns, drawn on an
# explicitly created Axes instead of the implicit "current" one.
fig, ax = plt.subplots(figsize=(10, 6))
correlations = df.corr()
sns.heatmap(correlations, annot=False, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [6]:
# Histogram (30 bins) with a KDE overlay for whichever column comes first
# in the frame.
fig, ax = plt.subplots(figsize=(8, 5))
first_column = df.columns[0]
sns.histplot(df[first_column], bins=30, kde=True, ax=ax)
ax.set_title("Distribution of First Feature")
plt.show()

## 5. Define Features & Target

In [7]:
# Assuming the last column is the target and every other column is a feature
# (update the slices if the dataset is laid out differently).
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split BEFORE scaling: the scaler must learn its mean/std from the training
# rows only, otherwise test-set statistics leak into training and the reported
# scores are optimistic. With the same random_state the rows assigned to each
# side are the same as when splitting the pre-scaled array.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize features using statistics from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the same (train-fitted) scaling; kept so any later
# cell that still reads X_scaled continues to work.
X_scaled = scaler.transform(X)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

## 6. Train Machine Learning Model

In [8]:
# Fit a 100-tree random forest; the fixed random_state keeps runs repeatable.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Pin the label order explicitly: the sorted union of true and predicted
# classes, which is the order confusion_matrix uses when labels is omitted.
# Passing it to the heatmap makes the ticks show the actual class values
# instead of bare row/column positions.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

## 7. Save Trained Model

In [9]:
# Persist the trained classifier (file name and pickled object are unchanged).
with open("air_quality_model.pkl", "wb") as f:
    pickle.dump(model, f)
print("Model saved as air_quality_model.pkl")

# The classifier was trained on standardized features, so new data must go
# through the same fitted scaler before model.predict. Save it next to the
# model so the two can be loaded together.
with open("air_quality_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)
print("Scaler saved as air_quality_scaler.pkl")