<a href="https://colab.research.google.com/github/Harivamsh2005/AIML-lab/blob/main/2303A51266_(SET_A)__Q12_Batch_no_19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load the dataset.
# The file is parsed as semicolon-separated text with comma decimals, and the
# value -200 is read as missing.
data_path = 'AirQualityUCI[1].csv'  # Replace with your dataset path
air_quality_data = pd.read_csv(
    data_path,
    sep=';',
    decimal=',',
    na_values=-200,
)

# Data cleaning.
# Discard the two unnamed columns when present; errors='ignore' makes this a
# no-op if they are absent.
air_quality_data = air_quality_data.drop(
    columns=['Unnamed: 15', 'Unnamed: 16'],
    errors='ignore',
)

# Join the separate date and time text into a single timestamp column. Text
# that does not match the format becomes NaT instead of raising.
timestamp_text = air_quality_data['Date'] + ' ' + air_quality_data['Time']
air_quality_data['DateTime'] = pd.to_datetime(
    timestamp_text,
    format='%d/%m/%Y %H.%M.%S',
    errors='coerce',
)

# Keep only rows with a usable timestamp, then remove the source text columns
# that 'DateTime' now replaces.
air_quality_data = (
    air_quality_data
    .dropna(subset=['DateTime'])
    .drop(columns=['Date', 'Time'])
)

# Fill the remaining gaps in numeric columns with that column's own mean.
column_means = air_quality_data.mean(numeric_only=True)
air_quality_data = air_quality_data.fillna(column_means)

# 1. Top 5 reasons for air quality issues
# Rank the pollutant columns by their mean reading, largest first. The means
# are compared as stored, without any rescaling between columns.
pollutant_columns = ['CO(GT)', 'C6H6(GT)', 'NOx(GT)', 'NO2(GT)', 'NMHC(GT)']
pollutant_means = air_quality_data[pollutant_columns].mean()
top_pollutants = pollutant_means.sort_values(ascending=False)
print("Top 5 Pollutants Contributing to Air Quality Issues:")
print(top_pollutants.iloc[:5])

# 2. Day of the week with most air quality issues
# Label every row with its weekday name, average each pollutant per weekday,
# then average those per-pollutant means into a single score per weekday.
# The score is taken on the raw column scales, so larger-valued columns
# contribute more to it.
air_quality_data['DayOfWeek'] = air_quality_data['DateTime'].dt.day_name()
weekday_pollutants = ['CO(GT)', 'C6H6(GT)', 'NOx(GT)', 'NO2(GT)']
per_day_means = air_quality_data.groupby('DayOfWeek')[weekday_pollutants].mean()
weekday_issues = per_day_means.mean(axis=1)
worst_day = weekday_issues.idxmax()
print("\nDay of the Week with Most Air Quality Issues:", worst_day)

# 3. Max and min air quality levels
# Select the pollutant columns once and take the column-wise extremes.
level_readings = air_quality_data[['CO(GT)', 'C6H6(GT)', 'NOx(GT)', 'NO2(GT)']]
max_levels = level_readings.max()
min_levels = level_readings.min()

# Each header and its values are emitted on consecutive lines.
print("\nMaximum Air Quality Levels:", max_levels, sep="\n")
print("\nMinimum Air Quality Levels:", min_levels, sep="\n")

# 4. Highest and lowest temperatures
# Both extremes come from the 'T' column of the cleaned frame.
temperature_readings = air_quality_data['T']
highest_temp, lowest_temp = temperature_readings.max(), temperature_readings.min()
print("\nHighest Temperature:", highest_temp)
print("Lowest Temperature:", lowest_temp)

# 5. Note on educational qualification of employees
# Nothing is computed for this item; a fixed explanatory message is printed.
employee_note = "\nThe dataset doesn't include employee details or qualifications."
print(employee_note)

# 6. Clustering Model
# Cluster the rows on four pollutant columns plus the 'T', 'RH' and 'AH'
# columns of the cleaned frame.
feature_columns = ['CO(GT)', 'C6H6(GT)', 'NOx(GT)', 'NO2(GT)', 'T', 'RH', 'AH']
features = air_quality_data[feature_columns]

# Standardise every feature to zero mean and unit variance so that columns
# with large raw values do not dominate the distance computation.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# n_init is pinned explicitly. When it is omitted, the number of
# initialisations depends on the installed scikit-learn version (the default
# moved from 10 to 'auto' in 1.4, with a FutureWarning on 1.2/1.3), so the
# same notebook could yield different clusters on different installs.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(scaled_features)

# Store the label of each row next to its timestamp and show a small sample.
air_quality_data['Cluster'] = clusters
print("\nClustering Analysis Results:")
print(air_quality_data[['DateTime', 'Cluster']].head())


Top 5 Pollutants Contributing to Air Quality Issues:
NOx(GT)     246.896735
NMHC(GT)    218.811816
NO2(GT)     113.091251
C6H6(GT)     10.083105
CO(GT)        2.152750
dtype: float64

Day of the Week with Most Air Quality Issues: Friday

Maximum Air Quality Levels:
CO(GT)        11.9
C6H6(GT)      63.7
NOx(GT)     1479.0
NO2(GT)      340.0
dtype: float64

Minimum Air Quality Levels:
CO(GT)      0.1
C6H6(GT)    0.1
NOx(GT)     2.0
NO2(GT)     2.0
dtype: float64

Highest Temperature: 44.6
Lowest Temperature: -1.9

The dataset doesn't include employee details or qualifications.

Clustering Analysis Results:
             DateTime  Cluster
0 2004-03-10 18:00:00        2
1 2004-03-10 19:00:00        2
2 2004-03-10 20:00:00        2
3 2004-03-10 21:00:00        2
4 2004-03-10 22:00:00        2
