In [8]:
# -------------------------------
# Air Quality Dataset Preprocessing
# -------------------------------

import pandas as pd
import numpy as np

# Load the dataset
file_path = r"air_quality_dataset.csv"  # <-- Change path if needed
df = pd.read_csv(file_path)

print("Initial Data Overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nFirst 5 rows of data:")
print(df.head())

# -------------------------------
# Step 1: Handle Date Columns
# -------------------------------
# Convert DATEOFF / DATEON to datetimes when they are present; entries that
# cannot be parsed are coerced to NaT instead of raising.
for date_col in ('DATEOFF', 'DATEON'):
    if date_col not in df.columns:
        continue
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# -------------------------------
# Step 2: Handle Missing Values
# -------------------------------
# Columns that hold numbers stored as text load with dtype `object`. Left as-is
# they would be classified as categorical and mode-filled instead of mean-filled,
# and any non-numeric placeholder text in them would never be seen as missing.
# Recover them first: a text column is converted only when at least
# MIN_NUMERIC_SHARE of its non-null entries parse as numbers, so identifier-like
# text columns are left untouched. Entries that fail to parse become NaN and are
# imputed by the mean fill below.
MIN_NUMERIC_SHARE = 0.5
for col in df.select_dtypes(include=['object']).columns:
    present = df[col].notna().sum()
    parsed = pd.to_numeric(df[col], errors='coerce')
    if present > 0 and parsed.notna().sum() >= MIN_NUMERIC_SHARE * present:
        df[col] = parsed

numeric_cols = df.select_dtypes(include=[np.number]).columns
categorical_cols = df.select_dtypes(exclude=[np.number, 'datetime']).columns

# Fill numeric columns with their mean
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

# Fill categorical columns with their mode (skipped for all-null columns,
# which have no mode to fill with)
for col in categorical_cols:
    if df[col].notna().any():
        df[col] = df[col].fillna(df[col].mode()[0])

# -------------------------------
# Step 3: Remove Duplicates
# -------------------------------
# Drop rows that are exact copies of an earlier row.
deduplicated = df.drop_duplicates()

# -------------------------------
# Step 4: Standardize Column Names
# -------------------------------
# Trim surrounding whitespace, lowercase, and turn inner spaces into underscores.
df = deduplicated.rename(
    columns=lambda label: label.strip().lower().replace(" ", "_")
)

# -------------------------------
# Step 5: Save Cleaned Data
# -------------------------------
# Use a path relative to the working directory (like the input path above)
# rather than a user-specific absolute path, so the notebook runs unchanged on
# other machines and does not embed a local username.
cleaned_file_path = r"cleaned_air_quality_dataset.csv"  # <-- Change path if needed
df.to_csv(cleaned_file_path, index=False)

print(f"\nData cleaning completed! Cleaned file saved as: {cleaned_file_path}")

# -------------------------------
# Step 6: Summary Statistics
# -------------------------------
# Describe every column of the cleaned frame (include='all' covers non-numeric
# columns too).
summary_table = df.describe(include='all')
print("\nSummary Statistics:")
print(summary_table)


Initial Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2103 entries, 0 to 2102
Data columns (total 18 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   SITE_ID   2103 non-null   object
 1   Week      2103 non-null   int64 
 2   Year      2103 non-null   int64 
 3   DATEOFF   2103 non-null   object
 4   Ca        2103 non-null   object
 5   Cl        2103 non-null   object
 6   HNO3      2103 non-null   object
 7   HNO3 PPB  2103 non-null   object
 8   K         2103 non-null   object
 9   Mg        2103 non-null   object
 10  Na        2103 non-null   object
 11  NH4       2103 non-null   object
 12  NO3       2103 non-null   object
 13  SO2       2103 non-null   object
 14  SO2 PPB   2103 non-null   object
 15  SO4       2103 non-null   object
 16  TNO3      2103 non-null   object
 17  DATEON    2103 non-null   object
dtypes: int64(2), object(16)
memory usage: 295.9+ KB
None

First 5 rows of data:
  SITE_ID  Week  Year        