<a href="https://colab.research.google.com/github/Johny85/World-of-Scripts/blob/master/IISC_AQI_Project_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import glob

# Step 1: Read all the CSV files and combine them into a single DataFrame.
# This script later writes its result to 'aqi_data.csv' in the same directory.
# That file must be skipped here: on a re-run it would otherwise be ingested
# as if it were a city called "aqi_data" (and bring its own already-renamed
# 'timestamp' column with it).
# glob returns names in arbitrary order, so sort for a reproducible row order.
csv_suffix = '.csv'
file_list = sorted(
    name for name in glob.glob('*.csv')
    if name.lower() != 'aqi_data.csv'
)

# Fail early with an actionable message instead of pandas' generic
# "No objects to concatenate" raised by pd.concat on an empty list.
if not file_list:
    raise ValueError("No input CSV files found in the current directory.")

# Initialize an empty list to store the DataFrames.
df_list = []

# Loop through each file, read it into a DataFrame and add a 'City' column.
for file in file_list:
    # Every matched name ends with the suffix, so slice it off the end rather
    # than str.replace(), which would also strip '.csv' from the middle of a name.
    city_name = file[:-len(csv_suffix)]
    df = pd.read_csv(file)
    df['City'] = city_name
    df_list.append(df)

# Concatenate all the DataFrames into one.
combined_df = pd.concat(df_list, ignore_index=True)

# Step 2: Standardize column names.
# The raw headers carry units and punctuation; map each to a short identifier.
# Headers that do not appear in the mapping are left as they are.

# Pollutant concentration columns.
pollutant_columns = {
    'PM2.5 (µg/m³)': 'pm2_5',
    'PM10 (µg/m³)': 'pm10',
    'NO (µg/m³)': 'no',
    'NO2 (µg/m³)': 'no2',
    'NOx (ppb)': 'nox',
    'NH3 (µg/m³)': 'nh3',
    'SO2 (µg/m³)': 'so2',
    'CO (mg/m³)': 'co',
    'Ozone (µg/m³)': 'ozone',
    'Benzene (µg/m³)': 'benzene',
    'Toluene (µg/m³)': 'toluene',
    'Xylene (µg/m³)': 'xylene',
    'O Xylene (µg/m³)': 'o_xylene',
    'Eth-Benzene (µg/m³)': 'eth_benzene',
    'MP-Xylene (µg/m³)': 'mp_xylene',
}

# Meteorological columns.
weather_columns = {
    'AT (°C)': 'temp_c',
    'RH (%)': 'rh_percent',
    'WS (m/s)': 'ws_m_s',
    'WD (deg)': 'wd_deg',
    'RF (mm)': 'rf_mm',
    'TOT-RF (mm)': 'tot_rf_mm',
    'SR (W/mt2)': 'sr_w_mt2',
    'BP (mmHg)': 'bp_mmHg',
    'VWS (m/s)': 'vws_m_s',
}

# Merge the groups (insertion order preserved) and add the time column.
new_columns = {**pollutant_columns, **weather_columns, 'Timestamp': 'timestamp'}
combined_df = combined_df.rename(columns=new_columns)

# Step 3: Handle missing values and convert data types.
# Replace common representations of missing data with pandas' NaN.
combined_df.replace(['', 'NA'], pd.NA, inplace=True)

# Convert 'timestamp' to datetime.
# errors='coerce' turns invalid values into NaT (Not a Time);
# format='mixed' with dayfirst=True tolerates inconsistent timestamp formats.
combined_df['timestamp'] = pd.to_datetime(combined_df['timestamp'], errors='coerce', format='mixed', dayfirst=True)

# A row without a valid timestamp cannot be assigned to any day, so keeping it
# under a NaT index only pollutes the time-based steps that follow. Report how
# many were coerced (instead of hiding them) and drop them before indexing.
unparsed = int(combined_df['timestamp'].isna().sum())
if unparsed:
    print(f"Dropping {unparsed} rows with a missing or unparseable timestamp.")
    combined_df = combined_df.dropna(subset=['timestamp'])

# Set the timestamp as the index. The non-inplace form returns a fresh frame,
# so the column assignments below do not write into a filtered slice.
combined_df = combined_df.set_index('timestamp')

# Convert every column except 'City' to numeric, coercing errors to NaN.
for col in combined_df.columns:
    if col == 'City':
        continue
    combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')

# Step 4: Check for and drop duplicates.
# 'timestamp' is the index at this point, and DataFrame.duplicated() compares
# column values only. Calling it directly would flag readings taken at
# different times as duplicates whenever their values match (e.g. all-NaN rows
# for one city). Move the index back into a column for the comparison so a row
# is a duplicate only when the timestamp matches as well.
dup_mask = combined_df.reset_index().duplicated().to_numpy()
duplicates = dup_mask.sum()
print(f"Found {duplicates} duplicate rows.")
if duplicates > 0:
    # Positional boolean mask: keeps the first occurrence of each duplicate.
    # .copy() gives an independent frame rather than a filtered slice.
    combined_df = combined_df[~dup_mask].copy()
    print("Dropped duplicate rows.")


combined_df.info()

# Step 5: Resample the data to a daily average.
# Split the rows per city, bucket each city's timestamp index into calendar
# days ('D') and average every bucket. The result is indexed by
# (City, timestamp); reset_index() turns both levels back into columns.
city_groups = combined_df.groupby('City')
daily_df = city_groups.resample('D').mean().reset_index()

# Persist the cleaned, daily-averaged table (without the row index).
daily_df.to_csv('aqi_data.csv', index=False)

# Summarize the result.
print("\n--- Preprocessing Complete ---")
print("Head of the cleaned and resampled DataFrame:")
preview = daily_df.head()
print(preview)
print("\nDataFrame Info:")
daily_df.info()