<a href="https://colab.research.google.com/github/Johny85/World-of-Scripts/blob/master/IISC_AQI_Project_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import glob

# Step 1: Read all the per-city CSV files and combine them into a single DataFrame.
# 'aqi_data.csv' is the file this notebook itself writes at the end of preprocessing;
# it is excluded so that re-running the cell does not ingest the output as another "city".
# glob returns matches in arbitrary order, so sort for a reproducible row order.
file_list = sorted(f for f in glob.glob('*.csv') if f != 'aqi_data.csv')
if not file_list:
    # Fail early with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No input '*.csv' files found in the current working directory.")

# Read each file into a DataFrame and tag every row with the city it came from.
# The city name is the file name with its 4-character extension stripped from the end only.
df_list = []
for file in file_list:
    city_name = file[:-len('.csv')]
    df = pd.read_csv(file)
    df['City'] = city_name
    df_list.append(df)

# Concatenate all the per-city DataFrames into one, with a fresh 0..n-1 index.
combined_df = pd.concat(df_list, ignore_index=True)

# Step 2: Standardize column names.
# Maps the unit-suffixed headers found in the raw CSVs to short snake_case
# identifiers. Headers that are not listed here are left untouched by rename().
new_columns = {
    # Pollutant columns
    'PM2.5 (µg/m³)': 'pm2_5',
    'PM10 (µg/m³)': 'pm10',
    'NO (µg/m³)': 'no',
    'NO2 (µg/m³)': 'no2',
    'NOx (ppb)': 'nox',
    'NH3 (µg/m³)': 'nh3',
    'SO2 (µg/m³)': 'so2',
    'CO (mg/m³)': 'co',
    'Ozone (µg/m³)': 'ozone',
    'Benzene (µg/m³)': 'benzene',
    'Toluene (µg/m³)': 'toluene',
    'Xylene (µg/m³)': 'xylene',
    'O Xylene (µg/m³)': 'o_xylene',
    'Eth-Benzene (µg/m³)': 'eth_benzene',
    'MP-Xylene (µg/m³)': 'mp_xylene',
    # Weather columns
    'AT (°C)': 'temp_c',
    'RH (%)': 'rh_percent',
    'WS (m/s)': 'ws_m_s',
    'WD (deg)': 'wd_deg',
    'RF (mm)': 'rf_mm',
    'TOT-RF (mm)': 'tot_rf_mm',
    'SR (W/mt2)': 'sr_w_mt2',
    'BP (mmHg)': 'bp_mmHg',
    'VWS (m/s)': 'vws_m_s',
    # Time column
    'Timestamp': 'timestamp',
}
combined_df = combined_df.rename(columns=new_columns)

# Step 3: Normalise missing-value markers and coerce column dtypes.
# Empty strings and the literal 'NA' are both mapped to pandas' missing-value marker.
combined_df = combined_df.replace(['', 'NA'], pd.NA)

# Parse 'timestamp' and promote it to the index.
# errors='coerce' turns unparseable entries into NaT; format='mixed' with
# dayfirst=True lets each value be parsed individually, reading ambiguous
# dates as day-first.
parsed_timestamps = pd.to_datetime(
    combined_df['timestamp'],
    errors='coerce',
    format='mixed',
    dayfirst=True,
)
combined_df['timestamp'] = parsed_timestamps
combined_df = combined_df.set_index('timestamp')

# Every column except 'City' is forced to a numeric dtype; values that cannot
# be parsed as numbers become NaN.
measurement_cols = [name for name in combined_df.columns if name != 'City']
for name in measurement_cols:
    combined_df[name] = pd.to_numeric(combined_df[name], errors='coerce')

# Step 4: Check for and drop duplicates.
# DataFrame.duplicated() compares column values only and ignores the index.
# 'timestamp' is the index at this point, so calling it directly would flag two
# readings with equal values taken at *different* times as duplicates. Moving the
# index back into the columns for the comparison makes a row a duplicate only
# when its timestamp, city and every measurement all match an earlier row.
is_duplicate = combined_df.reset_index().duplicated().to_numpy()
duplicates = int(is_duplicate.sum())
print(f"Found {duplicates} duplicate rows.")
if duplicates > 0:
    # Keep the first occurrence of each duplicated row (duplicated() default).
    combined_df = combined_df.loc[~is_duplicate]
    print("Dropped duplicate rows.")


combined_df.info()

# Step 5: Downsample to one averaged row per city per calendar day.
# Resampling inside each 'City' group keeps the cities separate; the result is
# indexed by (City, timestamp), which reset_index() flattens back into columns.
per_city_daily = combined_df.groupby('City').resample('D')
daily_df = per_city_daily.mean().reset_index()

# Persist the cleaned, resampled table (without the row index) for later sessions.
daily_df.to_csv('aqi_data.csv', index=False)

# Summary of the result.
print("\n--- Preprocessing Complete ---")
print("Head of the cleaned and resampled DataFrame:")
print(daily_df.head())
print("\nDataFrame Info:")
daily_df.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose

# Re-load the cleaned data if you are starting a new notebook session.
# 'timestamp' is saved as an ordinary column, so parse it as dates and keep it
# as a column (the plots below reference it by name).
# daily_df = pd.read_csv('aqi_data.csv', parse_dates=['timestamp'])

print(daily_df.columns)
print(combined_df.columns)

# --- Visualization 1: Time-series Plot for PM2.5 ---
# daily_df has a plain 0..n-1 row index after reset_index(), so the dates live
# in the 'timestamp' column; plotting against the index would put row numbers
# on the x-axis. hue='City' draws one line per city instead of joining all
# cities into a single series, and makes seaborn add the legend itself.
plt.figure(figsize=(15, 10))
sns.lineplot(data=daily_df, x='timestamp', y='pm2_5', hue='City')
plt.title('PM2.5 Levels Over Time (Daily Average)')
plt.xlabel('Date')
plt.ylabel('PM2.5 (µg/m³)')
plt.grid(True)
# Save before show(): once the figure has been shown it is released, and a
# later savefig() would write an empty image.
plt.savefig('Time-series Plot for PM2.5.png')
plt.show()

# --- Visualization 2: Seasonal Boxplots ---
# Derive the month from the 'timestamp' column. daily_df's index is a plain
# integer row index, and converting integers with pd.to_datetime interprets
# them as nanoseconds since 1970-01-01, which would make every month equal 1.
daily_df['month'] = pd.to_datetime(daily_df['timestamp']).dt.month
plt.figure(figsize=(15, 10))
sns.boxplot(x='month', y='pm2_5', data=daily_df)
plt.title('Monthly PM2.5 Levels')
plt.xlabel('Month')
plt.ylabel('PM2.5 (µg/m³)')
plt.grid(True)
# No legend: the boxplot has no labelled artists, so plt.legend() would only
# emit a warning. Save before show() so the written image is not blank.
plt.savefig('Seasonal_Boxplots.png')
plt.show()

# --- Visualization 3: Correlation Heatmap ---
# Pearson correlation between the selected pollutant and weather columns of
# the daily averages (all cities pooled).
pollutants = ['pm2_5', 'pm10', 'no2', 'so2', 'ozone', 'co']
meteorological = ['temp_c', 'rh_percent', 'ws_m_s', 'rf_mm']
all_cols = pollutants + meteorological
correlation_matrix = daily_df[all_cols].corr(method='pearson')
plt.figure(figsize=(15, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Pollutants and Meteorological Drivers')
# No legend: a heatmap has no labelled artists (the colour bar is its key), so
# plt.legend() would only emit a warning. Save before show() so the written
# image is not blank.
plt.savefig('Correlation Heatmap.png')
plt.show()

# --- Seasonal Decomposition ---
# Build a date-indexed daily PM2.5 series for one city.
# Missing days are filled by time-based interpolation rather than dropped:
# dropping them would shorten the series so that 365 samples no longer span
# one calendar year. limit_area='inside' fills interior gaps only; the
# remaining leading/trailing NaNs are then trimmed by dropna().
city_data = (
    daily_df.loc[daily_df['City'] == 'Amravati', ['timestamp', 'pm2_5']]
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .dropna(subset=['timestamp'])
    .set_index('timestamp')['pm2_5']
    .sort_index()
    .interpolate(method='time', limit_area='inside')
    .dropna()
)
# period=365: one seasonal cycle per year of daily samples.
decomposition = seasonal_decompose(city_data, model='additive', period=365)

# decomposition.plot() creates and returns its own figure; size and title must
# be applied to that figure (a fresh plt.figure() would only create an empty one).
fig = decomposition.plot()
fig.set_size_inches(15, 10)
fig.suptitle('Seasonal Decomposition of PM2.5 for Amravati', y=1.02)
# Save before show() so the image is not blank; bbox_inches='tight' keeps the
# title, which sits slightly above the axes area (y=1.02), inside the saved image.
fig.savefig('Seasonal Decomposition.png', bbox_inches='tight')
plt.show()

In [None]:
from scipy.stats import pearsonr, spearmanr
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import visualkeras

# --- Part 1: Correlation Analysis ---
# PM2.5 against temperature, using only the days where both values are present.
corr_data = daily_df[['pm2_5', 'temp_c']].dropna()
pm25_values = corr_data['pm2_5']
temp_values = corr_data['temp_c']

# Linear (Pearson) and rank-based (Spearman) correlation, each with its p-value.
pm25_temp_pearson = pearsonr(pm25_values, temp_values)
pm25_temp_spearman = spearmanr(pm25_values, temp_values)

print(f"Pearson Correlation (PM2.5 vs Temp): r={pm25_temp_pearson.statistic:.2f}, p-value={pm25_temp_pearson.pvalue:.2e}")
print(f"Spearman Correlation (PM2.5 vs Temp): rho={pm25_temp_spearman.statistic:.2f}, p-value={pm25_temp_spearman.pvalue:.2e}")

# --- Part 2: Regression Analysis (PM2.5 vs Meteorological Drivers) ---
# Predict daily PM2.5 from four weather variables, using only rows where the
# target and every feature are present.
features = ['temp_c', 'rh_percent', 'ws_m_s', 'rf_mm']
target = 'pm2_5'
regression_data = daily_df.dropna(subset=[*features, target])
X, y = regression_data[features], regression_data[target]

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the training rows and predict the held-out rows.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"\nLinear Regression Model Performance:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}") # This is your % variance explained!

# --- Part 3: ANN for 7-day Prediction (conceptual outline) ---
# This only shows a basic ANN setup.
# Split first, then fit the scaler on the training rows only. Fitting it on the
# full dataset would let the test rows' mean/variance leak into training.
# The same test_size and random_state as above give the same train/test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Full feature matrix scaled with the training statistics, kept under its
# original name for any later use.
X_scaled = scaler.transform(X)

# Build the ANN model: two hidden ReLU layers and a single linear output.
# NOTE: this rebinds `model`, replacing the linear regression fitted above.
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1)) # Output layer for a single value prediction

# Compile and train the model; 20% of the training rows are held back by Keras
# for per-epoch validation.
model.compile(optimizer='adam', loss='mean_squared_error')
history = model.fit(X_train, y_train, epochs=50, validation_split=0.2, verbose=0)
print("\nANN Model Training Complete.")

visualkeras.layered_view(model).show() # display using your system viewer