# Data Cleaning

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from utils import load_data, get_season

In [25]:
df = pd.read_csv('city_day.csv', parse_dates=['Date'])

In [26]:
df.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   City        29531 non-null  object        
 1   Date        29531 non-null  datetime64[ns]
 2   PM2.5       24933 non-null  float64       
 3   PM10        18391 non-null  float64       
 4   NO          25949 non-null  float64       
 5   NO2         25946 non-null  float64       
 6   NOx         25346 non-null  float64       
 7   NH3         19203 non-null  float64       
 8   CO          27472 non-null  float64       
 9   SO2         25677 non-null  float64       
 10  O3          25509 non-null  float64       
 11  Benzene     23908 non-null  float64       
 12  Toluene     21490 non-null  float64       
 13  Xylene      11422 non-null  float64       
 14  AQI         24850 non-null  float64       
 15  AQI_Bucket  24850 non-null  object        
dtypes: datetime64[ns](1), 

In [29]:
missing_percentage = df.isnull().sum() / len(df) * 100
print("Percentage of missing values:\n", missing_percentage.sort_values(ascending=False))

Percentage of missing values:
 Xylene        61.322001
PM10          37.723071
NH3           34.973418
Toluene       27.229014
Benzene       19.041008
AQI           15.851139
AQI_Bucket    15.851139
PM2.5         15.570079
NOx           14.171549
O3            13.619586
SO2           13.050692
NO2           12.139785
NO            12.129626
CO             6.972334
Date           0.000000
City           0.000000
dtype: float64


In [30]:
df.drop('Xylene', axis=1, inplace=True)

In [31]:
numeric_cols = df.select_dtypes(include=np.number).columns
imputer = KNNImputer(n_neighbors=5)

In [32]:
numeric_cols

Index(['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3',
       'Benzene', 'Toluene', 'AQI'],
      dtype='object')

In [33]:
imputer

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False


In [34]:
df_imputed_list = []
for city in df['City'].unique():
    city_df = df[df['City'] == city].copy()
    
    # Work on a copy of the numeric part of the dataframe
    numeric_df_part = city_df[numeric_cols]

    # Check if there are any missing values at all for this city's numeric columns
    if numeric_df_part.isnull().values.any():
        # Identify columns that are not entirely NaN to avoid imputer shape mismatches
        cols_to_impute = numeric_df_part.columns[numeric_df_part.notna().any()].tolist()
        
        # Identify columns that are all NaN, to handle them separately
        cols_all_nan = numeric_df_part.columns[numeric_df_part.isna().all()].tolist()

        # Impute the columns that have some data
        if cols_to_impute:
            imputed_data = imputer.fit_transform(city_df[cols_to_impute])
            imputed_subset_df = pd.DataFrame(imputed_data, columns=cols_to_impute, index=city_df.index)
            city_df.update(imputed_subset_df)

        # For columns that were entirely NaN for this city, fill with the global median
        for col in cols_all_nan:
            global_median = df[col].median()
            city_df[col].fillna(global_median, inplace=True)
            
    df_imputed_list.append(city_df)

df_imputed = pd.concat(df_imputed_list)

print("Imputed missing values using KNNImputer and global medians for all-NaN columns.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  city_df[col].fillna(global_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  city_df[col].fillna(global_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

Imputed missing values using KNNImputer and global medians for all-NaN columns.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  city_df[col].fillna(global_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  city_df[col].fillna(global_median, inplace=True)


In [35]:
df_imputed

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,33.330,105.292,0.92,18.22,17.15,15.85,0.92,27.64,133.36,0.000,0.020,203.0,
1,Ahmedabad,2015-01-02,71.760,71.202,0.97,15.69,16.46,15.85,0.97,24.55,34.06,3.680,5.500,174.2,
2,Ahmedabad,2015-01-03,121.276,113.518,17.40,19.30,29.70,15.85,17.40,29.07,30.70,6.800,16.400,363.6,
3,Ahmedabad,2015-01-04,63.986,71.476,1.70,18.48,17.97,15.85,1.70,18.59,36.08,4.430,10.140,142.2,
4,Ahmedabad,2015-01-05,170.040,114.730,22.10,21.42,37.76,15.85,22.10,39.33,39.31,7.010,18.890,551.6,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29526,Visakhapatnam,2020-06-27,15.020,50.940,7.68,25.06,19.54,12.47,0.47,8.55,23.30,2.240,12.070,41.0,Good
29527,Visakhapatnam,2020-06-28,24.380,74.090,3.42,26.06,16.53,11.99,0.52,12.72,30.14,0.740,2.210,70.0,Satisfactory
29528,Visakhapatnam,2020-06-29,22.910,65.730,3.45,29.53,18.33,10.71,0.48,8.42,30.96,0.010,0.010,68.0,Satisfactory
29529,Visakhapatnam,2020-06-30,16.640,49.970,4.05,29.26,18.80,10.03,0.52,9.84,28.30,0.000,0.000,54.0,Satisfactory


In [37]:
# Handle missing 'AQI_Bucket' by forward fill and then backward fill within each city group
df_imputed['AQI_Bucket'] = df_imputed.groupby('City')['AQI_Bucket'].transform(lambda x: x.ffill().bfill())
# If any city still has all NaN AQI_Bucket, those rows will be dropped
df_imputed.dropna(subset=['AQI_Bucket'], inplace=True)

print("Missing values in 'AQI_Bucket' handled.")
print("\nTotal missing values remaining:", df_imputed.isnull().sum().sum())

Missing values in 'AQI_Bucket' handled.

Total missing values remaining: 0


In [38]:
df_imputed

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,33.330,105.292,0.92,18.22,17.15,15.85,0.92,27.64,133.36,0.000,0.020,203.0,Poor
1,Ahmedabad,2015-01-02,71.760,71.202,0.97,15.69,16.46,15.85,0.97,24.55,34.06,3.680,5.500,174.2,Poor
2,Ahmedabad,2015-01-03,121.276,113.518,17.40,19.30,29.70,15.85,17.40,29.07,30.70,6.800,16.400,363.6,Poor
3,Ahmedabad,2015-01-04,63.986,71.476,1.70,18.48,17.97,15.85,1.70,18.59,36.08,4.430,10.140,142.2,Poor
4,Ahmedabad,2015-01-05,170.040,114.730,22.10,21.42,37.76,15.85,22.10,39.33,39.31,7.010,18.890,551.6,Poor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29526,Visakhapatnam,2020-06-27,15.020,50.940,7.68,25.06,19.54,12.47,0.47,8.55,23.30,2.240,12.070,41.0,Good
29527,Visakhapatnam,2020-06-28,24.380,74.090,3.42,26.06,16.53,11.99,0.52,12.72,30.14,0.740,2.210,70.0,Satisfactory
29528,Visakhapatnam,2020-06-29,22.910,65.730,3.45,29.53,18.33,10.71,0.48,8.42,30.96,0.010,0.010,68.0,Satisfactory
29529,Visakhapatnam,2020-06-30,16.640,49.970,4.05,29.26,18.80,10.03,0.52,9.84,28.30,0.000,0.000,54.0,Satisfactory


In [39]:
df_featured = df_imputed.copy()

In [40]:
df_featured['Month'] = df_featured['Date'].dt.month
df_featured['DayofWeek'] = df_featured['Date'].dt.dayofweek
df_featured['Is_Weekend'] = df_featured['DayofWeek'].isin([5, 6]).astype(int)

# Seasonal feature
df_featured['Season'] = df_featured['Month'].apply(get_season)

In [41]:
epsilon = 1e-6
df_featured['PM2.5_to_PM10_Ratio'] = df_featured['PM2.5'] / (df_featured['PM10'] + epsilon)

print("Created new features: 'Month', 'DayofWeek', 'Is_Weekend', 'Season', 'PM2.5_to_PM10_Ratio'.")
df_featured.head()

Created new features: 'Month', 'DayofWeek', 'Is_Weekend', 'Season', 'PM2.5_to_PM10_Ratio'.


Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI,AQI_Bucket,Month,DayofWeek,Is_Weekend,Season,PM2.5_to_PM10_Ratio
0,Ahmedabad,2015-01-01,33.33,105.292,0.92,18.22,17.15,15.85,0.92,27.64,133.36,0.0,0.02,203.0,Poor,1,3,0,Winter,0.316548
1,Ahmedabad,2015-01-02,71.76,71.202,0.97,15.69,16.46,15.85,0.97,24.55,34.06,3.68,5.5,174.2,Poor,1,4,0,Winter,1.007837
2,Ahmedabad,2015-01-03,121.276,113.518,17.4,19.3,29.7,15.85,17.4,29.07,30.7,6.8,16.4,363.6,Poor,1,5,1,Winter,1.068342
3,Ahmedabad,2015-01-04,63.986,71.476,1.7,18.48,17.97,15.85,1.7,18.59,36.08,4.43,10.14,142.2,Poor,1,6,1,Winter,0.89521
4,Ahmedabad,2015-01-05,170.04,114.73,22.1,21.42,37.76,15.85,22.1,39.33,39.31,7.01,18.89,551.6,Poor,1,0,0,Winter,1.482088


In [42]:
df_featured.to_csv('df_processed_1.csv', index=False)
print("Saved the processed data to 'df_processed_1.csv'")

Saved the processed data to 'df_processed_1.csv'
