In [29]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import os

In [30]:
data = pd.read_csv(r'filtered_pollution_us_2000_2016.csv')

In [31]:
# Display first few rows
data.head()

Unnamed: 0,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,NO2 AQI,O3 Mean,O3 1st Max Value,O3 1st Max Hour,O3 AQI,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI
0,19.041667,49.0,19,46,0.0225,0.04,10,34,3.0,9.0,21,13.0,1.145833,4.2,21,
1,19.041667,49.0,19,46,0.0225,0.04,10,34,3.0,9.0,21,13.0,0.878947,2.2,23,25.0
2,19.041667,49.0,19,46,0.0225,0.04,10,34,2.975,6.6,23,,1.145833,4.2,21,
3,19.041667,49.0,19,46,0.0225,0.04,10,34,2.975,6.6,23,,0.878947,2.2,23,25.0
4,22.958333,36.0,19,34,0.013375,0.032,10,27,1.958333,3.0,22,4.0,0.85,1.6,23,


In [32]:
# Display columns
data.columns

Index(['NO2 Mean', 'NO2 1st Max Value', 'NO2 1st Max Hour', 'NO2 AQI',
       'O3 Mean', 'O3 1st Max Value', 'O3 1st Max Hour', 'O3 AQI', 'SO2 Mean',
       'SO2 1st Max Value', 'SO2 1st Max Hour', 'SO2 AQI', 'CO Mean',
       'CO 1st Max Value', 'CO 1st Max Hour', 'CO AQI'],
      dtype='object')

In [33]:
#Check for missing values
data.isnull().sum() #checking to see how many missing values exist per column

NO2 Mean                  0
NO2 1st Max Value         0
NO2 1st Max Hour          0
NO2 AQI                   0
O3 Mean                   0
O3 1st Max Value          0
O3 1st Max Hour           0
O3 AQI                    0
SO2 Mean                  0
SO2 1st Max Value         0
SO2 1st Max Hour          0
SO2 AQI              872907
CO Mean                   0
CO 1st Max Value          0
CO 1st Max Hour           0
CO AQI               873323
dtype: int64

In [34]:
# Checking for duplicate rows and dropping
print(data.duplicated().sum())
data.drop_duplicates(inplace = True)

5110


In [35]:
# Check data types & convert if necessary
data.dtypes

NO2 Mean             float64
NO2 1st Max Value    float64
NO2 1st Max Hour       int64
NO2 AQI                int64
O3 Mean              float64
O3 1st Max Value     float64
O3 1st Max Hour        int64
O3 AQI                 int64
SO2 Mean             float64
SO2 1st Max Value    float64
SO2 1st Max Hour       int64
SO2 AQI              float64
CO Mean              float64
CO 1st Max Value     float64
CO 1st Max Hour        int64
CO AQI               float64
dtype: object

In [36]:
# Summary stats
data.describe()

# Which pollutants have the highest variability?
# Are AQI values within expected ranges?

Unnamed: 0,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,NO2 AQI,O3 Mean,O3 1st Max Value,O3 1st Max Hour,O3 AQI,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI
count,1741551.0,1741551.0,1741551.0,1741551.0,1741551.0,1741551.0,1741551.0,1741551.0,1741551.0,1741551.0,1741551.0,871246.0,1741551.0,1741551.0,1741551.0,870601.0
mean,12.82216,25.41104,11.72805,23.89448,0.02612029,0.03919736,10.16793,36.03702,1.872076,4.496082,9.66308,7.121016,0.3683035,0.6202876,7.872505,5.995569
std,9.509228,15.99961,7.876593,15.16131,0.01136707,0.01533377,4.001878,19.74295,2.762553,7.685473,6.729217,11.94299,0.3139541,0.6438729,7.977696,5.845211
min,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,-2.0,0.0,0.0,-0.4375,-0.4,0.0,0.0
25%,5.75,13.0,5.0,12.0,0.017875,0.029,9.0,25.0,0.257143,0.9,5.0,1.0,0.183333,0.292,0.0,2.0
50%,10.73913,24.0,9.0,23.0,0.025875,0.038,10.0,33.0,0.9875,2.0,8.0,3.0,0.293783,0.4,6.0,5.0
75%,17.71429,35.6,20.0,33.0,0.0339,0.048,11.0,42.0,2.328571,5.0,14.0,9.0,0.466667,0.8,13.0,8.0
max,139.5417,267.0,23.0,132.0,0.095083,0.141,23.0,218.0,321.625,351.0,23.0,200.0,7.508333,19.9,23.0,201.0


In [37]:
# Define the columns to check
columns = ['NO2 AQI', 'O3 AQI', 'CO AQI', 'SO2 AQI']

# Filter rows where any of the specified columns have values between 100 and 200
rows_in_range = data[(data[columns] >= 100).any(axis=1) & (data[columns] <= 200).any(axis=1)]

# Count the number of rows
count = len(rows_in_range)

print(f"Number of rows where values in any of {columns} are between 100 and 200: {count}")

Number of rows where values in any of ['NO2 AQI', 'O3 AQI', 'CO AQI', 'SO2 AQI'] are between 100 and 200: 42701


In [38]:
# Before checking threshold for features
number_rows, number_columns = data.shape
print(f"Raw Data: {number_rows} samples and {number_columns} features")


# Removing Any Feature with >= 80% Missing Values
threshold = 0.8
retain_columns = [col for col in data.columns if data[col].isnull().mean() < threshold]
data_cleaning = data[retain_columns]
print(f"Features Retained After Removing >= 80% Missing Values: {len(retain_columns)}")

Raw Data: 1741551 samples and 16 features
Features Retained After Removing >= 80% Missing Values: 16


In [39]:
# Solving the NaN values for SO2 AQI

# Split Data where there are NaN and Non-NaN values
features = ["SO2 Mean", "SO2 1st Max Value", "SO2 1st Max Hour"]
so2_known = data.dropna(subset=["SO2 AQI"])
so2_unknown = data[data["SO2 AQI"].isna()]
print(so2_unknown[so2_unknown["SO2 AQI"].isna()])
print(so2_unknown.columns)

# Using Linear Regression to fill in where NaN exists
so2_model = LinearRegression().fit(so2_known[features], so2_known["SO2 AQI"])
data.loc[data["SO2 AQI"].isna(), "SO2 AQI"] = so2_model.predict(so2_unknown[features])

          NO2 Mean  NO2 1st Max Value  NO2 1st Max Hour  NO2 AQI   O3 Mean  \
2        19.041667               49.0                19       46  0.022500   
3        19.041667               49.0                19       46  0.022500   
6        22.958333               36.0                19       34  0.013375   
7        22.958333               36.0                19       34  0.013375   
10       38.125000               51.0                 8       48  0.007958   
...            ...                ...               ...      ...       ...   
1746652   2.564706                3.6                 6        3  0.028000   
1746655   1.083333                1.6                 9        1  0.043917   
1746656   1.083333                1.6                 9        1  0.043917   
1746659   0.939130                1.3                 5        1  0.045263   
1746660   0.939130                1.3                 5        1  0.045263   

         O3 1st Max Value  O3 1st Max Hour  O3 AQI  SO2 Mean  \

In [40]:
# Solving the NaN values for CO AQI

# Split Data where there are NaN and Non-NaN values
features = ["CO Mean", "CO 1st Max Value", "CO 1st Max Hour"]
co_known = data.dropna(subset=["CO AQI"])
co_unknown = data[data["CO AQI"].isna()]

# Using Linear Regression to fill in where NaN exists
co_model = LinearRegression().fit(co_known[features], co_known["CO AQI"])
data.loc[data["CO AQI"].isna(), "CO AQI"] = co_model.predict(co_unknown[features])


In [41]:
no2_zero = (data["NO2 AQI"] <= 0).sum()
o3_zero = (data["O3 AQI"] <= 0).sum()
so2_zero = (data["SO2 AQI"] <= 0).sum()
co_zero = (data["CO AQI"] <= 0).sum()
print(f"NO2: {no2_zero}, \n O3: {o3_zero}, \n SO2: {so2_zero}, \n CO: {co_zero}")

NO2: 12626, 
 O3: 596, 
 SO2: 273421, 
 CO: 58580


In [42]:
# Elimiinating any negative values from the data set
data_cleaned = data[(data["NO2 AQI"] > 0) & 
            (data["O3 AQI"] > 0) & 
            (data["SO2 AQI"] > 0) & 
            (data["CO AQI"] > 0) &
            (data["NO2 Mean"] > 0) &
            (data["SO2 Mean"] > 0) &
            (data["SO2 1st Max Value"] > 0) &
            (data["CO Mean"] > 0)]

len(data_cleaned)

1406162

In [43]:
# Calculate the overall AQI as the maximum value among the individual AQI columns (NO2, O3, SO2, CO) for each row
data_cleaned["Overall AQI"] = data_cleaned[['NO2 AQI', 'O3 AQI', 'SO2 AQI', 'CO AQI']].max(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned["Overall AQI"] = data_cleaned[['NO2 AQI', 'O3 AQI', 'SO2 AQI', 'CO AQI']].max(axis=1)


In [44]:
# Summary stats
data_cleaned.describe()

Unnamed: 0,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,NO2 AQI,O3 Mean,O3 1st Max Value,O3 1st Max Hour,O3 AQI,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI,Overall AQI
count,1406162.0,1406162.0,1406162.0,1406162.0,1406162.0,1406162.0,1406162.0,1406162.0,1406162.0,1406162.0,1406162.0,1406162.0,1406162.0,1406162.0,1406162.0,1406162.0,1406162.0
mean,14.14483,27.68742,11.85542,26.05954,0.02559864,0.0392512,10.21173,36.19106,2.219325,5.321064,10.496,7.37788,0.4042216,0.6882735,8.312177,7.803868,41.60868
std,9.641468,15.95774,7.856309,15.12684,0.01149041,0.01579793,3.944444,20.47039,2.866418,8.122523,6.505151,10.85745,0.3233801,0.6716434,7.98106,7.644419,19.59111
min,0.004167,1.0,0.0,1.0,4.2e-05,0.001,0.0,1.0,0.004167,0.1,0.0,0.07507099,4.2e-05,0.001,0.0,0.001344396,5.0
25%,7.0,15.9,6.0,14.0,0.017167,0.028,9.0,25.0,0.55,1.1,6.0,1.421002,0.2,0.3,0.0,3.0,30.0
50%,12.20833,26.0,10.0,25.0,0.025292,0.038,10.0,33.0,1.225,2.6,10.0,3.726795,0.320833,0.5,6.0,5.767928,37.0
75%,19.25,38.0,20.0,36.0,0.033417,0.049,11.0,42.0,2.772727,6.0,14.0,9.0,0.504167,0.8,15.0,9.127134,46.0
max,139.5417,267.0,23.0,132.0,0.095083,0.141,23.0,218.0,321.625,351.0,23.0,532.2287,7.508333,19.9,23.0,225.6552,532.2287


In [None]:
columns_to_check = ['NO2 AQI', 'O3 AQI', 'CO AQI', 'SO2 AQI', 'Overall AQI']

upper_limit = 200
lower_limit = 100

for column in columns_to_check:
    # Calculate IQR
    Q1 = data_cleaned[column].quantile(0.25)
    Q3 = data_cleaned[column].quantile(0.75)
    IQR = Q3 - Q1

    # Calculate IQR-based bounds
    iqr_lower_bound = Q1 - 1.5 * IQR
    iqr_upper_bound = Q3 + 1.5 * IQR

    # Enforce strict bounds between 100 and 200
    effective_lower_bound = max(iqr_lower_bound, lower_limit)
    effective_upper_bound = min(iqr_upper_bound, upper_limit)

    # Filter the data
    data_cleaned = data_cleaned[
        (data_cleaned[column] >= effective_lower_bound) & 
        (data_cleaned[column] <= effective_upper_bound)
    ]

# Display summary statistics
data_cleaned.describe()

Unnamed: 0,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,NO2 AQI,O3 Mean,O3 1st Max Value,O3 1st Max Hour,O3 AQI,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI,Overall AQI
count,761529.0,761529.0,761529.0,761529.0,761529.0,761529.0,761529.0,761529.0,761529.0,761529.0,761529.0,761529.0,761529.0,761529.0,761529.0,761529.0,761529.0
mean,10.238912,21.066721,11.701237,19.742985,0.024867,0.035432,10.103262,30.657324,1.090032,2.196911,10.073099,3.006102,0.295499,0.440814,7.811896,4.977188,32.711393
std,6.58787,11.746525,7.868921,11.086095,0.009024,0.010244,4.353477,8.870851,0.959862,1.628222,6.640169,2.364885,0.162715,0.235318,7.914098,2.731793,7.629409
min,0.004167,1.0,0.0,1.0,4.2e-05,0.001,0.0,1.0,0.004167,0.1,0.0,0.075071,4.2e-05,0.001,0.0,0.001344,10.0
25%,5.154167,11.3,5.0,10.0,0.0185,0.028,9.0,25.0,0.3625,1.0,5.0,1.0,0.1875,0.29,0.0,3.0,27.0
50%,9.086957,20.0,9.0,19.0,0.025,0.036,10.0,31.0,0.875,1.8,9.0,2.527988,0.270083,0.4,6.0,4.94965,33.0
75%,14.217391,30.0,20.0,28.0,0.031333,0.043,11.0,37.0,1.5,3.0,14.0,4.0,0.3875,0.6,13.0,6.86629,39.0
max,45.208333,49.9,23.0,46.0,0.0535,0.054,23.0,46.0,6.308696,7.8,23.0,9.976357,1.0,1.067,23.0,11.998862,46.0


In [None]:
# Making new csv with only these columns
file_path = os.path.join(os.getcwd(), 'cleaned_dataset.csv')
data_cleaned.to_csv(file_path, index=False)