In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import os

In [None]:
# Load the filtered pollution CSV; the relative path is resolved against the working directory
csv_path = '../../filtered_pollution_us_2000_2016.csv'
data = pd.read_csv(csv_path)

# Show floats with two decimal places whenever a frame or series is displayed
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# Preview the first five rows of the loaded frame
data.head(n=5)

In [None]:
# List the column labels of the loaded dataset
print(f"Features from default dataset:\n {data.columns}")

In [None]:
# Report how many missing entries each column contains
missing_per_feature = data.isna().sum()
print(f"Missing values in each feature: \n{missing_per_feature}")

# Report how many rows are exact repeats of an earlier row
duplicate_count = data.duplicated().sum()
print(f"\nNumber of duplicate rows in dataset: \n{duplicate_count}")

# Keep only the first occurrence of every repeated row
data = data.drop_duplicates()

In [None]:
# Record the size of the data before any column is dropped
number_rows, number_columns = data.shape
print(f"Raw Data: {number_rows} samples and {number_columns} features")


# Removing Any Feature with >= 80% Missing Values
threshold = 0.8

# Fraction of missing entries per column, computed once for the whole frame
missing_fraction = data.isnull().mean()
retain_columns = [col for col in data.columns if missing_fraction[col] < threshold]

# Take an explicit copy: a column subset can be flagged by pandas as a copy of the
# original frame, which makes later .loc assignments emit SettingWithCopyWarning
data = data[retain_columns].copy()

print(f"Features Retained After Removing >= 80% Missing Values: {len(retain_columns)}")

In [None]:
# Solving the NaN values for SO2 AQI

# Predictor columns used to estimate a missing SO2 AQI
features = ["SO2 Mean", "SO2 1st Max Value", "SO2 1st Max Hour"]

# Split the rows once into those with a recorded SO2 AQI and those without
so2_missing = data["SO2 AQI"].isna()
so2_known = data[~so2_missing]
so2_unknown = data[so2_missing]

# Fit a linear regression on the rows whose SO2 AQI is known
so2_model = LinearRegression().fit(so2_known[features], so2_known["SO2 AQI"])

# Fill only when something is missing: predict() rejects an empty input, which
# would otherwise make this cell fail when it is re-run after the NaNs are filled
if so2_missing.any():
    data.loc[so2_missing, "SO2 AQI"] = so2_model.predict(so2_unknown[features])

In [None]:
# Solving the NaN values for CO AQI

# Predictor columns used to estimate a missing CO AQI
features = ["CO Mean", "CO 1st Max Value", "CO 1st Max Hour"]

# Split the rows once into those with a recorded CO AQI and those without
co_missing = data["CO AQI"].isna()
co_known = data[~co_missing]
co_unknown = data[co_missing]

# Fit a linear regression on the rows whose CO AQI is known
co_model = LinearRegression().fit(co_known[features], co_known["CO AQI"])

# Fill only when something is missing: predict() rejects an empty input, which
# would otherwise make this cell fail when it is re-run after the NaNs are filled
if co_missing.any():
    data.loc[co_missing, "CO AQI"] = co_model.predict(co_unknown[features])

In [None]:
# Count the entries equal to zero in every column before any filtering
before_cleaning_data = data.eq(0).sum()
print(f"Number of zeros in each feature before cleaning: \n{before_cleaning_data}")


In [None]:
# Keep only the rows in which every column is strictly positive
# (a NaN entry fails the comparison, so such rows are dropped as well)
all_positive = data.gt(0).all(axis=1)
data_cleaned = data.loc[all_positive].copy()

# Count the zeros that remain after the filter
after_cleaning_data = data_cleaned.eq(0).sum()
print(f"\nNumber of zeros in each feature after cleaning: \n{after_cleaning_data}")

# Report how many rows survived the filter
remaining_rows = len(data_cleaned)
print(f"\nTotal length of Data after cleaning: \n{remaining_rows}")

In [None]:
# Overall AQI of a row is the largest of its four per-pollutant AQI values
pollutant_aqi_columns = ['NO2 AQI', 'O3 AQI', 'SO2 AQI', 'CO AQI']
row_wise_max = data_cleaned[pollutant_aqi_columns].max(axis=1)
data_cleaned.loc[:, "Overall AQI"] = row_wise_max

In [None]:
# Summary statistics of the cleaned data (describe() with its default settings)
data_cleaned.describe()

In [None]:
# AQI columns that are screened for outliers
columns_to_check = ['NO2 AQI', 'O3 AQI', 'CO AQI', 'SO2 AQI', 'Overall AQI']

# Fixed upper cap applied to each screened column
upper_limit = 200

# Screen only the AQI columns listed above; iterating over every column would
# apply the AQI cap and the IQR fence to unrelated features as well
for column in columns_to_check:

    # Quartiles are taken from the rows that survived the previous columns' filters
    Q1 = data_cleaned[column].quantile(0.25)
    Q3 = data_cleaned[column].quantile(0.75)

    IQR = Q3 - Q1

    # Lower fence uses the 1.5 * IQR rule; the upper side uses the fixed cap instead
    lower_bound = Q1 - 1.5 * IQR

    # between() is inclusive on both ends, matching >= lower_bound and <= upper_limit
    data_cleaned = data_cleaned[data_cleaned[column].between(lower_bound, upper_limit)]

data_cleaned.describe()

In [None]:
# Write the cleaned frame to cleaned_dataset.csv, two directory levels above the working directory
output_name = '../../cleaned_dataset.csv'
file_path = os.path.join(os.getcwd(), output_name)

# index=False leaves the row index out of the written file
data_cleaned.to_csv(file_path, index=False)