In [1]:
#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [3]:
#Load dataset
data=pd.read_csv("annual_aqi_by_county_2025.csv")

In [4]:
print(data.head)

<bound method NDFrame.head of        State      County  Year  Days with AQI  Good Days  Moderate Days  \
0    Alabama     Baldwin  2025             88         63             25   
1    Alabama        Clay  2025             90         74             15   
2    Alabama      DeKalb  2025            120         94             26   
3    Alabama      Elmore  2025             31         29              2   
4    Alabama      Etowah  2025             88         61             27   
..       ...         ...   ...            ...        ...            ...   
928  Wyoming    Sheridan  2025            120         96             24   
929  Wyoming    Sublette  2025            120         88             32   
930  Wyoming  Sweetwater  2025            118        110              8   
931  Wyoming       Teton  2025            177        156             21   
932  Wyoming    Washakie  2025            171        167              4   

     Unhealthy for Sensitive Groups Days  Unhealthy Days  Very Unheal

In [5]:
df=data.copy()
# Data cleaning and preprocessing
col_map_candidates = {
    'Days Good': ['Days Good', 'Days_Good', 'Days Good (AQI)'],
    'Days Moderate': ['Days Moderate', 'Days_Moderate'],
    'Days Unhealthy for Sensitive Groups': ['Days Unhealthy for Sensitive Groups', 'Days Unhealthy for Sensitive Groups'],
    'Days Unhealthy': ['Days Unhealthy', 'Days_Unhealthy'],
    'Days Very Unhealthy': ['Days Very Unhealthy', 'Days_Very_Unhealthy'],
    'Days Hazardous': ['Days Hazardous', 'Days_Hazardous'],
    'Max AQI': ['Max AQI', 'Max_AQI'],
    'Median AQI': ['Median AQI', 'Median_AQI']
}
for canonical, alts in col_map_candidates.items():
    for alt in alts:
        if alt in df.columns:
            df = df.rename(columns={alt: canonical})
            break

In [6]:
# Trim whitespace
for c in df.select_dtypes(include='object').columns:
    df[c] = df[c].str.strip()

# Ensure State & County columns exist (attempt)
if 'State' not in df.columns:
    possible = [c for c in df.columns if 'state' in c.lower()]
    if possible:
        df = df.rename(columns={possible[0]: 'State'})

if 'County' not in df.columns:
    possible = [c for c in df.columns if 'county' in c.lower()]
    if possible:
        df = df.rename(columns={possible[0]: 'County'})

In [8]:
# Coerce numeric columns where present
numeric_cols = ['Year','Max AQI','Median AQI','Days Good','Days Moderate',
                'Days Unhealthy for Sensitive Groups','Days Unhealthy','Days Very Unhealthy','Days Hazardous']
for c in numeric_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Drop rows missing core info
df = df.dropna(subset=['State','County','Year'], how='any')
print("After canonicalization shape:", df.shape)
print(df.sample(5))

After canonicalization shape: (933, 18)
            State      County  Year  Days with AQI  Good Days  Moderate Days  \
809          Utah        Utah  2025            182         94             87   
725  South Dakota   Codington  2025             90         83              7   
400     Minnesota  Blue Earth  2025             82         69             13   
675  Pennsylvania         Elk  2025            120        114              6   
199         Idaho       Idaho  2025            120         84             36   

     Unhealthy for Sensitive Groups Days  Unhealthy Days  Very Unhealthy Days  \
809                                    1               0                    0   
725                                    0               0                    0   
400                                    0               0                    0   
675                                    0               0                    0   
199                                    0               0                  