In [1]:
import pandas as pd
from geopy.geocoders import Nominatim
import time
import numpy as np

# Input CSV loaded at the start of the cleaning step.
RAW_DATA_PATH = "./raw_data.csv"
# Columns dropped from the frame right after loading.
IRRELEVANT_COLUMNS = ['Unnamed: 0', 'Address']
# Destination CSV for the cleaned frame (written after coordinates and date parts are added).
CLEANED_DATA_FILE_PATH = "./clean_data.csv"

# ORM Stuff
from sqlalchemy import create_engine, inspect, text

In [3]:
# Load the raw CSV.
df = pd.read_csv(RAW_DATA_PATH)

# Raw City values that carry a parenthetical suffix, mapped to the plain city name.
CITY_NAME_FIXES = {
    'Dentsville (Dents)': 'Dentsville',
    'Lexington-Fayette (corporate name for Lexington)': 'Lexington',
    'Indianapolis (Remainder)': 'Indianapolis',
    'Calumet City (PU RR name Calumet Park (sta.))': 'Calumet City',
}

# Clean in a single chain so every step returns a new frame. The previous
# version filtered with df[...] and then assigned through df.loc[...], which
# writes into a slice and can trigger SettingWithCopyWarning.
df = (
    df
    .drop(columns=IRRELEVANT_COLUMNS)
    # Rows with no real city cannot be geocoded later, so drop them.
    .loc[lambda frame: frame['City'] != "Not in a city"]
    # Exact-value replacement; City values not listed in the mapping are unchanged.
    .assign(City=lambda frame: frame['City'].replace(CITY_NAME_FIXES))
)

In [4]:
# One row per distinct (City, State) pair; the first occurrence of each pair is kept.
city_state_columns = ['City', 'State']
unique_cities = df.loc[:, city_state_columns].drop_duplicates()

In [7]:
# Create a geolocator instance with a custom user_agent.
geolocator = Nominatim(user_agent="city_geocoder")

# City name -> (latitude, longitude) for every successful lookup.
# NOTE(review): the key is the city name alone, so two cities with the same
# name in different states overwrite each other and the last one geocoded
# wins. Switching to a (City, State) key would also require updating the
# cell that looks coordinates up by city - confirm before changing.
city_coordinates = {}

# City names the geocoder returned no result for. Defined here so the loop
# does not rely on a variable left over from another (possibly deleted) cell;
# previously the first failed lookup raised NameError on a fresh kernel.
not_found = []

for city, state in zip(unique_cities['City'], unique_cities['State']):
    # Build the query string with city, state, and country.
    query = f"{city}, {state}, USA"
    location = geolocator.geocode(query)

    if location:
        city_coordinates[city] = (location.latitude, location.longitude)
        print(f"{query}: {location.latitude}, {location.longitude}")
    else:
        not_found.append(city)
        print(f"Location not found for {query}")

    # Throttle to about one request per second to be polite to the geocoding service.
    time.sleep(1)

Phoenix, Arizona, USA: 33.4484367, -112.074141
Scottsdale, Arizona, USA: 33.4942189, -111.926018
Tucson, Arizona, USA: 32.2228765, -110.974847
Concord, California, USA: 37.9768525, -122.0335624
Bethel Island, California, USA: 38.029033999999996, -121.64000881834863
San Pablo, California, USA: 37.9621457, -122.3455263
Pittsburg, California, USA: 38.0181745, -121.8901232
Calexico, California, USA: 32.6668134, -115.4963754
Bakersfield, California, USA: 35.3738712, -119.019463
Burbank, California, USA: 34.1812089, -118.307201
Los Angeles, California, USA: 34.0536909, -118.242766
Long Beach, California, USA: 33.7690164, -118.191604
Hawthorne, California, USA: 33.9128272, -118.3426122
Costa Mesa, California, USA: 33.6633386, -117.903317
Rubidoux, California, USA: 33.99431195, -117.42376599221046
Arden-Arcade, California, USA: 38.6039613, -121.38300352999352
Victorville, California, USA: 34.5361067, -117.2911565
Chula Vista, California, USA: 32.6400541, -117.084195
San Diego, California, USA:

In [8]:
city_coordinates

{'Phoenix': (33.4484367, -112.074141),
 'Scottsdale': (33.4942189, -111.926018),
 'Tucson': (32.2228765, -110.974847),
 'Concord': (37.9768525, -122.0335624),
 'Bethel Island': (38.029033999999996, -121.64000881834863),
 'San Pablo': (37.9621457, -122.3455263),
 'Pittsburg': (38.0181745, -121.8901232),
 'Calexico': (32.6668134, -115.4963754),
 'Bakersfield': (35.3738712, -119.019463),
 'Burbank': (34.1812089, -118.307201),
 'Los Angeles': (34.0536909, -118.242766),
 'Long Beach': (33.7690164, -118.191604),
 'Hawthorne': (33.9128272, -118.3426122),
 'Costa Mesa': (33.6633386, -117.903317),
 'Rubidoux': (33.99431195, -117.42376599221046),
 'Arden-Arcade': (38.6039613, -121.38300352999352),
 'Victorville': (34.5361067, -117.2911565),
 'Chula Vista': (32.6400541, -117.084195),
 'San Diego': (32.7174202, -117.162772),
 'San Francisco': (37.7792588, -122.4193286),
 'Capitan': (34.463564, -120.0432487),
 'Lompoc': (34.6391501, -120.4579009),
 'Goleta': (34.4358295, -119.8276389),
 'Vandenberg

In [9]:
# Per-row coordinate lists, built in the same order as the rows of df.
latitude = []
longitude = []

# Cities the geocoder could not resolve have no entry in city_coordinates.
# Give those rows NaN coordinates instead of failing with KeyError.
missing_coordinates = (np.nan, np.nan)

for city in df.City:
    lat, lon = city_coordinates.get(city, missing_coordinates)
    latitude.append(lat)
    longitude.append(lon)

In [13]:
# Attach the `latitude` list (one value per row of df) as a new column.
df['Latitude'] = latitude

In [15]:
# Attach the `longitude` list (one value per row of df) as a new column.
df['Longitude'] = longitude

In [17]:
# Convert "Date" to datetime; values that cannot be parsed become NaT.
# infer_datetime_format is intentionally not passed: it is deprecated in
# pandas 2.x (format inference is the default there) and only emits a warning.
df["Date"] = pd.to_datetime(df["Date"], errors='coerce')

# Split the date into parts. The nullable Int64 dtype keeps the values
# integral even when a row's Date is NaT (plain int would fall back to float).
df["Year"] = df["Date"].dt.year.astype("Int64")
df["Month"] = df["Date"].dt.month.astype("Int64")
df["Day"] = df["Date"].dt.day.astype("Int64")

  df["Date"] = pd.to_datetime(df["Date"], errors='coerce', infer_datetime_format=True)


In [19]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,Date,State,County,City,O3 Mean,O3 1st Max Value,O3 1st Max Hour,O3 AQI,CO Mean,CO 1st Max Value,...,SO2 AQI,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,NO2 AQI,Latitude,Longitude,Year,Month,Day
0,2000-01-01,Arizona,Maricopa,Phoenix,0.019765,0.04,10,37,0.878947,2.2,...,13.0,19.041667,49.0,19,46,33.448437,-112.074141,2000,1,1
1,2000-01-02,Arizona,Maricopa,Phoenix,0.015882,0.032,10,30,1.066667,2.3,...,4.0,22.958333,36.0,19,34,33.448437,-112.074141,2000,1,2
2,2000-01-03,Arizona,Maricopa,Phoenix,0.009353,0.016,9,15,1.7625,2.5,...,16.0,38.125,51.0,8,48,33.448437,-112.074141,2000,1,3
3,2000-01-04,Arizona,Maricopa,Phoenix,0.015882,0.033,9,31,1.829167,3.0,...,23.0,40.26087,74.0,8,72,33.448437,-112.074141,2000,1,4
4,2000-01-05,Arizona,Maricopa,Phoenix,0.007353,0.012,9,11,2.7,3.7,...,21.0,48.45,61.0,22,58,33.448437,-112.074141,2000,1,5


In [21]:
# Persist the cleaned frame. index=False keeps the row index out of the file;
# otherwise it is written as an unnamed first column and comes back as a
# stray "Unnamed: 0" column when the CSV is read again.
df.to_csv(CLEANED_DATA_FILE_PATH, index=False)

In [None]:
# Reload the cleaned dataset from the path configured at the top of the
# notebook, rather than a second hard-coded copy of the file name.
df = pd.read_csv(CLEANED_DATA_FILE_PATH)

# Ensure 'Year' is numeric; values that cannot be converted become NaN and
# are excluded by the range filter below.
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Keep only records where the Year is between 2013 and 2023 (inclusive).
from_2013 = df['Year'] >= 2013
through_2023 = df['Year'] <= 2023
df_filtered = df[from_2013 & through_2023]

# Save the filtered dataset.
df_filtered.to_csv("filtered_data.csv", index=False)

# Show the first rows as the cell's rich output to verify the filter.
df_filtered.head()

In [None]:
# WRITE TO DATABASE

In [None]:
# SQLAlchemy connection URL for the SQLite database file.
database_url = "sqlite:///Pollution.sqlite"
engine = create_engine(database_url)

In [None]:
# Random sample for SPEED.
# - n is capped at len(df) so a frame with fewer than 500 rows does not raise.
# - random_state is fixed so re-running the notebook selects the same rows.
# NOTE(review): this samples from `df` (all years), not `df_filtered` -
# confirm which frame is meant to be written to the database.
df2 = df.sample(n=min(500, len(df)), random_state=42)
df2.head()

In [None]:
# Write the sample to the "pollution" table through the SQLAlchemy engine.
#
# method="multi" packs many rows into one INSERT statement, and SQLite limits
# the number of bound parameters per statement (999 by default in older
# builds). chunksize splits the insert so rows * columns stays under that limit.
#
# NOTE: if_exists="append" adds rows to an existing table, so re-running this
# cell inserts the sample again.
SQLITE_MAX_VARIABLES = 999
rows_per_insert = max(1, SQLITE_MAX_VARIABLES // max(1, len(df2.columns)))
df2.to_sql(
    name="pollution",
    con=engine,
    index=False,
    if_exists="append",
    method="multi",
    chunksize=rows_per_insert,
)

In [None]:
# Bind an inspector to the engine so the database schema can be read back.
inspector = inspect(engine)

# Names of all tables currently in the database.
tables = inspector.get_table_names()

# For each table, print its name, a separator, then every column's name and
# type, followed by a blank line.
for table_name in tables:
    print(table_name)
    print("--------")
    for column_info in inspector.get_columns(table_name):
        print(column_info["name"], column_info["type"])

    print()

In [None]:
# Dispose of the engine now that the database work in this notebook is done.
engine.dispose()