In [None]:
# Import the libraries used in this notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3


In [None]:
# Read the two raw CSV exports into DataFrames.
# Both paths are relative, so the files must sit in the notebook's working directory.
food_df = pd.read_csv(
    filepath_or_buffer="Louisville_Metro_KY_Inspection_Results_Food_Service_Establishments_1200720516040038198.csv"
)
gun_df = pd.read_csv(
    filepath_or_buffer="Gun_Violence_Data_3431704539549171373.csv"
)

In [None]:
# Exploratory overview of a raw DataFrame
def data_overview(df: pd.DataFrame, name: str) -> None:
    """Print a plain-text exploratory summary of ``df``.

    The report starts with ``name`` and the frame's shape, followed by four
    titled sections: column dtypes, per-column null counts, the first five
    rows, and ``describe(include='all')`` statistics.

    Args:
        df: Frame to summarise.
        name: Label printed as the first line of the report.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"{name}")
    print("Shape:", df.shape)

    # Each section is (heading, producer). Producers are evaluated lazily so a
    # section is only computed right before it is printed.
    report_sections = (
        ("\nColumn Types:", lambda: df.dtypes),
        ("\nMissing Values:", lambda: df.isnull().sum()),
        ("\nFirst 5 Rows:", lambda: df.head()),
        ("\nSummary Statistics:", lambda: df.describe(include='all')),
    )
    for heading, produce in report_sections:
        print(heading)
        print(produce())

In [None]:
# Print the exploratory overview for each raw dataset
data_overview(df=food_df, name="Food Inspection Data")
data_overview(df=gun_df, name="Gun Violence Data")

In [None]:
# Clean Food Data
def clean_food_df(df: pd.DataFrame, min_date: str = '2020-01-01') -> pd.DataFrame:
    """Return a cleaned copy of the food-inspection data.

    Steps, in order:
      1. Lower-case column names and replace spaces with underscores.
      2. Derive ``zip_code`` from ``premise_zip`` as a zero-padded string of at
         least five characters. Missing or blank ZIPs stay missing (they are
         not turned into the text ``'nan'``), and the ``.0`` suffix that
         float-typed ZIPs acquire is removed.
      3. Parse ``daterecent`` and ``opening_date`` as datetimes; unparseable
         values become ``NaT``.
      4. Drop rows missing ``scorerecent``, ``graderecent``, ``zip_code``,
         ``daterecent`` or ``premise_address``.
      5. Keep only rows whose ``daterecent`` is on or after ``min_date``.
      6. Normalise ``facility_type`` (trimmed, title-cased; missing or blank
         becomes ``'Unknown'``) and fill missing city, state and name values.

    Args:
        df: Raw inspection frame. It is not modified.
        min_date: Earliest ``daterecent`` to keep (inclusive). Anything
            ``pd.Timestamp`` accepts; defaults to ``'2020-01-01'``.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index.

    Raises:
        KeyError: If an expected column is absent after renaming.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()

    # Column names are lowercase and use underscores instead of spaces.
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    # Build 'zip_code' from 'premise_zip'. A plain astype(str) would turn NaN
    # into 'nan' (padded to '00nan') and 40202.0 into '40202.0', so missing
    # values are masked back to NaN and the float suffix is stripped.
    raw_zip = df['premise_zip']
    zip_text = raw_zip.astype(str).str.strip().str.replace(r'\.0+$', '', regex=True)
    zip_missing = raw_zip.isna() | zip_text.eq('')
    df['zip_code'] = zip_text.str.zfill(5).mask(zip_missing)

    # Change 'daterecent' and 'opening_date' to proper datetime format.
    df['daterecent'] = pd.to_datetime(df['daterecent'], errors='coerce')
    df['opening_date'] = pd.to_datetime(df['opening_date'], errors='coerce')

    # Remove rows missing any of the required fields.
    df = df.dropna(
        subset=['scorerecent', 'graderecent', 'zip_code', 'daterecent', 'premise_address']
    )

    # Keep only inspections on or after the cutoff. If the parsed dates are
    # timezone-aware, localise the cutoff to the same zone so the comparison
    # does not raise a TypeError.
    cutoff = pd.Timestamp(min_date)
    column_tz = getattr(df['daterecent'].dtype, 'tz', None)
    if column_tz is not None and cutoff.tzinfo is None:
        cutoff = cutoff.tz_localize(column_tz)
    # The explicit copy makes the assignments below write to an independent
    # frame rather than a slice (avoids SettingWithCopyWarning).
    df = df.loc[df['daterecent'] >= cutoff].copy()

    # Trim and title-case 'facility_type'; missing, empty, or whitespace-only
    # values become 'Unknown'.
    raw_type = df['facility_type']
    type_text = raw_type.astype(str).str.strip()
    type_missing = raw_type.isna() | type_text.eq('')
    df['facility_type'] = type_text.mask(type_missing, 'Unknown').str.title()

    # Fill missing city and state values with 'Unknown'.
    df['premise_city'] = df['premise_city'].fillna('Unknown')
    df['premise_state'] = df['premise_state'].fillna('Unknown')

    # Fill missing facility names with 'Unnamed Facility'.
    df['premise_name'] = df['premise_name'].fillna('Unnamed Facility')

    # Reset the index and return the cleaned DataFrame.
    return df.reset_index(drop=True)

In [None]:
# Clean Gun Data
def clean_gun_df(df: pd.DataFrame, min_date: str = '2020-01-01') -> pd.DataFrame:
    """Return a cleaned copy of the gun-violence data.

    Steps, in order:
      1. Lower-case column names and replace spaces with underscores.
      2. Rewrite ``zip_code`` as a zero-padded string of at least five
         characters. Missing or blank ZIPs stay missing (they are not turned
         into the text ``'nan'``), and the ``.0`` suffix that float-typed ZIPs
         acquire is removed.
      3. Parse ``datetime``; unparseable values become ``NaT``.
      4. Drop rows missing ``zip_code``, ``datetime``, ``latitude``,
         ``longitude`` or ``crime_type``.
      5. Keep only rows whose ``datetime`` is on or after ``min_date``.
      6. Normalise ``sex``, ``race`` and ``age_group`` (trimmed, title-cased;
         missing or blank becomes ``'Unknown'``) and fill missing
         ``neighborhood`` and ``cause`` with ``'Unknown'``.
      7. Add ``hour_of_day`` (0-23) taken from ``datetime``.

    Args:
        df: Raw incident frame. It is not modified.
        min_date: Earliest ``datetime`` to keep (inclusive). Anything
            ``pd.Timestamp`` accepts; defaults to ``'2020-01-01'``.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index.

    Raises:
        KeyError: If an expected column is absent after renaming.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()

    # Column names are lowercase and use underscores instead of spaces.
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    # Format ZIP codes as zero-padded strings. A plain astype(str) would turn
    # NaN into 'nan' (padded to '00nan') and 40211.0 into '40211.0', so missing
    # values are masked back to NaN and the float suffix is stripped.
    raw_zip = df['zip_code']
    zip_text = raw_zip.astype(str).str.strip().str.replace(r'\.0+$', '', regex=True)
    zip_missing = raw_zip.isna() | zip_text.eq('')
    df['zip_code'] = zip_text.str.zfill(5).mask(zip_missing)

    # Convert the 'datetime' column to datetime format.
    df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

    # Remove rows missing any of the required fields.
    df = df.dropna(subset=['zip_code', 'datetime', 'latitude', 'longitude', 'crime_type'])

    # Keep only incidents on or after the cutoff. If the parsed timestamps are
    # timezone-aware, localise the cutoff to the same zone so the comparison
    # does not raise a TypeError.
    cutoff = pd.Timestamp(min_date)
    column_tz = getattr(df['datetime'].dtype, 'tz', None)
    if column_tz is not None and cutoff.tzinfo is None:
        cutoff = cutoff.tz_localize(column_tz)
    # The explicit copy makes the assignments below write to an independent
    # frame rather than a slice (avoids SettingWithCopyWarning).
    df = df.loc[df['datetime'] >= cutoff].copy()

    # Trim and title-case the demographic columns; missing, empty, or
    # whitespace-only values become 'Unknown'.
    for col in ['sex', 'race', 'age_group']:
        raw_values = df[col]
        text = raw_values.astype(str).str.strip()
        is_blank = raw_values.isna() | text.eq('')
        df[col] = text.mask(is_blank, 'Unknown').str.title()

    # Fill missing neighborhood names with 'Unknown'.
    df['neighborhood'] = df['neighborhood'].fillna('Unknown')

    # Fill missing cause descriptions with 'Unknown'.
    df['cause'] = df['cause'].fillna('Unknown')

    # Create a new column for the hour of the day (0-23) from the datetime.
    df['hour_of_day'] = df['datetime'].dt.hour

    # Reset the index and return the cleaned DataFrame.
    return df.reset_index(drop=True)