In [None]:
#Import Libraries
import pandas as pd
from sqlalchemy import create_engine
import os

db_url = os.getenv("DATABASE_URL")

In [2]:
df = pd.read_csv("Combined_Houston_311.csv")

print(df.head())
print(df.shape)
print(df.info())

  df = pd.read_csv("Combined_Houston_311.csv")


             CASE NUMBER                              NEIGHBORHOOD  \
0  12091834-101002444724              EAST LITTLE YORK / HOMESTEAD   
1  12091835-101002444725                       NORTHSIDE/NORTHLINE   
2           101002444726                                  MID WEST   
3           101002444727  WASHINGTON AVENUE COALITION / MEMORIAL P   
4    169116-101002444728                                   Unknown   

                     DEPARTMENT                           DIVISION  \
0  PWE Public Works Engineering                PU Public Utilities   
1  PWE Public Works Engineering                PU Public Utilities   
2  PWE Public Works Engineering                 Traffic Operations   
3  PWE Public Works Engineering  PDS Planning Development Services   
4                  311 HelpLine                  311 Call Handling   

                            CASE TYPE         CREATED DATE  \
0                        Fire Hydrant  2017-01-01 00:01:48   
1                        Fire Hydr

In [3]:
df.columns

Index(['CASE NUMBER', 'NEIGHBORHOOD', 'DEPARTMENT', 'DIVISION', 'CASE TYPE',
       'CREATED DATE', 'CLOSED DATE', 'LATITUDE', 'LONGITUDE'],
      dtype='object')

In [4]:
#look for duplicates
duplicates = df[df.duplicated(subset=["CASE NUMBER"], keep=False)]
print(duplicates)

                CASE NUMBER                              NEIGHBORHOOD  \
3580     42679-101002448917                        GREATER FIFTH WARD   
3581     42679-101002448917                        GREATER FIFTH WARD   
3584           101002448920                        GREATER FIFTH WARD   
3585           101002448920                        GREATER FIFTH WARD   
49121          101002501751  WASHINGTON AVENUE COALITION / MEMORIAL P   
...                     ...                                       ...   
3670303          2400354189                         CENTRAL SOUTHWEST   
3670304          2400354189                         CENTRAL SOUTHWEST   
3670305          2400354188                                  MINNETEX   
3670306          2400354188                                  MINNETEX   
3835048                 NaN                                       NaN   

                           DEPARTMENT                           DIVISION  \
3580         NS Neighborhood Services          

In [5]:
#drops rows with a case number but has null vales everywhere else
df = df.dropna(subset=df.columns.difference(["CASE NUMBER"]), how="all")

#keep last duplicate entry
df = df.drop_duplicates(subset=["CASE NUMBER"], keep="last")

In [6]:
df.dtypes

CASE NUMBER     object
NEIGHBORHOOD    object
DEPARTMENT      object
DIVISION        object
CASE TYPE       object
CREATED DATE    object
CLOSED DATE     object
LATITUDE        object
LONGITUDE       object
dtype: object

In [7]:
#converting dates to datetime format
date_cols = ["CREATED DATE", "CLOSED DATE"]
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")  # invalid : NaT

#converting numeric columns to numeric types
num_cols = ["LATITUDE", "LONGITUDE"]
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")  # invalid : NaN

#convert categorial columns into categories
cat_cols = ['CASE NUMBER', 'NEIGHBORHOOD', 'DEPARTMENT', 'DIVISION', 'CASE TYPE']
for col in cat_cols:
    df[col] = df[col].astype("category")

print(df.tail)

<bound method NDFrame.tail of                    CASE NUMBER                              NEIGHBORHOOD  \
0        12091834-101002444724              EAST LITTLE YORK / HOMESTEAD   
1        12091835-101002444725                       NORTHSIDE/NORTHLINE   
2                 101002444726                                  MID WEST   
3                 101002444727  WASHINGTON AVENUE COALITION / MEMORIAL P   
4          169116-101002444728                                   Unknown   
...                        ...                                       ...   
3835043      217348-2400490914                                  WESTBURY   
3835044    12670427-2400490912                 GREATER OST / SOUTH UNION   
3835045             2400490911                                     ALIEF   
3835046    20480340-2400490910                                NORTHSHORE   
3835047    12670426-2400490909                       NORTHSIDE/NORTHLINE   

                           DEPARTMENT                    

In [8]:
#Confirm data types
df.dtypes

CASE NUMBER           category
NEIGHBORHOOD          category
DEPARTMENT            category
DIVISION              category
CASE TYPE             category
CREATED DATE    datetime64[ns]
CLOSED DATE     datetime64[ns]
LATITUDE               float64
LONGITUDE              float64
dtype: object

In [9]:
# filter rows outside Houston bounds
outside_houston = df[
    ~df['LATITUDE'].between(29.5, 30.1) |
    ~df['LONGITUDE'].between(-95.9, -94.9)
]

# count how many
print(f"Number of reports outside Houston: {len(outside_houston)}")

# drop them
df = df.drop(outside_houston.index)
print(df.head())
print(df.shape) 

Number of reports outside Houston: 34148
             CASE NUMBER                              NEIGHBORHOOD  \
0  12091834-101002444724              EAST LITTLE YORK / HOMESTEAD   
1  12091835-101002444725                       NORTHSIDE/NORTHLINE   
2           101002444726                                  MID WEST   
3           101002444727  WASHINGTON AVENUE COALITION / MEMORIAL P   
6  12091836-101002444730                            GREATER UPTOWN   

                     DEPARTMENT                           DIVISION  \
0  PWE Public Works Engineering                PU Public Utilities   
1  PWE Public Works Engineering                PU Public Utilities   
2  PWE Public Works Engineering                 Traffic Operations   
3  PWE Public Works Engineering  PDS Planning Development Services   
6  PWE Public Works Engineering                PU Public Utilities   

                            CASE TYPE        CREATED DATE         CLOSED DATE  \
0                        Fire Hydran

In [10]:
for case_type in df["CASE TYPE"].unique():
    print(case_type)


Fire Hydrant
Traffic Signal Maintenance
MultiFamily Habitability Violation
Water Leak
Water Main Valve
Sewer Wastewater
Water Service
Graffiti Private or Commercial Property
Dead Animal Collection
Street Hazard
PWE CIP Project
Street Condition
Heavy Trash Violation
Parking Violation
Pothole
Missed Garbage Pickup
Trash Dumping or Illegal Dumpsite
Restoration Due To Utility Work
Amenity New
Container Problem
City Engineer
New Resident Container
Recycling Participation NEW
Nuisance On Property
Traffic Signs
Traffic Markings
Building or Sign Code
Drainage
Recycling Cart Repair or Replace
Junk Motor Vehicle
Traffic Signals
Water Quality
Traffic Bridge/Freeway Lighting
Unclassified 311 Web Request
New Move In Service
Missed Heavy Trash Pickup
Flooding
Missed Recycling Pickup
Graffiti - City Street/Curb
Crisis Cleanup
Storm Debris Collection
Tree Trim
Sewer Manhole
Parking Meter
Traffic General
Recycle Bin/Cart Retrieve
Tree Removal
Missed Yard Waste Pickup
Add A Can
Bridge and Barricade
Tree

In [11]:
# mapping dictionary
mapping = {
    # Water Service & Leaks
    "Water Leak": "Water Service & Leaks",
    "Major Water Leak": "Water Service & Leaks",
    "Minor Water Leak": "Water Service & Leaks",
    "Water Main Valve": "Water Service & Leaks",
    "Water Service": "Water Service & Leaks",
    "Water Meter": "Water Service & Leaks",
    "Meter Investigation": "Water Service & Leaks",
    "Private Leak Violation": "Water Service & Leaks",
    "Water Quality": "Water Service & Leaks",
    "Drinking Water Public Notice": "Water Service & Leaks",
    "Water Shortage Violation": "Water Service & Leaks",
    "Water Adjustment Board Review": "Water Service & Leaks",
    "Restoration due to Meter Work": "Water Service & Leaks",
    'Clinton Water Leak': "Water Service & Leaks",
    'Restoration Due to Utility Work': "Water Service & Leaks",
    'Set Usage Water Billing': "Water Service & Leaks",
    'Water/Sewer/Drainage Billing': "Water Service & Leaks",
    'Fountain Repair': "Water Service & Leaks",
    'Fountain Repair Urgent': "Water Service & Leaks",
    'Water Playground Repair': "Water Service & Leaks",
    'Pool Water Quality Control': "Water Service & Leaks",

    # Sewer & Wastewater
    "Sewer Wastewater": "Sewer & Wastewater",
    "Sewer Wastewater Repair": "Sewer & Wastewater",
    "Sewer Manhole": "Sewer & Wastewater",

    # Drainage & Flooding
    "Drainage": "Drainage & Flooding",
    "Poor Drainage": "Drainage & Flooding",
    "Flooding": "Drainage & Flooding",
    "Drainage System Violation": "Drainage & Flooding",
    "Floodplain": "Drainage & Flooding",
    "Storm Debris Collection": "Drainage & Flooding",
    "Spilled Debris": "Drainage & Flooding",

    # Trash & Recycling
    "Missed Garbage Pickup": "Trash & Recycling",
    "Missed Recycling Pickup": "Trash & Recycling",
    "Missed Yard Waste Pickup": "Trash & Recycling",
    "Recycling Cart Repair": "Trash & Recycling",
    "Recycling Cart Replace": "Trash & Recycling",
    "Recycling Participation NEW": "Trash & Recycling",
    "Recycling Program Start Up": "Trash & Recycling",
    "Recycle Bin/Cart Retrieve": "Trash & Recycling",
    "Recycle Mascot Appearance": "Trash & Recycling",
    "Add A Can": "Trash & Recycling",
    "Add a Can": "Trash & Recycling",
    "Add A Can CANCELLATION": "Trash & Recycling",
    "Add A Can Cancellation": "Trash & Recycling",
    "Add A Cart": "Trash & Recycling",
    "Add A Cart Cancellation": "Trash & Recycling",
    "Add A Cart CANCELLATION": "Trash & Recycling",
    "Add A Bin": "Trash & Recycling",
    "Container Problem": "Trash & Recycling",
    "Container Repair": "Trash & Recycling",
    "Container Replacement": "Trash & Recycling",
    "Container Placement": "Trash & Recycling",
    "Trash Can New": "Trash & Recycling",
    "New Resident Container": "Trash & Recycling",
    "Non Residential Collection Service NEW": "Trash & Recycling",
    "Non Residential Collection CANCEL": "Trash & Recycling",
    "Automated Recycling for Business NEW": "Trash & Recycling",
    'Add a Can Cancellation': "Trash & Recycling",
    'Recycle Program Start Up': "Trash & Recycling",
    'Recycling Cart Repair or Replace': "Trash & Recycling",
    'Order Bag Tags': "Trash & Recycling",

    # Heavy Trash & Code Violations
    "Missed Heavy Trash Pickup": "Heavy Trash & Code Violations",
    "Heavy Trash Violation": "Heavy Trash & Code Violations",
    "Heavy Trash Code Violation": "Heavy Trash & Code Violations",
    "Junk Motor Vehicle - Private Property": "Heavy Trash & Code Violations",
    "Weeds/Trash/Stagnant Water on Property": "Heavy Trash & Code Violations",

    # Road & Street Maintenance
    "Pothole": "Road & Street Maintenance",
    "Street Hazard": "Road & Street Maintenance",
    "Street Condition": "Road & Street Maintenance",
    "Bridge and Barricade": "Road & Street Maintenance",
    "Roadway Icing": "Road & Street Maintenance",
    "Icing Bridge": "Road & Street Maintenance",
    "Icing Roadway": "Road & Street Maintenance",
    "Restoration Due To Utility Work": "Road & Street Maintenance",
    "Parking Lot Pothole": "Road & Street Maintenance",
    "Street Assessment": "Road & Street Maintenance",
    'SPARK Park': "Parks & Recreation",
    'Clean Pavilion or Restroom': "Parks & Recreation",
    'Dugout Repair': "Parks & Recreation",
    'Soccer Goal': "Parks & Recreation",
    'Portacan Problem': "Parks & Recreation",
    'Sprinkler Repair': "Parks & Recreation",

    # Traffic Signals & Signs
    "Traffic Signal Maintenance": "Traffic Signals & Signs",
    "Traffic Signals": "Traffic Signals & Signs",
    "Traffic Signs": "Traffic Signals & Signs",
    "Traffic Markings": "Traffic Signals & Signs",
    "Flashing School Beacon": "Traffic Signals & Signs",
    "Malfunctioning Railroad Signal": "Traffic Signals & Signs",
    "Report Train Noise": "Traffic Signals & Signs",
    "Blocked Train Crossing": "Traffic Signals & Signs",
    'Traffic General': "Traffic Signals & Signs",

    # Street Lighting
    "Lighting": "Street Lighting",
    "Traffic Bridge/Freeway Lighting": "Street Lighting",

    # Parking Enforcement
    "Parking Violation": "Parking Enforcement",
    "Parking Meter": "Parking Enforcement",
    "Parking Escalation": "Parking Enforcement",
    "Parking Enforcement": "Parking Enforcement",

    # Trees & Forestry
    "Tree Trim": "Trees & Forestry",
    "Tree Removal": "Trees & Forestry",
    "Tree Code Violation": "Trees & Forestry",
    "Tree Stump Grinding": "Trees & Forestry",
    "Tree Planting": "Trees & Forestry",
    "Severe Storm Tree Removal": "Trees & Forestry",

    # Parks & Recreation
    "Amenity Repair": "Parks & Recreation",
    "Amenity New": "Parks & Recreation",
    "Playground or Court Equipment": "Parks & Recreation",
    "Playground or Court Equipment Repair": "Parks & Recreation",
    "Park Facilities Escalation": "Parks & Recreation",
    "Park Greenspace Escalation": "Parks & Recreation",
    "Park Forestry Escalation": "Parks & Recreation",
    "Mow Park": "Parks & Recreation",
    "Mow Esplanade or Median": "Parks & Recreation",
    "Litter Park": "Parks & Recreation",
    "Litter Esplanade Median": "Parks & Recreation",

    # Building & Code Enforcement
    "Building Code Violation": "Building & Code Enforcement",
    "Building or Sign Code": "Building & Code Enforcement",
    "Health Code": "Building & Code Enforcement",
    "Sign Code Violation": "Building & Code Enforcement",
    "Minimum Standards": "Building & Code Enforcement",
    "Dangerous Commercial Building": "Building & Code Enforcement",
    "Occupancy Violation": "Building & Code Enforcement",
    'Boarding Homes': "Building & Code Enforcement",
    'Donation Box Violation': "Building & Code Enforcement",
    'Dumpster Complaint': "Building & Code Enforcement",
    'Dumpster Permit': "Building & Code Enforcement",
    'Fire Code Complaint': "Building & Code Enforcement",
    'Electrical Hazard': "Building & Code Enforcement",
    'Pool Fence': "Building & Code Enforcement",
    'Property Damage': "Building & Code Enforcement",
    'Identify Numbered Address': "Building & Code Enforcement",
    'Building Break In' : "Building & Code Enforcement",
    'Ashby Highrise Concern' : "Building & Code Enforcement",
    'Dangerous Buildings' : "Building & Code Enforcement",
    'HPW CCE Escalation' : "Building & Code Enforcement",
    'Minimum Standards - Residence' : "Building & Code Enforcement",

    # Illegal Dumping & Nuisance
    "Trash Dumping or Illegal Dumpsite": "Illegal Dumping & Nuisance",
    "Nuisance On Property": "Illegal Dumping & Nuisance",
    "Nuisance on Commercial Property": "Illegal Dumping & Nuisance",
    "Junk Motor Vehicle": "Illegal Dumping & Nuisance",
    "Bandit Sign": "Illegal Dumping & Nuisance",

    # Environmental & Pollution
    "Air Pollution": "Environmental & Pollution",
    "Water or Ground Pollution": "Environmental & Pollution",
    "Environmental Health Escalation": "Environmental & Pollution",
    "Fuel Island" : "Environmental & Pollution",

    # Storm Damage & Disaster Recovery
    "Storm Damage": "Storm Damage & Disaster Recovery",
    "Disaster Recovery": "Storm Damage & Disaster Recovery",
    "Disaster Recovery Escalation": "Storm Damage & Disaster Recovery",
    "Crisis Cleanup": "Storm Damage & Disaster Recovery",
    "Beryl Power Outage": "Storm Damage & Disaster Recovery",
    "Evacuation Transportation" : "Storm Damage & Disaster Recovery",
    "Medical Evacuation" : "Storm Damage & Disaster Recovery",
    "Reunification Missing" : "Storm Damage & Disaster Recovery",
    "Disaster Preparedness Guide Request" : "Storm Damage & Disaster Recovery",
    "Gessner Explosion" : "Storm Damage & Disaster Recovery",

    # Public Works Projects
    "PWE CIP Project": "Public Works Projects",
    "PW CIP Project": "Public Works Projects",
    "City Engineer": "Public Works Projects",
    "PWE Escalation": "Public Works Projects",
    'New CIP Request': "Public Works Projects",
    'SWM Escalation': "Public Works Projects",
    'HPW Escalation': "Public Works Projects",

    # Neighborhood Services
    "Neighborhood Clean up": "Neighborhood Services",
    "Neighborhood Updates Subscribe": "Neighborhood Services",
    "Neighborhood Updates Unsubscribe": "Neighborhood Services",
    "Neighborhoods Updates - Subscribe": "Neighborhood Services",
    "Neighborhoods Updates - Unsubscribe": "Neighborhood Services",
    "New Move In Service": "Neighborhood Services",
    "New Resident in Private Development": "Neighborhood Services",

    # Animal & Pest Control
    "Dead Animal Collection": "Animal & Pest Control",
    "Pest Control": "Animal & Pest Control",

    # Public Health & Housing
    "MultiFamily Habitability Violation": "Public Health & Housing",
    "Unregulated Boarding House": "Public Health & Housing",
    "Unregulated Residential Facility": "Public Health & Housing",
    "Crisis Housing": "Public Health & Housing",
    "Eviction Notice": "Public Health & Housing",
    "Multi-Resident Facility": "Public Health & Housing",

    # Administrative & Mayorâ€™s Office
    "MYR HR": "Administrative & Mayorâ€™s Office",
    "MYR PW": "Administrative & Mayorâ€™s Office",
    "MYR HPD": "Administrative & Mayorâ€™s Office",
    "MYR HFD": "Administrative & Mayorâ€™s Office",
    "MYR HLT": "Administrative & Mayorâ€™s Office",
    "MYR NS": "Administrative & Mayorâ€™s Office",
    "MYR MCD": "Administrative & Mayorâ€™s Office",
    "MYR LGL": "Administrative & Mayorâ€™s Office",
    "MYR GSD": "Administrative & Mayorâ€™s Office",
    "MYR IT": "Administrative & Mayorâ€™s Office",
    "MYR Communications": "Administrative & Mayorâ€™s Office",
    "MYR AVA": "Administrative & Mayorâ€™s Office",
    "MYR OBO": "Administrative & Mayorâ€™s Office",
    "MYR HCD": "Administrative & Mayorâ€™s Office",
    "MYR PD": "Administrative & Mayorâ€™s Office",
    "MYR LIB": "Administrative & Mayorâ€™s Office",
    "MYR FIN": "Administrative & Mayorâ€™s Office",
    "MYR Correspondence": "Administrative & Mayorâ€™s Office",
    'MYR PR': "Administrative & Mayorâ€™s Office",
    "MAO General": "Administrative & Mayorâ€™s Office",
    "MAO Meeting": "Administrative & Mayorâ€™s Office",
    "MAO Events": "Administrative & Mayorâ€™s Office",
    "MAO Public Session": "Administrative & Mayorâ€™s Office",
    'MYR ARA': "Administrative & Mayorâ€™s Office",
    'MYR HEC': "Administrative & Mayorâ€™s Office",
    'MYR SWM': "Administrative & Mayorâ€™s Office",
    'MYR Public Safety': "Administrative & Mayorâ€™s Office",
    'MOPD Disability Office': "Administrative & Mayorâ€™s Office",
    'MOPD Events': "Administrative & Mayorâ€™s Office",
    'MOPD General': "Administrative & Mayorâ€™s Office",
    'MOPD Meetings': "Administrative & Mayorâ€™s Office",
    'Vital Statistics Escalation': "Administrative & Mayorâ€™s Office",
    'MYR HPD Illegal Dumping' : "Administrative & Mayorâ€™s Office",
    'MYR 311' : "Administrative & Mayorâ€™s Office",
    'MYR CAO' : "Administrative & Mayorâ€™s Office",
    'MYR HPW CCE' : "Administrative & Mayorâ€™s Office",

    # Complaints / Liaison
    "Employee Complaint": "Complaints / Liaison",
    "Liaison Note": "Complaints / Liaison",
    "Liaison Comm": "Complaints / Liaison",
    "Follow-up": "Complaints / Liaison",
    "Miss Complaint": "Complaints / Liaison",
    "Human Trafficking Prevention Violation": "Complaints / Liaison",

    # Transportation & Safety
    "Traffic Applications": "Transportation & Safety",
    "Traffic Programs": "Transportation & Safety",
    "Traffic Safety": "Transportation & Safety",
    "Traffic School Zones": "Transportation & Safety",
    "Traffic School Zone": "Transportation & Safety",
    "Speed Cushion Repair": "Transportation & Safety",
    "Bike Lane Maintenance": "Transportation & Safety",
    'Vehicle for Hire Complaint': "Transportation & Safety",
    'Bandit Sign - Right of Way': "Transportation & Safety",

    # Public Infrastructure / Engineering
    "Post Replace or Install": "Public Infrastructure / Engineering",
    "Fence Repair": "Public Infrastructure / Engineering",
    "Court Surface Repair": "Public Infrastructure / Engineering",
    'Fire Hydrant': "Public Infrastructure / Engineering",
    'Hole in Unpaved Ground': "Public Infrastructure / Engineering",
    'Sidewalk Repair': "Public Infrastructure / Engineering",
    'Paint Stripes': "Public Infrastructure / Engineering",
    'MayorSidewalk': "Public Infrastructure / Engineering",
    'Trail Repair': "Public Infrastructure / Engineering",
    'Pothole Resolution Contact': "Public Infrastructure / Engineering",
    'House Move Route': "Public Infrastructure / Engineering",

    # Other / Unclassified
    "Unclassified 311 Web Request": "Other / Unclassified",
    "Other": "Other / Unclassified",
    "Test": "Other / Unclassified",
    '311 Other': "Other / Unclassified",
    'Case Update': "Other / Unclassified",
    'DPW': "Other / Unclassified",
    'TPIA': "Other / Unclassified",
    'Subpoena': "Other / Unclassified",
    'Administrative Hearing': "Other / Unclassified",
    'Administrative Review': "Other / Unclassified",
    'Language Interpretation': "Other / Unclassified",
    'Complaint Form Request': "Other / Unclassified",
    'GRO Inquiry': "Other / Unclassified",
    'Neighborhoods Escalation': "Other / Unclassified",
}

# Apply the mapping dictionary to create a new 'Category' column
df["CATEGORY"] = df["CASE TYPE"].map(mapping).fillna("Uncategorized")

# Graffiti Mapping
df.loc[df["CASE TYPE"].str.contains("Graffiti", case=False), "CATEGORY"] = "Graffiti"

# Check which CASE TYPEs are still uncategorized
uncategorized = df.loc[df["CATEGORY"] == "Uncategorized", "CASE TYPE"].unique()

# Quick summary
print(sorted(df['CATEGORY'].unique()))
print(sorted(uncategorized))

['Administrative & Mayorâ€™s Office', 'Animal & Pest Control', 'Building & Code Enforcement', 'Complaints / Liaison', 'Drainage & Flooding', 'Environmental & Pollution', 'Graffiti', 'Heavy Trash & Code Violations', 'Illegal Dumping & Nuisance', 'Neighborhood Services', 'Other / Unclassified', 'Parking Enforcement', 'Parks & Recreation', 'Public Health & Housing', 'Public Infrastructure / Engineering', 'Public Works Projects', 'Road & Street Maintenance', 'Sewer & Wastewater', 'Storm Damage & Disaster Recovery', 'Street Lighting', 'Traffic Signals & Signs', 'Transportation & Safety', 'Trash & Recycling', 'Trees & Forestry', 'Water Service & Leaks']
[]


In [12]:
num_missing = df["CASE TYPE"].isna().sum()
print(f"Number of missing neighborhoods: {num_missing}")

Number of missing neighborhoods: 0


In [13]:
df["Year"] = df["CREATED DATE"].dt.year

cols = ["DEPARTMENT", "DIVISION", "NEIGHBORHOOD"]

new_after_2021 = {}

for col in cols:
    pre_2021 = set(df.loc[df["Year"] < 2021, col].dropna().unique())
    post_2021 = set(df.loc[df["Year"] >= 2021, col].dropna().unique())

    new_after_2021[col] = sorted(list(post_2021 - pre_2021))

gone_after_2021 = {}

for col in cols:
    pre_2021 = set(df.loc[df["Year"] < 2021, col].dropna().unique())
    post_2021 = set(df.loc[df["Year"] > 2021, col].dropna().unique())

    gone_after_2021[col] = sorted(list(pre_2021 - post_2021))

for label, results in new_after_2021.items():
    print(f"\nðŸ”¹ {label} â€” New after 2021:")
    print(results or "None")

for label, results in gone_after_2021.items():
    print(f"\nðŸ”¸ {label} â€” Gone after 2021:")
    print(results or "None")



ðŸ”¹ DEPARTMENT â€” New after 2021:
['Administration and Regulatory Affairs', 'Emergency Management', 'Fleet Management', 'General Services', 'Health', 'Housing Community Development', 'Houston Fire Department', 'Houston Police Department', 'Legal', 'METRO Metropolitan Transportation Authority', 'Mayor Office', 'Neighborhoods', 'Parking Management', 'Parks and Recreation', 'Public Works', 'Solid Waste Management']

ðŸ”¹ DIVISION â€” New after 2021:
['Capital Projects', 'Capitol Improvement Planning', 'Community Code Enforcement', 'Customer Account Services', 'Customer Satisfaction', 'Emergency Evacuation', 'Emergency Transportation', 'Facilities and Maintenance', 'Government Relationship Office', 'Houston Permitting Center', 'Houston Water', 'Inspector General', 'MCI Event', 'Mayor Mail', "Mayor's Assistance Office", 'People With Disabilities', 'Public Information Office', 'Transportation Drainage Operations']

ðŸ”¹ NEIGHBORHOOD â€” New after 2021:
['FB CAD #6', 'Greater Greenspoint M

In [14]:
dept_rename = {
    "ARA Administration and Regulatory Affair": "Administration and Regulatory Affairs",
    "EM Emergency Management": "Emergency Management",
    "FLT Fleet Management": "Fleet Management",
    "GS General Services": "General Services",
    "HLT Health": "Health",
    "HCD Housing Community Development": "Housing Community Development",
    "HFD Houston Fire Department": "Houston Fire Department",
    "HPD Houston Police Department": "Houston Police Department",
    "NS Neighborhood Services": "Neighborhoods",
    "OIG Office of Inspector General" : "Legal",
    "PM Parking Management": "Parking Management",
    "PR Parks and Recreation": "Parks and Recreation",
    "PWE Public Works Engineering": "Public Works",
    "SWM Solid Waste Management": "Solid Waste Management",
}

df["DEPARTMENT"] = df["DEPARTMENT"].replace(dept_rename)


  df["DEPARTMENT"] = df["DEPARTMENT"].replace(dept_rename)


In [15]:
div_rename = {
    "EC Engineering Construction": "Capital Projects",
    "Evacuation": "Emergency Evacuation",
    "FM Facilities and Maintenance": "Facilities and Maintenance",
    "PIO Public Information Office": "Public Information Office",
    "Legal": "Inspector General",
    "PDS Planning Development Services": "Capitol Improvement Planning",
    "PU Public Utilities": "Houston Water",
    "Street and Drainage": "Transportation Drainage Operations",
    "Traffic Operations": "Transportation Drainage Operations",
    "Community Improvement": "Capitol Improvement Planning"
}

df["DIVISION"] = df["DIVISION"].replace(div_rename)

  df["DIVISION"] = df["DIVISION"].replace(div_rename)


In [16]:
num_missing = df["CASE TYPE"].isna().sum()
print(f"Number of missing neighborhoods: {num_missing}")

Number of missing neighborhoods: 0


In [17]:
# Apply the Neighborhood mapping
df['NEIGHBORHOOD'] = df['NEIGHBORHOOD'].replace({
    'HARRISBURG / MANCHESTER / SMITH ADDITION': 'HARRISBURG / MANCHESTER',
    'BRIARFOREST AREA': 'BRIAR FOREST',
    'BRAESWOOD PLACE': 'BRAESWOOD',
    'NORTHSIDE VILLAGE': 'NEAR NORTHSIDE',
    'OST / SOUTH UNION' : 'GREATER OST / SOUTH UNION',
    'WASHINGTON AVENUE COALITION / MEMORIAL P' : 'WASHINGTON AVENUE COALITION / MEMORIAL PARK',
    'WILLOW MEADOWS / WILLOWBEND AREA': 'NEAR SOUTHWEST'
})

  df['NEIGHBORHOOD'] = df['NEIGHBORHOOD'].replace({


In [18]:
num_missing = df["CASE TYPE"].isna().sum()
print(f"Number of missing neighborhoods: {num_missing}")

Number of missing neighborhoods: 0


In [19]:
num_unknown = (df["NEIGHBORHOOD"] == "Unknown").sum()
print(num_unknown)

0


In [20]:
bad_departments = ['Test', 'Aviation']
bad_divisions = ['Test']
bad_neighborhoods = ['Unknown', 'FB CAD #6', 'Greater Greenspoint MD']

to_drop = df[
    df["DEPARTMENT"].isin(bad_departments) |
    df["DIVISION"].isin(bad_divisions) |
    df["NEIGHBORHOOD"].isin(bad_neighborhoods)
].index

df = df.drop(to_drop)

In [21]:
num_missing = df["CASE TYPE"].isna().sum()
print(f"Number of missing neighborhoods: {num_missing}")

Number of missing neighborhoods: 0


In [22]:
new_after_2021 = {}

for col in cols:
    pre_2021 = set(df.loc[df["Year"] < 2021, col].dropna().unique())
    post_2021 = set(df.loc[df["Year"] >= 2021, col].dropna().unique())

    new_after_2021[col] = sorted(list(post_2021 - pre_2021))

gone_after_2021 = {}

for col in cols:
    pre_2021 = set(df.loc[df["Year"] < 2021, col].dropna().unique())
    post_2021 = set(df.loc[df["Year"] > 2021, col].dropna().unique())

    gone_after_2021[col] = sorted(list(pre_2021 - post_2021))

for label, results in new_after_2021.items():
    print(f"\nðŸ”¹ {label} â€” New after 2021:")
    print(results or "None")

for label, results in gone_after_2021.items():
    print(f"\nðŸ”¸ {label} â€” Gone after 2021:")
    print(results or "None")


ðŸ”¹ DEPARTMENT â€” New after 2021:
['METRO Metropolitan Transportation Authority', 'Mayor Office']

ðŸ”¹ DIVISION â€” New after 2021:
['Community Code Enforcement', 'Customer Account Services', 'Customer Satisfaction', 'Director Office', 'Emergency Transportation', 'Government Relationship Office', 'Houston Permitting Center', 'MCI Event', 'Mayor Mail', "Mayor's Assistance Office", 'People With Disabilities']

ðŸ”¹ NEIGHBORHOOD â€” New after 2021:
None

ðŸ”¸ DEPARTMENT â€” Gone after 2021:
None

ðŸ”¸ DIVISION â€” Gone after 2021:
None

ðŸ”¸ NEIGHBORHOOD â€” Gone after 2021:
None


In [23]:
df["RESOLUTION_TIME_DAYS"] = (
    (df["CLOSED DATE"] - df["CREATED DATE"]).dt.total_seconds() / 86400
)

df["RESOLUTION_TIME_DAYS"] = df["RESOLUTION_TIME_DAYS"].round().astype("Int64")

In [None]:
df.drop(columns=["Year"], inplace=True)

# Save cleaned data to CSV
df.to_csv("Cleaned_Houston_311.csv", index=False)

# Connect to Postgres
engine = create_engine(db_url)

# Push dataframe to Postgres
df.to_sql(
    "houston_311",
    engine,
    if_exists="replace",
    index=False
)

669