Use only the data at 2023


In [87]:
import pandas as pd
import numpy as np
import warnings

data_path = "data/"

In [88]:
df = pd.read_csv(f"{data_path}chicago_crime.csv")

In [89]:
df["Date"] = pd.to_datetime(df["Date"])
df_after = df[df["Date"].dt.year.isin([2021, 2022, 2023, 2024])]

In [90]:
df_clean = df_after.copy()
df_clean = df_clean.sort_values(by="Date")
df_clean["Date"] = pd.to_datetime(df_clean["Date"], format="%Y-%m-%dT%H:%M:%S.%f")
df_clean["Year"] = df_clean["Date"].dt.year
df_clean["Month"] = df_clean["Date"].dt.month
df_clean["Day"] = df_clean["Date"].dt.day
df_clean["Hour"] = df_clean["Date"].dt.hour
df_clean["WeekDay"] = df_clean["Date"].dt.weekday
df_clean["IsWeekend"] = (df_clean["WeekDay"] >= 5).astype(bool)
df_clean = df_clean.dropna(subset=["Longitude", "Latitude"])
df_clean["Location Description"] = df_clean["Location Description"].fillna(
    "OTHER (SPECIFY)"
)

In [91]:
location_mapping = {
    # Residential locations
    "RESIDENTIAL": [
        "APARTMENT",
        "RESIDENCE",
        "RESIDENCE - GARAGE",
        "RESIDENCE - PORCH / HALLWAY",
        "RESIDENCE - YARD (FRONT / BACK)",
        "HOUSE",
        "PORCH",
        "YARD",
        "GARAGE",
        "DRIVEWAY - RESIDENTIAL",
        "DRIVEWAY",
        "CHA APARTMENT",
        "CHA HALLWAY / STAIRWELL / ELEVATOR",
        "CHA HALLWAY",
        "CHA LOBBY",
        "CHA STAIRWELL",
        "CHA GROUNDS",
        "STAIRWELL",
        "BASEMENT",
        "PORCH",
        "HALLWAY",
        "VESTIBULE",
        "GANGWAY",
        "COLLEGE / UNIVERSITY - RESIDENCE HALL",
        "ROOF",
        "CHA ELEVATOR",  # Added
        "ELEVATOR",  # Added
    ],
    # Commercial retail
    "RETAIL": [
        "DEPARTMENT STORE",
        "SMALL RETAIL STORE",
        "GROCERY FOOD STORE",
        "CONVENIENCE STORE",
        "RETAIL STORE",
        "DRUG STORE",
        "APPLIANCE STORE",
        "PAWN SHOP",
        "AUTO / BOAT / RV DEALERSHIP",
        "LIQUOR STORE",
        "TAVERN / LIQUOR STORE",
        "CLEANING STORE",
        "NEWSSTAND",
    ],
    # Food and entertainment
    "FOOD_ENTERTAINMENT": [
        "RESTAURANT",
        "BAR OR TAVERN",
        "TAVERN",
        "MOVIE HOUSE / THEATER",
        "SPORTS ARENA / STADIUM",
        "BOWLING ALLEY",
        "POOL ROOM",
        "CASINO/GAMBLING ESTABLISHMENT",
        "ATHLETIC CLUB",
        "CLUB",  # Added
        "BANQUET HALL",  # Added
    ],
    # Transportation
    "TRANSPORTATION": [
        "CTA TRAIN",
        "CTA PLATFORM",
        "CTA BUS",
        "CTA STATION",
        "CTA BUS STOP",
        "CTA TRACKS - RIGHT OF WAY",
        'CTA "L" TRAIN',
        'CTA "L" PLATFORM',
        "CTA SUBWAY STATION",
        "CTA PROPERTY",
        "CTA PARKING LOT / GARAGE / OTHER PROPERTY",
        "TAXICAB",
        "VEHICLE - OTHER RIDE SHARE SERVICE (LYFT, UBER, ETC.)",
        "VEHICLE NON-COMMERCIAL",
        "VEHICLE - COMMERCIAL",
        "VEHICLE - DELIVERY TRUCK",
        "VEHICLE - COMMERCIAL: ENTERTAINMENT / PARTY BUS",
        "VEHICLE - COMMERCIAL: TROLLEY BUS",
        "OTHER COMMERCIAL TRANSPORTATION",
        "AUTO",
        "OTHER RAILROAD PROPERTY / TRAIN DEPOT",
        "RAILROAD PROPERTY",
        "TRUCK",  # Added
    ],
    # Street and outdoor public areas
    "STREET_OUTDOOR": [
        "STREET",
        "SIDEWALK",
        "ALLEY",
        "VACANT LOT / LAND",
        "VACANT LOT",
        "HIGHWAY / EXPRESSWAY",
        "PARKING LOT",
        "BRIDGE",
        "PARK PROPERTY",
        "LAKEFRONT / WATERFRONT / RIVERBANK",
        "BEACH",
        "FOREST PRESERVE",
        "FARM",
        "RIVER BANK",  # Added
    ],
    # Parking and vehicle related
    "PARKING": [
        "PARKING LOT / GARAGE (NON RESIDENTIAL)",
        "GAS STATION",
        "GAS STATION DRIVE/PROP.",
        "CAR WASH",
        "PARKING LOT",
        "CHA PARKING LOT / GROUNDS",
        "CHA PARKING LOT",
        "POLICE FACILITY / VEHICLE PARKING LOT",
        "AIRPORT PARKING LOT",
    ],
    # Financial institutions
    "FINANCIAL": [
        "BANK",
        "CURRENCY EXCHANGE",
        "ATM (AUTOMATIC TELLER MACHINE)",
        "CREDIT UNION",
        "SAVINGS AND LOAN",
    ],
    # Office and business
    "OFFICE_BUSINESS": [
        "COMMERCIAL / BUSINESS OFFICE",
        "MEDICAL / DENTAL OFFICE",
        "WAREHOUSE",
        "FACTORY / MANUFACTURING BUILDING",
        "OFFICE",
        "CONSTRUCTION SITE",
        "ANIMAL HOSPITAL",
    ],
    # Educational
    "EDUCATIONAL": [
        "SCHOOL - PUBLIC BUILDING",
        "SCHOOL - PUBLIC GROUNDS",
        "SCHOOL - PRIVATE BUILDING",
        "SCHOOL - PRIVATE GROUNDS",
        "COLLEGE / UNIVERSITY - GROUNDS",
        "LIBRARY",
        "SCHOOL YARD",
        "DAY CARE CENTER",
        "PUBLIC GRAMMAR SCHOOL",  # Added
    ],
    # Government and public services
    "GOVERNMENT": [
        "GOVERNMENT BUILDING / PROPERTY",
        "FEDERAL BUILDING",
        "HOSPITAL BUILDING / GROUNDS",
        "HOSPITAL",
        "FIRE STATION",
        "POLICE FACILITY / VEHICLE PARKING LOT",
        "NURSING / RETIREMENT HOME",
        "JAIL / LOCK-UP FACILITY",
        "POLICE FACILITY",  # Added
    ],
    # Religious
    "RELIGIOUS": ["CHURCH / SYNAGOGUE / PLACE OF WORSHIP"],
    # Airport
    "AIRPORT": [
        "AIRPORT TERMINAL LOWER LEVEL - SECURE AREA",
        "AIRPORT TERMINAL UPPER LEVEL - NON-SECURE AREA",
        "AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA",
        "AIRPORT TERMINAL UPPER LEVEL - SECURE AREA",
        "AIRPORT BUILDING NON-TERMINAL - NON-SECURE AREA",
        "AIRPORT BUILDING NON-TERMINAL - SECURE AREA",
        "AIRPORT EXTERIOR - SECURE AREA",
        "AIRPORT EXTERIOR - NON-SECURE AREA",
        "AIRPORT TRANSPORTATION SYSTEM (ATS)",
        "AIRPORT VENDING ESTABLISHMENT",
        "AIRPORT/AIRCRAFT",
        "AIRPORT TERMINAL MEZZANINE - NON-SECURE AREA",
        "AIRCRAFT",
    ],
    # Specialty retail/service
    "SPECIALTY_RETAIL": ["BARBERSHOP", "BARBER SHOP/BEAUTY SALON", "KENNEL"],
    # Water related
    "WATER": ["BOAT / WATERCRAFT"],
    # Abandoned/vacant
    "ABANDONED": ["ABANDONED BUILDING"],
    # Coin operated
    "COIN_OPERATED": ["COIN OPERATED MACHINE"],
    # Cemetery
    "CEMETERY": ["CEMETARY"],
    # Hotel
    "LODGING": ["HOTEL / MOTEL", "HOTEL", "MOTEL"],  # Added MOTEL
    "OTHER (SPECIFY)": ["OTHER"],
}
# Create reverse mapping (from specific location to group)
reverse_mapping = {}
for group, locations in location_mapping.items():
    for location in locations:
        reverse_mapping[location] = group


def map_to_location_group(location):
    if location in reverse_mapping:
        return reverse_mapping[location]
    if location != "OTHER (SPECIFY)":
        warnings.warn("Unknown location:" + location)
    return "OTHER"


df_clean["Location Group"] = df_clean["Location Description"].apply(
    map_to_location_group
)
df_clean["Location Group"] = df_clean["Location Description"].apply(
    map_to_location_group
)

In [92]:
from pandas.tseries.holiday import USFederalHolidayCalendar

cal = USFederalHolidayCalendar()
df_clean["IsHoliday"] = df_clean["Date"].dt.date.isin(cal.holidays()).astype(bool)

In [93]:
df_analysis = pd.DataFrame(
    {
        "Column": df_clean.columns,
        "Unique Values": df_clean.nunique().values,
        "Type": df_clean.dtypes.values,
        "Is Null": df_clean.isnull().sum().values,
    }
)
df_analysis.sort_values("Unique Values", ascending=False)

Unnamed: 0,Column,Unique Values,Type,Is Null
0,ID,956325,int64,0
1,Case Number,956193,object,0
2,Date,467709,datetime64[ns],0
21,Location,287521,object,0
19,Latitude,287389,float64,0
20,Longitude,287303,float64,0
16,Y Coordinate,104195,float64,0
15,X Coordinate,64747,float64,0
3,Block,34899,object,0
18,Updated On,2430,object,0


In [None]:
# First create your year-based split
df_train = df_clean[df_clean["Year"].isin([2021, 2022, 2023])].copy()
df_test = df_clean[df_clean["Year"] == 2024].copy()
impute_values = {
    "Ward": df_train["Ward"].mode()[0],
    "Community Area": df_train["Community Area"].mode()[0],
}
for column, value in impute_values.items():
    df_train[column] = df_train[column].fillna(value)
    df_test[column] = df_test[column].fillna(value)

In [95]:
df_train_sorted_lat = df_train["Latitude"].sort_values()
lat_bin_edges = np.linspace(
    df_train_sorted_lat.min(), df_train_sorted_lat.max(), 51
)  # 51 edges for 50 bins
df_train["lat_bin"] = pd.cut(
    df_train["Latitude"], bins=lat_bin_edges, include_lowest=True
)
df_test["lat_bin"] = pd.cut(
    df_test["Latitude"], bins=lat_bin_edges, include_lowest=True
)
df_train_sorted_lon = df_train["Longitude"].sort_values()
lon_bin_edges = np.linspace(
    df_train_sorted_lon.min(), df_train_sorted_lon.max(), 51
)  # 51 edges for 50 bins
df_train["lon_bin"] = pd.cut(
    df_train["Longitude"], bins=lon_bin_edges, include_lowest=True
)
df_test["lon_bin"] = pd.cut(
    df_test["Longitude"], bins=lon_bin_edges, include_lowest=True
)

In [96]:
# Create time of day and season features
hour_bins = [0, 6, 12, 18, 24]
hour_labels = ["Night", "Morning", "Afternoon", "Evening"]
season_bins = [0, 3, 6, 9, 12]
season_labels = ["Winter", "Spring", "Summer", "Fall"]

df_train["TimeCategory"] = pd.cut(
    df_train["Hour"], bins=hour_bins, labels=hour_labels, include_lowest=True
)
df_test["TimeCategory"] = pd.cut(
    df_test["Hour"], bins=hour_bins, labels=hour_labels, include_lowest=True
)
df_train["Season"] = pd.cut(
    df_train["Month"], bins=season_bins, labels=season_labels, include_lowest=True
)
df_test["Season"] = pd.cut(
    df_test["Month"], bins=season_bins, labels=season_labels, include_lowest=True
)

In [97]:
location_counts = df_train["Location Description"].value_counts()
threshold = len(df_train) * 0.01  # Keep categories that appear in at least 1% of data
keep_locations = location_counts[location_counts >= threshold].index


def map_rare_categories(df, column, keep_values):
    df_copy = df.copy()
    df_copy.loc[~df_copy[column].isin(keep_values), column] = "Other"
    return df_copy


df_train = map_rare_categories(df_train, "Location Description", keep_locations)
df_test = map_rare_categories(df_test, "Location Description", keep_locations)
df_train.to_csv(f"{data_path}classification_train_data.csv", index=False)
df_test.to_csv(f"{data_path}classification_test_data.csv", index=False)

In [98]:
df_train.tail()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Day,Hour,WeekDay,IsWeekend,Location Group,IsHoliday,lat_bin,lon_bin,TimeCategory,Season
7996689,13324881,JH100006,2023-12-31 23:50:00,051XX S WASHTENAW AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,...,31,23,6,True,RESIDENTIAL,False,"(41.698, 41.806]","(-87.774, -87.691]",Evening,Fall
8004420,13327752,JH102557,2023-12-31 23:50:00,001XX W WACKER DR,890,THEFT,FROM BUILDING,Other,False,False,...,31,23,6,True,FOOD_ENTERTAINMENT,False,"(41.806, 41.914]","(-87.691, -87.608]",Evening,Fall
8000377,13325009,JH100002,2023-12-31 23:51:00,051XX S PRINCETON AVE,550,ASSAULT,AGGRAVATED POLICE OFFICER - HANDGUN,STREET,True,False,...,31,23,6,True,STREET_OUTDOOR,False,"(41.698, 41.806]","(-87.691, -87.608]",Evening,Fall
7999469,13324997,JH100010,2023-12-31 23:51:00,009XX E 77TH ST,530,ASSAULT,AGGRAVATED - OTHER DANGEROUS WEAPON,APARTMENT,False,True,...,31,23,6,True,RESIDENTIAL,False,"(41.698, 41.806]","(-87.608, -87.525]",Evening,Fall
8005196,13327763,JH103488,2023-12-31 23:59:00,010XX N ORLEANS ST,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,31,23,6,True,STREET_OUTDOOR,False,"(41.806, 41.914]","(-87.691, -87.608]",Evening,Fall


In [99]:
df_test.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Day,Hour,WeekDay,IsWeekend,Location Group,IsHoliday,lat_bin,lon_bin,TimeCategory,Season
8030049,13368833,JH152568,2024-01-01,058XX S DR MARTIN LUTHER KING JR DR,1540,OBSCENITY,OBSCENE MATTER,APARTMENT,False,True,...,1,0,0,False,RESIDENTIAL,False,"(41.698, 41.806]","(-87.691, -87.608]",Night,Winter
7999281,13325580,JH100987,2024-01-01,034XX W FRANKLIN BLVD,890,THEFT,FROM BUILDING,RESIDENCE,False,False,...,1,0,0,False,RESIDENTIAL,False,"(41.806, 41.914]","(-87.774, -87.691]",Night,Winter
8000033,13326163,JH101560,2024-01-01,040XX S DREXEL BLVD,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,1,0,0,False,STREET_OUTDOOR,False,"(41.806, 41.914]","(-87.608, -87.525]",Night,Winter
8058863,13414780,JH207578,2024-01-01,024XX N LUNA AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,False,...,1,0,0,False,RESIDENTIAL,False,"(41.914, 42.023]","(-87.774, -87.691]",Night,Winter
8017589,13357447,JH138928,2024-01-01,070XX S PEORIA ST,820,THEFT,$500 AND UNDER,RESIDENCE,False,False,...,1,0,0,False,RESIDENTIAL,False,"(41.698, 41.806]","(-87.691, -87.608]",Night,Winter


In [100]:
df_train.shape, df_test.shape

((699005, 33), (257320, 33))