Use only the data at 2023


In [149]:
import pandas as pd
import numpy as np
import warnings

data_path = "data/"

In [150]:
df = pd.read_csv(f"{data_path}chicago_crime.csv")

In [151]:
df["Date"] = pd.to_datetime(df["Date"])
df_after = df[df["Date"].dt.year.isin([2020, 2021, 2022, 2023, 2024])]

In [152]:
df_clean = df_after.copy()
df_clean = df_clean.sort_values(by="Date")
df_clean["Date"] = pd.to_datetime(df_clean["Date"], format="%Y-%m-%dT%H:%M:%S.%f")
df_clean["Year"] = df_clean["Date"].dt.year
df_clean["Month"] = df_clean["Date"].dt.month
df_clean["Day"] = df_clean["Date"].dt.day
df_clean["Hour"] = df_clean["Date"].dt.hour
df_clean["WeekDay"] = df_clean["Date"].dt.weekday
df_clean["IsWeekend"] = (df_clean["WeekDay"] >= 5).astype(bool)
df_clean = df_clean.dropna(subset=["Longitude", "Latitude"])
df_clean["Location Description"] = df_clean["Location Description"].fillna(
    "OTHER (SPECIFY)"
)
hour_bins = [0, 6, 12, 18, 24]
hour_labels = ["Night", "Morning", "Afternoon", "Evening"]
season_bins = [0, 3, 6, 9, 12]
season_labels = ["Winter", "Spring", "Summer", "Fall"]
df_clean["TimeCategory"] = pd.cut(
    df_clean["Hour"], bins=hour_bins, labels=hour_labels, include_lowest=True
)
df_clean["Season"] = pd.cut(
    df_clean["Month"], bins=season_bins, labels=season_labels, include_lowest=True
)

In [153]:
location_mapping = {
    # Residential locations
    "RESIDENTIAL": [
        "APARTMENT",
        "RESIDENCE",
        "RESIDENCE - GARAGE",
        "RESIDENCE - PORCH / HALLWAY",
        "RESIDENCE - YARD (FRONT / BACK)",
        "HOUSE",
        "PORCH",
        "YARD",
        "GARAGE",
        "DRIVEWAY - RESIDENTIAL",
        "DRIVEWAY",
        "CHA APARTMENT",
        "CHA HALLWAY / STAIRWELL / ELEVATOR",
        "CHA HALLWAY",
        "CHA LOBBY",
        "CHA STAIRWELL",
        "CHA GROUNDS",
        "STAIRWELL",
        "BASEMENT",
        "PORCH",
        "HALLWAY",
        "VESTIBULE",
        "GANGWAY",
        "COLLEGE / UNIVERSITY - RESIDENCE HALL",
        "ROOF",
        "CHA ELEVATOR",
        "ELEVATOR",
        "RESIDENTIAL YARD (FRONT/BACK)",
        "RESIDENCE-GARAGE",
        "RESIDENCE PORCH/HALLWAY",
        "CHA HALLWAY/STAIRWELL/ELEVATOR",
        "CHA PARKING LOT/GROUNDS",
        "COLLEGE/UNIVERSITY RESIDENCE HALL",
    ],
    # Commercial retail
    "RETAIL": [
        "DEPARTMENT STORE",
        "SMALL RETAIL STORE",
        "GROCERY FOOD STORE",
        "CONVENIENCE STORE",
        "RETAIL STORE",
        "DRUG STORE",
        "APPLIANCE STORE",
        "PAWN SHOP",
        "AUTO / BOAT / RV DEALERSHIP",
        "LIQUOR STORE",
        "TAVERN / LIQUOR STORE",
        "CLEANING STORE",
        "NEWSSTAND",
        "TAVERN/LIQUOR STORE",
    ],
    # Food and entertainment
    "FOOD_ENTERTAINMENT": [
        "RESTAURANT",
        "BAR OR TAVERN",
        "TAVERN",
        "MOVIE HOUSE / THEATER",
        "SPORTS ARENA / STADIUM",
        "BOWLING ALLEY",
        "POOL ROOM",
        "CASINO/GAMBLING ESTABLISHMENT",
        "ATHLETIC CLUB",
        "CLUB",
        "BANQUET HALL",
        "MOVIE HOUSE/THEATER",
        "SPORTS ARENA/STADIUM",
    ],
    # Transportation
    "TRANSPORTATION": [
        "CTA TRAIN",
        "CTA PLATFORM",
        "CTA BUS",
        "CTA STATION",
        "CTA BUS STOP",
        "CTA TRACKS - RIGHT OF WAY",
        'CTA "L" TRAIN',
        'CTA "L" PLATFORM',
        "CTA SUBWAY STATION",
        "CTA PROPERTY",
        "CTA PARKING LOT / GARAGE / OTHER PROPERTY",
        "TAXICAB",
        "VEHICLE - OTHER RIDE SHARE SERVICE (LYFT, UBER, ETC.)",
        "VEHICLE NON-COMMERCIAL",
        "VEHICLE - COMMERCIAL",
        "VEHICLE - DELIVERY TRUCK",
        "VEHICLE - COMMERCIAL: ENTERTAINMENT / PARTY BUS",
        "VEHICLE - COMMERCIAL: TROLLEY BUS",
        "OTHER COMMERCIAL TRANSPORTATION",
        "AUTO",
        "OTHER RAILROAD PROPERTY / TRAIN DEPOT",
        "RAILROAD PROPERTY",
        "TRUCK",
        "VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER, LYFT)",
        "CTA GARAGE / OTHER PROPERTY",
        "OTHER RAILROAD PROP / TRAIN DEPOT",
        "VEHICLE-COMMERCIAL",
        "HIGHWAY/EXPRESSWAY",
    ],
    # Street and outdoor public areas
    "STREET_OUTDOOR": [
        "STREET",
        "SIDEWALK",
        "ALLEY",
        "VACANT LOT / LAND",
        "VACANT LOT",
        "HIGHWAY / EXPRESSWAY",
        "PARKING LOT",
        "BRIDGE",
        "PARK PROPERTY",
        "LAKEFRONT / WATERFRONT / RIVERBANK",
        "BEACH",
        "FOREST PRESERVE",
        "FARM",
        "RIVER BANK",
        "VACANT LOT/LAND",
        "LAKEFRONT/WATERFRONT/RIVERBANK",
        "LAKE",
    ],
    # Parking and vehicle related
    "PARKING": [
        "PARKING LOT / GARAGE (NON RESIDENTIAL)",
        "GAS STATION",
        "GAS STATION DRIVE/PROP.",
        "CAR WASH",
        "PARKING LOT",
        "CHA PARKING LOT / GROUNDS",
        "CHA PARKING LOT",
        "POLICE FACILITY / VEHICLE PARKING LOT",
        "AIRPORT PARKING LOT",
        "PARKING LOT/GARAGE(NON.RESID.)",
        "POLICE FACILITY/VEH PARKING LOT",
    ],
    # Financial institutions
    "FINANCIAL": [
        "BANK",
        "CURRENCY EXCHANGE",
        "ATM (AUTOMATIC TELLER MACHINE)",
        "CREDIT UNION",
        "SAVINGS AND LOAN",
    ],
    # Office and business
    "OFFICE_BUSINESS": [
        "COMMERCIAL / BUSINESS OFFICE",
        "MEDICAL / DENTAL OFFICE",
        "WAREHOUSE",
        "FACTORY / MANUFACTURING BUILDING",
        "OFFICE",
        "CONSTRUCTION SITE",
        "ANIMAL HOSPITAL",
        "FACTORY/MANUFACTURING BUILDING",
        "MEDICAL/DENTAL OFFICE",
    ],
    # Educational
    "EDUCATIONAL": [
        "SCHOOL - PUBLIC BUILDING",
        "SCHOOL - PUBLIC GROUNDS",
        "SCHOOL - PRIVATE BUILDING",
        "SCHOOL - PRIVATE GROUNDS",
        "COLLEGE / UNIVERSITY - GROUNDS",
        "LIBRARY",
        "SCHOOL YARD",
        "DAY CARE CENTER",
        "PUBLIC GRAMMAR SCHOOL",
        "SCHOOL, PUBLIC, BUILDING",
        "SCHOOL, PUBLIC, GROUNDS",
        "SCHOOL, PRIVATE, BUILDING",
        "SCHOOL, PRIVATE, GROUNDS",
        "COLLEGE/UNIVERSITY GROUNDS",
    ],
    # Government and public services
    "GOVERNMENT": [
        "GOVERNMENT BUILDING / PROPERTY",
        "FEDERAL BUILDING",
        "HOSPITAL BUILDING / GROUNDS",
        "HOSPITAL",
        "FIRE STATION",
        "POLICE FACILITY / VEHICLE PARKING LOT",
        "NURSING / RETIREMENT HOME",
        "JAIL / LOCK-UP FACILITY",
        "POLICE FACILITY",
        "GOVERNMENT BUILDING/PROPERTY",
        "HOSPITAL BUILDING/GROUNDS",
        "NURSING HOME/RETIREMENT HOME",
        "NURSING HOME",
    ],
    # Religious
    "RELIGIOUS": [
        "CHURCH / SYNAGOGUE / PLACE OF WORSHIP",
        "CHURCH/SYNAGOGUE/PLACE OF WORSHIP",
    ],
    # Airport
    "AIRPORT": [
        "AIRPORT TERMINAL LOWER LEVEL - SECURE AREA",
        "AIRPORT TERMINAL UPPER LEVEL - NON-SECURE AREA",
        "AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA",
        "AIRPORT TERMINAL UPPER LEVEL - SECURE AREA",
        "AIRPORT BUILDING NON-TERMINAL - NON-SECURE AREA",
        "AIRPORT BUILDING NON-TERMINAL - SECURE AREA",
        "AIRPORT EXTERIOR - SECURE AREA",
        "AIRPORT EXTERIOR - NON-SECURE AREA",
        "AIRPORT TRANSPORTATION SYSTEM (ATS)",
        "AIRPORT VENDING ESTABLISHMENT",
        "AIRPORT/AIRCRAFT",
        "AIRPORT TERMINAL MEZZANINE - NON-SECURE AREA",
        "AIRCRAFT",
    ],
    # Specialty retail/service
    "SPECIALTY_RETAIL": ["BARBERSHOP", "BARBER SHOP/BEAUTY SALON", "KENNEL"],
    # Water related
    "WATER": ["BOAT / WATERCRAFT"],
    # Abandoned/vacant
    "ABANDONED": ["ABANDONED BUILDING"],
    # Coin operated
    "COIN_OPERATED": ["COIN OPERATED MACHINE"],
    # Cemetery
    "CEMETERY": ["CEMETARY"],
    # Hotel
    "LODGING": [
        "HOTEL / MOTEL",
        "HOTEL",
        "MOTEL",
        "HOTEL/MOTEL",
    ],
    "OTHER (SPECIFY)": ["OTHER"],
}
# Create reverse mapping (from specific location to group)
reverse_mapping = {}
for group, locations in location_mapping.items():
    for location in locations:
        reverse_mapping[location] = group


def map_to_location_group(location):
    if location in reverse_mapping:
        return reverse_mapping[location]
    if location != "OTHER (SPECIFY)":
        warnings.warn("Unknown location:" + location)
    return "OTHER"


df_clean["Location Group"] = df_clean["Location Description"].apply(
    map_to_location_group
)
df_clean["Location Group"] = df_clean["Location Description"].apply(
    map_to_location_group
)

In [154]:
from pandas.tseries.holiday import USFederalHolidayCalendar

cal = USFederalHolidayCalendar()
df_clean["IsHoliday"] = df_clean["Date"].dt.date.isin(cal.holidays()).astype(bool)

In [155]:
df_analysis = pd.DataFrame(
    {
        "Column": df_clean.columns,
        "Unique Values": df_clean.nunique().values,
        "Type": df_clean.dtypes.values,
        "Is Null": df_clean.isnull().sum().values,
    }
)
df_analysis.sort_values("Unique Values", ascending=False)

Unnamed: 0,Column,Unique Values,Type,Is Null
0,ID,1164289,int64,0
1,Case Number,1164127,object,0
2,Date,575824,datetime64[ns],0
21,Location,320728,object,0
19,Latitude,320566,float64,0
20,Longitude,320462,float64,0
16,Y Coordinate,107993,float64,0
15,X Coordinate,66283,float64,0
3,Block,35785,object,0
18,Updated On,3135,object,0


# Classification Dataset


In [156]:
# First create your year-based split
df_train = df_clean[df_clean["Year"].isin([2021, 2022, 2023])].copy()
df_test = df_clean[df_clean["Year"] == 2024].copy()
impute_values = {
    "Ward": df_train["Ward"].mode()[0],
    "Community Area": df_train["Community Area"].mode()[0],
}
for column, value in impute_values.items():
    df_train[column] = df_train[column].fillna(value)
    df_test[column] = df_test[column].fillna(value)

In [157]:
df_train_sorted_lat = df_train["Latitude"].sort_values()
lat_bin_edges = np.linspace(
    df_train_sorted_lat.min(), df_train_sorted_lat.max(), 51
)  # 51 edges for 50 bins
df_train["lat_bin"] = pd.cut(
    df_train["Latitude"], bins=lat_bin_edges, include_lowest=True
)
df_test["lat_bin"] = pd.cut(
    df_test["Latitude"], bins=lat_bin_edges, include_lowest=True
)
df_train_sorted_lon = df_train["Longitude"].sort_values()
lon_bin_edges = np.linspace(
    df_train_sorted_lon.min(), df_train_sorted_lon.max(), 51
)  # 51 edges for 50 bins
df_train["lon_bin"] = pd.cut(
    df_train["Longitude"], bins=lon_bin_edges, include_lowest=True
)
df_test["lon_bin"] = pd.cut(
    df_test["Longitude"], bins=lon_bin_edges, include_lowest=True
)

In [None]:
location_counts = df_train["Location Description"].value_counts()
threshold = round(
    len(df_train) * 0.01
)  # Keep categories that appear in at least 1% of data
print(threshold)
keep_locations = location_counts[location_counts >= threshold].index


def map_rare_categories(df, column, keep_values):
    df_copy = df.copy()
    df_copy.loc[~df_copy[column].isin(keep_values), column] = "Other"
    return df_copy


df_train = map_rare_categories(df_train, "Location Description", keep_locations)
df_test = map_rare_categories(df_test, "Location Description", keep_locations)
df_train.to_csv(f"{data_path}classification_train_data.csv", index=False)
df_test.to_csv(f"{data_path}classification_test_data.csv", index=False)

6990


In [183]:
df_train

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Day,Hour,WeekDay,IsWeekend,TimeCategory,Season,Location Group,IsHoliday,lat_bin,lon_bin
7394232,12270838,JE114337,2021-01-01 00:00:00,032XX W HOLLYWOOD AVE,0910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,1,0,4,False,Night,Winter,STREET_OUTDOOR,False,"(41.914, 42.023]","(-87.774, -87.691]"
7550954,12537851,JE441404,2021-01-01 00:00:00,042XX N BROADWAY,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,APARTMENT,False,False,...,1,0,4,False,Night,Winter,RESIDENTIAL,False,"(41.914, 42.023]","(-87.691, -87.608]"
7585470,12427203,JE305995,2021-01-01 00:00:00,001XX N LAVERGNE AVE,0910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,1,0,4,False,Night,Winter,STREET_OUTDOOR,False,"(41.806, 41.914]","(-87.774, -87.691]"
7416083,12258978,JE100603,2021-01-01 00:00:00,084XX S MORGAN ST,2028,NARCOTICS,POSSESS - SYNTHETIC DRUGS,STREET,True,False,...,1,0,4,False,Night,Winter,STREET_OUTDOOR,False,"(41.698, 41.806]","(-87.691, -87.608]"
7483452,12482604,JE372460,2021-01-01 00:00:00,062XX S LOOMIS BLVD,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,APARTMENT,False,False,...,1,0,4,False,Night,Winter,RESIDENTIAL,False,"(41.698, 41.806]","(-87.691, -87.608]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8004420,13327752,JH102557,2023-12-31 23:50:00,001XX W WACKER DR,0890,THEFT,FROM BUILDING,Other,False,False,...,31,23,6,True,Evening,Fall,FOOD_ENTERTAINMENT,False,"(41.806, 41.914]","(-87.691, -87.608]"
7998067,13324829,JG561343,2023-12-31 23:50:00,014XX N LOCKWOOD AVE,0454,BATTERY,"AGGRAVATED P.O. - HANDS, FISTS, FEET, NO / MIN...",STREET,False,False,...,31,23,6,True,Evening,Fall,STREET_OUTDOOR,False,"(41.806, 41.914]","(-87.774, -87.691]"
7999469,13324997,JH100010,2023-12-31 23:51:00,009XX E 77TH ST,0530,ASSAULT,AGGRAVATED - OTHER DANGEROUS WEAPON,APARTMENT,False,True,...,31,23,6,True,Evening,Fall,RESIDENTIAL,False,"(41.698, 41.806]","(-87.608, -87.525]"
8000377,13325009,JH100002,2023-12-31 23:51:00,051XX S PRINCETON AVE,0550,ASSAULT,AGGRAVATED POLICE OFFICER - HANDGUN,STREET,True,False,...,31,23,6,True,Evening,Fall,STREET_OUTDOOR,False,"(41.698, 41.806]","(-87.691, -87.608]"


In [184]:
df_test

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Day,Hour,WeekDay,IsWeekend,TimeCategory,Season,Location Group,IsHoliday,lat_bin,lon_bin
8112465,13486487,JH295451,2024-01-01 00:00:00,078XX S HALSTED ST,2826,OTHER OFFENSE,HARASSMENT BY ELECTRONIC MEANS,OTHER (SPECIFY),False,False,...,1,0,0,False,Night,Winter,OTHER,False,"(41.698, 41.806]","(-87.691, -87.608]"
8025799,13369774,JH153703,2024-01-01 00:00:00,047XX W MAYPOLE AVE,2820,OTHER OFFENSE,TELEPHONE THREAT,APARTMENT,False,True,...,1,0,0,False,Night,Winter,RESIDENTIAL,False,"(41.806, 41.914]","(-87.774, -87.691]"
8007535,13330493,JH106572,2024-01-01 00:00:00,034XX W BEACH AVE,2826,OTHER OFFENSE,HARASSMENT BY ELECTRONIC MEANS,RESIDENCE,False,False,...,1,0,0,False,Night,Winter,RESIDENTIAL,False,"(41.806, 41.914]","(-87.774, -87.691]"
8231010,13683020,JH531632,2024-01-01 00:00:00,024XX S MARSHALL BLVD,1563,SEX OFFENSE,CRIMINAL SEXUAL ABUSE,Other,False,False,...,1,0,0,False,Night,Winter,EDUCATIONAL,False,"(41.806, 41.914]","(-87.774, -87.691]"
8030049,13368833,JH152568,2024-01-01 00:00:00,058XX S DR MARTIN LUTHER KING JR DR,1540,OBSCENITY,OBSCENE MATTER,APARTMENT,False,True,...,1,0,0,False,Night,Winter,RESIDENTIAL,False,"(41.698, 41.806]","(-87.691, -87.608]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45563,13707823,JJ100020,2024-12-31 23:50:00,012XX N MENARD AVE,0460,BATTERY,SIMPLE,SIDEWALK,False,False,...,31,23,1,False,Evening,Fall,STREET_OUTDOOR,False,"(41.806, 41.914]","(-87.774, -87.691]"
45297,13709164,JJ101392,2024-12-31 23:53:00,066XX S GREENWOOD AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,31,23,1,False,Evening,Fall,STREET_OUTDOOR,False,"(41.698, 41.806]","(-87.608, -87.525]"
45489,13708038,JJ100035,2024-12-31 23:55:00,077XX S CICERO AVE,0498,BATTERY,"AGG. DOMESTIC BATTERY - HANDS, FISTS, FEET, SE...",Other,False,True,...,31,23,1,False,Evening,Fall,LODGING,False,"(41.698, 41.806]","(-87.774, -87.691]"
45530,13707925,JJ100089,2024-12-31 23:56:00,047XX S DR MARTIN LUTHER KING JR DR,1365,CRIMINAL TRESPASS,TO RESIDENCE,APARTMENT,True,True,...,31,23,1,False,Evening,Fall,RESIDENTIAL,False,"(41.806, 41.914]","(-87.691, -87.608]"


In [170]:
df_train.shape, df_test.shape

((699005, 33), (257320, 33))

# Time-series Dataset


In [None]:
df_time = df_clean.copy()
df_time = (
    df_time.groupby(
        ["Year", "Month", "Location Group", "District", "Community Area", "Season"],
        observed=True,
    )
    .size()
    .reset_index(name="crime_count")
)
# Create a time ID variable for easier shifting
df_time["time_id"] = df_time["Year"] * 12 + df_time["Month"]
for location_group, location_df in df_time.groupby(
    ["Community Area", "Location Group"]
):
    # Get indices for this location group
    indices = location_df.index

    # Make sure we're still working with time-ordered data
    temp_df = df_time.loc[indices].sort_values("time_id")

    # Add lag features
    df_time.loc[indices, "crime_count_lag1"] = temp_df["crime_count"].shift(1)
    df_time.loc[indices, "crime_count_lag2"] = temp_df["crime_count"].shift(2)
    df_time.loc[indices, "crime_count_lag3"] = temp_df["crime_count"].shift(3)

    # Add 12-month lag (same month previous year)
    df_time.loc[indices, "crime_count_lag12"] = temp_df["crime_count"].shift(12)

    # Moving averages
    df_time.loc[indices, "crime_count_ma3"] = (
        temp_df["crime_count"].rolling(window=3).mean().shift(1)
    )
    df_time.loc[indices, "crime_count_ma6"] = (
        temp_df["crime_count"].rolling(window=6).mean().shift(1)
    )
lag_columns = [
    "crime_count_lag1",
    "crime_count_lag2",
    "crime_count_lag3",
    "crime_count_lag12",
    "crime_count_ma3",
    "crime_count_ma6",
]

df_time = df_time.dropna(subset=lag_columns)
df_time.drop("time_id", axis=1, inplace=True)

In [None]:
df_time["sin_month"] = np.sin(2 * np.pi * (df_time["Month"] - 1) / 12)
df_time["cos_month"] = np.cos(2 * np.pi * (df_time["Month"] - 1) / 12)

In [189]:
df_time[
    (df_time["Community Area"] == 1) & (df_time["Location Group"] == "RESIDENTIAL")
].head(5)

Unnamed: 0,Year,Month,Location Group,District,Community Area,Season,crime_count,crime_count_lag1,crime_count_lag2,crime_count_lag3,crime_count_lag12,crime_count_ma3,crime_count_ma6,sin_month,cos_month
12068,2020,12,RESIDENTIAL,24.0,1.0,Fall,130,1.0,120.0,129.0,127.0,83.333333,101.5,-0.5,0.8660254
13022,2021,1,RESIDENTIAL,24.0,1.0,Winter,118,130.0,1.0,120.0,133.0,83.666667,103.5,0.0,1.0
13962,2021,2,RESIDENTIAL,24.0,1.0,Winter,101,118.0,130.0,1.0,119.0,83.0,102.5,0.5,0.8660254
14944,2021,3,RESIDENTIAL,24.0,1.0,Winter,118,101.0,118.0,130.0,95.0,116.333333,99.833333,0.866025,0.5
15920,2021,4,RESIDENTIAL,24.0,1.0,Spring,130,118.0,101.0,118.0,124.0,112.333333,98.0,1.0,6.123234000000001e-17


In [190]:
df_time_analysis = pd.DataFrame(
    {
        "Column": df_time.columns,
        "Unique Values": df_time.nunique().values,
        "Type": df_time.dtypes.values,
        "Is Null": df_time.isnull().sum().values,
    }
)
df_time_analysis

Unnamed: 0,Column,Unique Values,Type,Is Null
0,Year,5,int32,0
1,Month,12,int32,0
2,Location Group,18,object,0
3,District,23,float64,0
4,Community Area,77,float64,0
5,Season,4,category,0
6,crime_count,334,int64,0
7,crime_count_lag1,334,float64,0
8,crime_count_lag2,334,float64,0
9,crime_count_lag3,333,float64,0


In [191]:
df_time_train = df_time[df_time["Year"].isin([2021, 2022, 2023])].copy()
df_time_test = df_time[df_time["Year"] == 2024].copy()
df_time_train.to_csv(f"{data_path}time_series_train_data.csv", index=False)
df_time_test.to_csv(f"{data_path}time_series_test_data.csv", index=False)