Use only the data at 2023


In [43]:
import pandas as pd
import numpy as np
import warnings

data_path = "data/"

In [44]:
df = pd.read_csv(f"{data_path}chicago_crime.csv")

In [45]:
df["Date"] = pd.to_datetime(df["Date"])
df_after = df[df["Date"].dt.year.isin([2020, 2021, 2022, 2023, 2024])]

In [46]:
df_clean = df_after.copy()
df_clean = df_clean.sort_values(by="Date")
df_clean["Date"] = pd.to_datetime(df_clean["Date"], format="%Y-%m-%dT%H:%M:%S.%f")
df_clean["Year"] = df_clean["Date"].dt.year
df_clean["Month"] = df_clean["Date"].dt.month
df_clean["Day"] = df_clean["Date"].dt.day
df_clean["Hour"] = df_clean["Date"].dt.hour
df_clean["WeekDay"] = df_clean["Date"].dt.weekday
df_clean["IsWeekend"] = (df_clean["WeekDay"] >= 5).astype(bool)
df_clean = df_clean.dropna(subset=["Longitude", "Latitude"])
df_clean["Location Description"] = df_clean["Location Description"].fillna(
    "OTHER (SPECIFY)"
)
hour_bins = [0, 6, 12, 18, 24]
hour_labels = ["Night", "Morning", "Afternoon", "Evening"]
season_bins = [0, 3, 6, 9, 12]
season_labels = ["Winter", "Spring", "Summer", "Fall"]
df_clean["TimeCategory"] = pd.cut(
    df_clean["Hour"], bins=hour_bins, labels=hour_labels, include_lowest=True
)
df_clean["Season"] = pd.cut(
    df_clean["Month"], bins=season_bins, labels=season_labels, include_lowest=True
)

In [47]:
location_mapping = {
    # Residential locations
    "RESIDENTIAL": [
        "APARTMENT",
        "RESIDENCE",
        "RESIDENCE - GARAGE",
        "RESIDENCE - PORCH / HALLWAY",
        "RESIDENCE - YARD (FRONT / BACK)",
        "HOUSE",
        "PORCH",
        "YARD",
        "GARAGE",
        "DRIVEWAY - RESIDENTIAL",
        "DRIVEWAY",
        "CHA APARTMENT",
        "CHA HALLWAY / STAIRWELL / ELEVATOR",
        "CHA HALLWAY",
        "CHA LOBBY",
        "CHA STAIRWELL",
        "CHA GROUNDS",
        "STAIRWELL",
        "BASEMENT",
        "PORCH",
        "HALLWAY",
        "VESTIBULE",
        "GANGWAY",
        "COLLEGE / UNIVERSITY - RESIDENCE HALL",
        "ROOF",
        "CHA ELEVATOR",
        "ELEVATOR",
        "RESIDENTIAL YARD (FRONT/BACK)",
        "RESIDENCE-GARAGE",
        "RESIDENCE PORCH/HALLWAY",
        "CHA HALLWAY/STAIRWELL/ELEVATOR",
        "CHA PARKING LOT/GROUNDS",
        "COLLEGE/UNIVERSITY RESIDENCE HALL",
    ],
    # Commercial retail
    "RETAIL": [
        "DEPARTMENT STORE",
        "SMALL RETAIL STORE",
        "GROCERY FOOD STORE",
        "CONVENIENCE STORE",
        "RETAIL STORE",
        "DRUG STORE",
        "APPLIANCE STORE",
        "PAWN SHOP",
        "AUTO / BOAT / RV DEALERSHIP",
        "LIQUOR STORE",
        "TAVERN / LIQUOR STORE",
        "CLEANING STORE",
        "NEWSSTAND",
        "TAVERN/LIQUOR STORE",
    ],
    # Food and entertainment
    "FOOD_ENTERTAINMENT": [
        "RESTAURANT",
        "BAR OR TAVERN",
        "TAVERN",
        "MOVIE HOUSE / THEATER",
        "SPORTS ARENA / STADIUM",
        "BOWLING ALLEY",
        "POOL ROOM",
        "CASINO/GAMBLING ESTABLISHMENT",
        "ATHLETIC CLUB",
        "CLUB",
        "BANQUET HALL",
        "MOVIE HOUSE/THEATER",
        "SPORTS ARENA/STADIUM",
    ],
    # Transportation
    "TRANSPORTATION": [
        "CTA TRAIN",
        "CTA PLATFORM",
        "CTA BUS",
        "CTA STATION",
        "CTA BUS STOP",
        "CTA TRACKS - RIGHT OF WAY",
        'CTA "L" TRAIN',
        'CTA "L" PLATFORM',
        "CTA SUBWAY STATION",
        "CTA PROPERTY",
        "CTA PARKING LOT / GARAGE / OTHER PROPERTY",
        "TAXICAB",
        "VEHICLE - OTHER RIDE SHARE SERVICE (LYFT, UBER, ETC.)",
        "VEHICLE NON-COMMERCIAL",
        "VEHICLE - COMMERCIAL",
        "VEHICLE - DELIVERY TRUCK",
        "VEHICLE - COMMERCIAL: ENTERTAINMENT / PARTY BUS",
        "VEHICLE - COMMERCIAL: TROLLEY BUS",
        "OTHER COMMERCIAL TRANSPORTATION",
        "AUTO",
        "OTHER RAILROAD PROPERTY / TRAIN DEPOT",
        "RAILROAD PROPERTY",
        "TRUCK",
        "VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER, LYFT)",
        "CTA GARAGE / OTHER PROPERTY",
        "OTHER RAILROAD PROP / TRAIN DEPOT",
        "VEHICLE-COMMERCIAL",
        "HIGHWAY/EXPRESSWAY",
    ],
    # Street and outdoor public areas
    "STREET_OUTDOOR": [
        "STREET",
        "SIDEWALK",
        "ALLEY",
        "VACANT LOT / LAND",
        "VACANT LOT",
        "HIGHWAY / EXPRESSWAY",
        "PARKING LOT",
        "BRIDGE",
        "PARK PROPERTY",
        "LAKEFRONT / WATERFRONT / RIVERBANK",
        "BEACH",
        "FOREST PRESERVE",
        "FARM",
        "RIVER BANK",
        "VACANT LOT/LAND",
        "LAKEFRONT/WATERFRONT/RIVERBANK",
        "LAKE",
    ],
    # Parking and vehicle related
    "PARKING": [
        "PARKING LOT / GARAGE (NON RESIDENTIAL)",
        "GAS STATION",
        "GAS STATION DRIVE/PROP.",
        "CAR WASH",
        "PARKING LOT",
        "CHA PARKING LOT / GROUNDS",
        "CHA PARKING LOT",
        "POLICE FACILITY / VEHICLE PARKING LOT",
        "AIRPORT PARKING LOT",
        "PARKING LOT/GARAGE(NON.RESID.)",
        "POLICE FACILITY/VEH PARKING LOT",
    ],
    # Financial institutions
    "FINANCIAL": [
        "BANK",
        "CURRENCY EXCHANGE",
        "ATM (AUTOMATIC TELLER MACHINE)",
        "CREDIT UNION",
        "SAVINGS AND LOAN",
    ],
    # Office and business
    "OFFICE_BUSINESS": [
        "COMMERCIAL / BUSINESS OFFICE",
        "MEDICAL / DENTAL OFFICE",
        "WAREHOUSE",
        "FACTORY / MANUFACTURING BUILDING",
        "OFFICE",
        "CONSTRUCTION SITE",
        "ANIMAL HOSPITAL",
        "FACTORY/MANUFACTURING BUILDING",
        "MEDICAL/DENTAL OFFICE",
    ],
    # Educational
    "EDUCATIONAL": [
        "SCHOOL - PUBLIC BUILDING",
        "SCHOOL - PUBLIC GROUNDS",
        "SCHOOL - PRIVATE BUILDING",
        "SCHOOL - PRIVATE GROUNDS",
        "COLLEGE / UNIVERSITY - GROUNDS",
        "LIBRARY",
        "SCHOOL YARD",
        "DAY CARE CENTER",
        "PUBLIC GRAMMAR SCHOOL",
        "SCHOOL, PUBLIC, BUILDING",
        "SCHOOL, PUBLIC, GROUNDS",
        "SCHOOL, PRIVATE, BUILDING",
        "SCHOOL, PRIVATE, GROUNDS",
        "COLLEGE/UNIVERSITY GROUNDS",
    ],
    # Government and public services
    "GOVERNMENT": [
        "GOVERNMENT BUILDING / PROPERTY",
        "FEDERAL BUILDING",
        "HOSPITAL BUILDING / GROUNDS",
        "HOSPITAL",
        "FIRE STATION",
        "POLICE FACILITY / VEHICLE PARKING LOT",
        "NURSING / RETIREMENT HOME",
        "JAIL / LOCK-UP FACILITY",
        "POLICE FACILITY",
        "GOVERNMENT BUILDING/PROPERTY",
        "HOSPITAL BUILDING/GROUNDS",
        "NURSING HOME/RETIREMENT HOME",
        "NURSING HOME",
    ],
    # Religious
    "RELIGIOUS": [
        "CHURCH / SYNAGOGUE / PLACE OF WORSHIP",
        "CHURCH/SYNAGOGUE/PLACE OF WORSHIP",
    ],
    # Airport
    "AIRPORT": [
        "AIRPORT TERMINAL LOWER LEVEL - SECURE AREA",
        "AIRPORT TERMINAL UPPER LEVEL - NON-SECURE AREA",
        "AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA",
        "AIRPORT TERMINAL UPPER LEVEL - SECURE AREA",
        "AIRPORT BUILDING NON-TERMINAL - NON-SECURE AREA",
        "AIRPORT BUILDING NON-TERMINAL - SECURE AREA",
        "AIRPORT EXTERIOR - SECURE AREA",
        "AIRPORT EXTERIOR - NON-SECURE AREA",
        "AIRPORT TRANSPORTATION SYSTEM (ATS)",
        "AIRPORT VENDING ESTABLISHMENT",
        "AIRPORT/AIRCRAFT",
        "AIRPORT TERMINAL MEZZANINE - NON-SECURE AREA",
        "AIRCRAFT",
    ],
    # Specialty retail/service
    "SPECIALTY_RETAIL": ["BARBERSHOP", "BARBER SHOP/BEAUTY SALON", "KENNEL"],
    # Water related
    "WATER": ["BOAT / WATERCRAFT"],
    # Abandoned/vacant
    "ABANDONED": ["ABANDONED BUILDING"],
    # Coin operated
    "COIN_OPERATED": ["COIN OPERATED MACHINE"],
    # Cemetery
    "CEMETERY": ["CEMETARY"],
    # Hotel
    "LODGING": [
        "HOTEL / MOTEL",
        "HOTEL",
        "MOTEL",
        "HOTEL/MOTEL",
    ],
    "OTHER (SPECIFY)": ["OTHER"],
}
# Create reverse mapping (from specific location to group)
reverse_mapping = {}
for group, locations in location_mapping.items():
    for location in locations:
        reverse_mapping[location] = group


def map_to_location_group(location):
    if location in reverse_mapping:
        return reverse_mapping[location]
    if location != "OTHER (SPECIFY)":
        warnings.warn("Unknown location:" + location)
    return "OTHER"


df_clean["Location Group"] = df_clean["Location Description"].apply(
    map_to_location_group
)
df_clean["Location Group"] = df_clean["Location Description"].apply(
    map_to_location_group
)

In [48]:
from pandas.tseries.holiday import USFederalHolidayCalendar

cal = USFederalHolidayCalendar()
df_clean["IsHoliday"] = df_clean["Date"].dt.date.isin(cal.holidays()).astype(bool)

In [49]:
df_analysis = pd.DataFrame(
    {
        "Column": df_clean.columns,
        "Unique Values": df_clean.nunique().values,
        "Type": df_clean.dtypes.values,
        "Is Null": df_clean.isnull().sum().values,
    }
)
df_analysis.sort_values("Unique Values", ascending=False)

Unnamed: 0,Column,Unique Values,Type,Is Null
0,ID,1164289,int64,0
1,Case Number,1164127,object,0
2,Date,575824,datetime64[ns],0
21,Location,320728,object,0
19,Latitude,320566,float64,0
20,Longitude,320462,float64,0
16,Y Coordinate,107993,float64,0
15,X Coordinate,66283,float64,0
3,Block,35785,object,0
18,Updated On,3135,object,0


# Classification Dataset


In [50]:
# First create your year-based split
df_train = df_clean[df_clean["Year"].isin([2021, 2022, 2023])].copy()
df_test = df_clean[df_clean["Year"] == 2024].copy()
impute_values = {
    "Ward": df_train["Ward"].mode()[0],
    "Community Area": df_train["Community Area"].mode()[0],
}
for column, value in impute_values.items():
    df_train[column] = df_train[column].fillna(value)
    df_test[column] = df_test[column].fillna(value)

In [51]:
df_train_sorted_lat = df_train["Latitude"].sort_values()
lat_bin_edges = np.linspace(
    df_train_sorted_lat.min(), df_train_sorted_lat.max(), 51
)  # 51 edges for 50 bins
df_train["lat_bin"] = pd.cut(
    df_train["Latitude"], bins=lat_bin_edges, include_lowest=True
)
df_test["lat_bin"] = pd.cut(
    df_test["Latitude"], bins=lat_bin_edges, include_lowest=True
)
df_train_sorted_lon = df_train["Longitude"].sort_values()
lon_bin_edges = np.linspace(
    df_train_sorted_lon.min(), df_train_sorted_lon.max(), 51
)  # 51 edges for 50 bins
df_train["lon_bin"] = pd.cut(
    df_train["Longitude"], bins=lon_bin_edges, include_lowest=True
)
df_test["lon_bin"] = pd.cut(
    df_test["Longitude"], bins=lon_bin_edges, include_lowest=True
)

In [52]:
location_counts = df_train["Location Description"].value_counts()
threshold = round(
    len(df_train) * 0.01
)  # Keep categories that appear in at least 1% of data
print(threshold)
keep_locations = location_counts[location_counts >= threshold].index


def map_rare_categories(df, column, keep_values):
    df_copy = df.copy()
    df_copy.loc[~df_copy[column].isin(keep_values), column] = "Other"
    return df_copy


df_train = map_rare_categories(df_train, "Location Description", keep_locations)
df_test = map_rare_categories(df_test, "Location Description", keep_locations)
df_train.to_csv(f"{data_path}classification_train_data.csv", index=False)
df_test.to_csv(f"{data_path}classification_test_data.csv", index=False)

6990


In [53]:
df_train.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Day,Hour,WeekDay,IsWeekend,TimeCategory,Season,Location Group,IsHoliday,lat_bin,lon_bin
7394232,12270838,JE114337,2021-01-01,032XX W HOLLYWOOD AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,1,0,4,False,Night,Winter,STREET_OUTDOOR,False,"(41.914, 42.023]","(-87.774, -87.691]"
7550954,12537851,JE441404,2021-01-01,042XX N BROADWAY,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,APARTMENT,False,False,...,1,0,4,False,Night,Winter,RESIDENTIAL,False,"(41.914, 42.023]","(-87.691, -87.608]"
7585470,12427203,JE305995,2021-01-01,001XX N LAVERGNE AVE,910,MOTOR VEHICLE THEFT,AUTOMOBILE,STREET,False,False,...,1,0,4,False,Night,Winter,STREET_OUTDOOR,False,"(41.806, 41.914]","(-87.774, -87.691]"
7416083,12258978,JE100603,2021-01-01,084XX S MORGAN ST,2028,NARCOTICS,POSSESS - SYNTHETIC DRUGS,STREET,True,False,...,1,0,4,False,Night,Winter,STREET_OUTDOOR,False,"(41.698, 41.806]","(-87.691, -87.608]"
7483452,12482604,JE372460,2021-01-01,062XX S LOOMIS BLVD,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,APARTMENT,False,False,...,1,0,4,False,Night,Winter,RESIDENTIAL,False,"(41.698, 41.806]","(-87.691, -87.608]"


In [54]:
df_test.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Day,Hour,WeekDay,IsWeekend,TimeCategory,Season,Location Group,IsHoliday,lat_bin,lon_bin
8112465,13486487,JH295451,2024-01-01,078XX S HALSTED ST,2826,OTHER OFFENSE,HARASSMENT BY ELECTRONIC MEANS,OTHER (SPECIFY),False,False,...,1,0,0,False,Night,Winter,OTHER,False,"(41.698, 41.806]","(-87.691, -87.608]"
8025799,13369774,JH153703,2024-01-01,047XX W MAYPOLE AVE,2820,OTHER OFFENSE,TELEPHONE THREAT,APARTMENT,False,True,...,1,0,0,False,Night,Winter,RESIDENTIAL,False,"(41.806, 41.914]","(-87.774, -87.691]"
8007535,13330493,JH106572,2024-01-01,034XX W BEACH AVE,2826,OTHER OFFENSE,HARASSMENT BY ELECTRONIC MEANS,RESIDENCE,False,False,...,1,0,0,False,Night,Winter,RESIDENTIAL,False,"(41.806, 41.914]","(-87.774, -87.691]"
8231010,13683020,JH531632,2024-01-01,024XX S MARSHALL BLVD,1563,SEX OFFENSE,CRIMINAL SEXUAL ABUSE,Other,False,False,...,1,0,0,False,Night,Winter,EDUCATIONAL,False,"(41.806, 41.914]","(-87.774, -87.691]"
8030049,13368833,JH152568,2024-01-01,058XX S DR MARTIN LUTHER KING JR DR,1540,OBSCENITY,OBSCENE MATTER,APARTMENT,False,True,...,1,0,0,False,Night,Winter,RESIDENTIAL,False,"(41.698, 41.806]","(-87.691, -87.608]"


In [55]:
df_train.shape, df_test.shape

((699005, 33), (257320, 33))

# Time-series Dataset


In [56]:
df_time = df_clean.copy()
df_time = (
    df_time.groupby(
        ["Year", "Month", "Location Group", "District"],
        observed=False,
    )
    .size()
    .reset_index(name="crime_count")
)
month_to_season = {
    1: "Winter",
    2: "Winter",
    3: "Spring",
    4: "Spring",
    5: "Spring",
    6: "Summer",
    7: "Summer",
    8: "Summer",
    9: "Fall",
    10: "Fall",
    11: "Fall",
    12: "Winter",
}
df_time["Season"] = df_time["Month"].map(month_to_season)
df_time["sin_month"] = np.sin(2 * np.pi * (df_time["Month"] - 1) / 12)
df_time["cos_month"] = np.cos(2 * np.pi * (df_time["Month"] - 1) / 12)
# Create a time ID variable for easier shifting
df_time["time_id"] = df_time["Year"] * 12 + df_time["Month"]
for location_group, location_df in df_time.groupby(["Location Group"]):
    # Get indices for this location group
    indices = location_df.index

    # Make sure we're still working with time-ordered data
    temp_df = df_time.loc[indices].sort_values("time_id")

    # Add lag features
    df_time.loc[indices, "crime_count_lag1"] = temp_df["crime_count"].shift(1)
    df_time.loc[indices, "crime_count_lag2"] = temp_df["crime_count"].shift(2)
    df_time.loc[indices, "crime_count_lag3"] = temp_df["crime_count"].shift(3)
    df_time.loc[indices, "crime_count_lag6"] = temp_df["crime_count"].shift(6)

    # Add 12-month lag (same month previous year)
    df_time.loc[indices, "crime_count_lag12"] = temp_df["crime_count"].shift(12)

    # Moving averages
    df_time.loc[indices, "crime_count_ma3"] = (
        temp_df["crime_count"].rolling(window=3).mean().shift(1)
    )
    df_time.loc[indices, "crime_count_ma6"] = (
        temp_df["crime_count"].rolling(window=6).mean().shift(1)
    )
lag_columns = [
    "crime_count_lag1",
    "crime_count_lag2",
    "crime_count_lag3",
    "crime_count_lag6",
    "crime_count_lag12",
    "crime_count_ma3",
    "crime_count_ma6",
]

df_time.drop("time_id", axis=1, inplace=True)
df_time

Unnamed: 0,Year,Month,Location Group,District,crime_count,Season,sin_month,cos_month,crime_count_lag1,crime_count_lag2,crime_count_lag3,crime_count_lag6,crime_count_lag12,crime_count_ma3,crime_count_ma6
0,2020,1,ABANDONED,2.0,1,Winter,0.0,1.000000,,,,,,,
1,2020,1,ABANDONED,5.0,1,Winter,0.0,1.000000,1.0,,,,,,
2,2020,1,ABANDONED,6.0,1,Winter,0.0,1.000000,1.0,1.0,,,,,
3,2020,1,ABANDONED,7.0,1,Winter,0.0,1.000000,1.0,1.0,1.0,,,1.000000,
4,2020,1,ABANDONED,8.0,3,Winter,0.0,1.000000,1.0,1.0,1.0,,,1.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17708,2024,12,TRANSPORTATION,20.0,8,Winter,-0.5,0.866025,28.0,29.0,8.0,19.0,16.0,21.666667,22.166667
17709,2024,12,TRANSPORTATION,22.0,8,Winter,-0.5,0.866025,30.0,19.0,19.0,22.0,54.0,22.666667,31.333333
17710,2024,12,TRANSPORTATION,24.0,25,Winter,-0.5,0.866025,95.0,13.0,23.0,41.0,39.0,43.666667,37.166667
17711,2024,12,TRANSPORTATION,25.0,26,Winter,-0.5,0.866025,26.0,25.0,95.0,26.0,34.0,48.666667,34.666667


In [57]:
unique_years = df_clean["Year"].unique()
unique_months = range(1, 13)  # Months 1-12
unique_locations = df_clean["Location Group"].unique()
unique_districts = df_clean["District"].unique()

# Create all possible combinations
from itertools import product

all_combinations = list(
    product(unique_years, unique_months, unique_locations, unique_districts)
)
complete_grid = pd.DataFrame(
    all_combinations, columns=["Year", "Month", "Location Group", "District"]
)
df_time_complete = pd.merge(
    complete_grid,
    df_time,
    on=["Year", "Month", "Location Group", "District"],
    how="left",
)

df_time_complete["crime_count"] = df_time_complete["crime_count"].fillna(0)
df_time_complete["Season"] = df_time_complete["Month"].map(month_to_season)
df_time_complete["sin_month"] = np.sin(2 * np.pi * (df_time_complete["Month"] - 1) / 12)
df_time_complete["cos_month"] = np.cos(2 * np.pi * (df_time_complete["Month"] - 1) / 12)
df_time_complete.fillna(0, inplace=True)

In [58]:
df_time_complete.sort_values(
    by=["Year", "Month", "Location Group", "District"],
    inplace=True,
)
df_time_complete

Unnamed: 0,Year,Month,Location Group,District,crime_count,Season,sin_month,cos_month,crime_count_lag1,crime_count_lag2,crime_count_lag3,crime_count_lag6,crime_count_lag12,crime_count_ma3,crime_count_ma6
360,2020,1,ABANDONED,1.0,0.0,Winter,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
348,2020,1,ABANDONED,2.0,1.0,Winter,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
356,2020,1,ABANDONED,3.0,0.0,Winter,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
349,2020,1,ABANDONED,4.0,0.0,Winter,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
351,2020,1,ABANDONED,5.0,1.0,Winter,0.0,1.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27591,2024,12,WATER,20.0,0.0,Winter,-0.5,0.866025,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27594,2024,12,WATER,22.0,0.0,Winter,-0.5,0.866025,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27598,2024,12,WATER,24.0,0.0,Winter,-0.5,0.866025,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27582,2024,12,WATER,25.0,0.0,Winter,-0.5,0.866025,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
df_time_analysis = pd.DataFrame(
    {
        "Column": df_time_complete.columns,
        "Unique Values": df_time_complete.nunique().values,
        "Type": df_time_complete.dtypes.values,
        "Is Null": df_time_complete.isnull().sum().values,
    }
)
df_time_analysis

Unnamed: 0,Column,Unique Values,Type,Is Null
0,Year,5,int32,0
1,Month,12,int64,0
2,Location Group,20,object,0
3,District,23,float64,0
4,crime_count,618,float64,0
5,Season,4,object,0
6,sin_month,11,float64,0
7,cos_month,11,float64,0
8,crime_count_lag1,618,float64,0
9,crime_count_lag2,618,float64,0


In [60]:
df_time_train = df_time[df_time["Year"].isin([2021, 2022, 2023])].copy()
df_time_test = df_time[df_time["Year"] == 2024].copy()
df_time_train.to_csv(f"{data_path}time_series_train_data.csv", index=False)
df_time_test.to_csv(f"{data_path}time_series_test_data.csv", index=False)

In [64]:
rnn_full = df_time_complete.drop(
    [
        "crime_count_lag1",
        "crime_count_lag2",
        "crime_count_lag3",
        "crime_count_lag6",
        "crime_count_lag12",
        "crime_count_ma3",
        "crime_count_ma6",
    ],
    axis=1,
)
rnn_features = [
    "Year",
    "Month",
    "District",
    "sin_month",
    "cos_month",
]
rnn_full = rnn_full.groupby(rnn_features).agg({"crime_count": "sum"}).reset_index()
rnn_full.to_csv(f"{data_path}rnn_full_data.csv", index=False)