In [227]:
import pandas as pd
import re
import numpy as np

In [228]:
prev_year_path = "AL-2022"
to_predict_path = "AL-2023\\AL-2023\\AL"
output_prefix = re.search(r'\w+-\d+', to_predict_path).group()
predict_year = int(re.search(r'\d+', output_prefix).group())

In [229]:
incident_2022 = pd.read_csv(f"{prev_year_path}\\NIBRS_incident.csv")
offense_2022 = pd.read_csv(f"{prev_year_path}\\NIBRS_OFFENSE.csv")
print(len(incident_2022))
incident_2023 = pd.read_csv(f"{to_predict_path}\\NIBRS_incident.csv")
offense_2023 = pd.read_csv(f"{to_predict_path}\\NIBRS_OFFENSE.csv")
print(len(incident_2023), len(offense_2023))
incident = pd.concat([incident_2022,incident_2023], axis=0)
offense = pd.concat([offense_2022, offense_2023], axis=0)
print(len(incident))
crime_by_location = incident.merge(offense, on = ['data_year','incident_id'])
print(len(crime_by_location),crime_by_location.isna().sum())
print(crime_by_location.head())
print(incident_2023['incident_id'].duplicated().sum())
print(crime_by_location['incident_id'].duplicated().sum())
print(offense_2023['incident_id'].duplicated().sum())

214804
213114 247643
427918
498675 data_year                     0
agency_id                     0
incident_id                   0
nibrs_month_id                0
cargo_theft_flag              0
submission_date               0
incident_date                 0
report_date_flag              0
incident_hour                 0
cleared_except_id             0
cleared_except_date      460500
incident_status               0
data_home                498675
orig_format                   0
did                           0
offense_id                    0
offense_code                  0
attempt_complete_flag         0
location_id                   0
num_premises_entered     497120
method_entry_code        475305
dtype: int64
   data_year  agency_id  incident_id  nibrs_month_id cargo_theft_flag  \
0       2022         49    155372673        44169977                f   
1       2022         49    155372673        44169977                f   
2       2022         49    155372673        44169977         

In [230]:
crime_by_location['incident_date'] = pd.to_datetime(crime_by_location['incident_date'], errors='coerce')
crime_by_location = crime_by_location[
    (
        crime_by_location["incident_date"].dt.year==predict_year)
        | (
            (crime_by_location["incident_date"].dt.month.isin([9,10,11,12])) & (crime_by_location["incident_date"].dt.year == predict_year-1)
    )
]
crime_by_location['month'] = crime_by_location['incident_date'].dt.month
crime_by_location['day'] = crime_by_location['incident_date'].dt.day
print(crime_by_location[['incident_date', 'month']].head())

   incident_date  month
4     2022-12-26     12
5     2022-12-16     12
7     2022-12-01     12
8     2022-12-01     12
12    2022-11-28     11


In [231]:
crime_dropna = crime_by_location.drop(columns = ['data_home','num_premises_entered', 'method_entry_code', 'cleared_except_date'])

In [232]:
crime_dropna.isnull().sum()

data_year                0
agency_id                0
incident_id              0
nibrs_month_id           0
cargo_theft_flag         0
submission_date          0
incident_date            0
report_date_flag         0
incident_hour            0
cleared_except_id        0
incident_status          0
orig_format              0
did                      0
offense_id               0
offense_code             0
attempt_complete_flag    0
location_id              0
month                    0
day                      0
dtype: int64

In [233]:
crime_featured = crime_dropna.copy()
crime_featured["WeekDay"] = crime_featured["incident_date"].dt.weekday
crime_featured["IsWeekDay"] = (crime_featured["WeekDay"] < 5).astype(bool)
crime_featured["Isweekend"] = (crime_featured["WeekDay"] >= 5).astype(bool)
hour_bins = [0, 6, 12, 18, 24]
hour_labels = ["Night", "Morning", "Afternoon", "Evening"]
season_bins = [0, 3, 6, 9, 12]
season_labels = ["Winter", "Spring", "Summer", "Fall"]
crime_featured["Season"] = pd.cut(
    crime_featured["month"], bins=season_bins, labels=season_labels, include_lowest=True
)
crime_featured["TimeCategory"] = pd.cut(
    crime_featured["incident_hour"], bins=hour_bins, labels=hour_labels, include_lowest=True
)
time_dummies = pd.get_dummies(crime_featured["TimeCategory"])
crime_featured = pd.concat([crime_featured, time_dummies], axis=1)


In [234]:
from pandas.tseries.holiday import USFederalHolidayCalendar

cal = USFederalHolidayCalendar()

start_date = crime_featured["incident_date"].min()
end_date = crime_featured["incident_date"].max()

# Get holidays between these dates
holidays = cal.holidays(start=start_date, end=end_date)

# Check if each date is a holiday
crime_featured["IsHoliday"] = crime_featured["incident_date"].dt.date.isin(holidays.date).astype(int)

In [235]:
import calendar

crime_timed = crime_featured.copy()
crime_timed["crime_count"] = 1
holiday_calendar = (
    crime_timed[crime_timed["IsHoliday"] == 1]
    .drop_duplicates(subset=["data_year", "month", "day"])  # count each holiday once per day
    .groupby(["data_year", "month"], as_index=False)
    .agg(holiday_num=("day", "count"))
)

In [236]:
# Group by year and month and calculate aggregate values
aggregated_crime = crime_timed.groupby(['data_year','month','location_id'],as_index=False,
    observed=False,).agg(
    crime_count=("crime_count", "sum"),
    weekday=("IsWeekDay", "sum"),
    weekend=("Isweekend", "sum"),
    morning=("Morning", "sum"),
    afternoon=("Afternoon", "sum"),
    evening=("Evening", "sum"),
    night=("Night", "sum"),
   )
aggregated_crime = pd.merge(aggregated_crime, holiday_calendar, on=["data_year", "month"], how="left")
aggregated_crime["holiday_num"] = aggregated_crime["holiday_num"].fillna(0).astype(int)
aggregated_crime["num_days"] = aggregated_crime.apply(
    lambda row: calendar.monthrange(row["data_year"], row["month"])[1], axis=1
)
ls_features = [
    "morning",
    "afternoon",
    "evening",
    "night",
    # "holiday",
    # "weekday",
    # "weekend",
]
for feature in ls_features:
    aggregated_crime[f"{feature}_rate"] = aggregated_crime[feature] / aggregated_crime["crime_count"]
aggregated_crime["sin_month"] = np.sin(2 * np.pi * (aggregated_crime["month"] - 1) / 12)
aggregated_crime["cos_month"] = np.cos(2 * np.pi * (aggregated_crime["month"] - 1) / 12)
# Create a time ID variable for easier shifting
aggregated_crime["time_id"] = aggregated_crime["data_year"] * 12 + aggregated_crime["month"]

print(aggregated_crime)

     data_year  month  location_id  crime_count  weekday  weekend  morning  \
0         2022      9            1           18       10        8        8   
1         2022      9            2           19       18        1        6   
2         2022      9            3            6        4        2        3   
3         2022      9            4           22       13        9        1   
4         2022      9            5           10        9        1        1   
..         ...    ...          ...          ...      ...      ...      ...   
715       2023     12           41          258      184       74       49   
716       2023     12           42            2        2        0        1   
717       2023     12           43           47       24       23        5   
718       2023     12           44          270      188       82       81   
719       2023     12           98          758      570      188      263   

     afternoon  evening  night  holiday_num  num_days  morning_

In [237]:
# Sort to prepare for calculation
crime_predict = aggregated_crime.sort_values(['location_id','time_id']).reset_index(drop=True)

# Calculate crime increase rate compared to previous month
crime_predict['crime_increase'] = crime_predict.groupby('location_id')['crime_count'].pct_change()

# Create shifted features
crime_predict['prev_1_crime_increase'] = crime_predict['crime_increase'].shift(1)
crime_predict['prev_2_crime_increase'] = crime_predict['crime_increase'].shift(2)
crime_predict['prev_3_crime_increase'] = crime_predict['crime_increase'].shift(3)

# Calculate average of previous 3 months' increases
crime_predict['avg_crime_increase'] = crime_predict[
    ['prev_1_crime_increase', 'prev_2_crime_increase', 'prev_3_crime_increase']
].mean(axis=1)

# Filter for only needed data
features = crime_predict[crime_predict['data_year'] == predict_year].copy()
to_predict = features.drop(columns=['crime_increase'])
print("length before drop,",len(to_predict))
to_predict = to_predict.dropna()
print("length after drop,",len(to_predict))
to_predict.to_csv(f"{output_prefix}_to_predict.csv", index=False)

length before drop, 541
length after drop, 536


In [238]:
to_predict.isnull().sum()

data_year                0
month                    0
location_id              0
crime_count              0
weekday                  0
weekend                  0
morning                  0
afternoon                0
evening                  0
night                    0
holiday_num              0
num_days                 0
morning_rate             0
afternoon_rate           0
evening_rate             0
night_rate               0
sin_month                0
cos_month                0
time_id                  0
prev_1_crime_increase    0
prev_2_crime_increase    0
prev_3_crime_increase    0
avg_crime_increase       0
dtype: int64