In [37]:
import pandas as pd
import numpy as np

# Directory prefix for the input CSV files. It is joined to file names by plain
# string concatenation when the data is loaded, so the trailing slash must stay.
data_path = "data/"

In [39]:
# Read the train and test splits from the directory given by `data_path`.
train_csv = data_path + "classification_train_data.csv"
test_csv = data_path + "classification_test_data.csv"
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# List the available columns, then preview the first rows as the cell output.
print(df_train.columns)
df_train.head(n=5)

Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude',
       'Location', 'Month', 'Day', 'Hour', 'WeekDay', 'IsWeekend',
       'Location Group', 'IsHoliday', 'lat_bin', 'lon_bin', 'TimeCategory',
       'Season'],
      dtype='object')


Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Day,Hour,WeekDay,IsWeekend,Location Group,IsHoliday,lat_bin,lon_bin,TimeCategory,Season
0,12332668,JE190281,2021-01-01 00:00:00,104XX S CENTRAL PARK AVE,2826,OTHER OFFENSE,HARASSMENT BY ELECTRONIC MEANS,RESIDENCE,False,False,...,1,0,4,False,RESIDENTIAL,False,"(41.698, 41.806]","(-87.774, -87.691]",Night,Winter
1,12300454,JE151164,2021-01-01 00:00:00,039XX W ARGYLE ST,1151,DECEPTIVE PRACTICE,ILLEGAL POSSESSION CASH CARD,APARTMENT,False,False,...,1,0,4,False,RESIDENTIAL,False,"(41.914, 42.023]","(-87.774, -87.691]",Night,Winter
2,12540683,JE412763,2021-01-01 00:00:00,052XX N LINCOLN AVE,2826,OTHER OFFENSE,HARASSMENT BY ELECTRONIC MEANS,OTHER (SPECIFY),False,False,...,1,0,4,False,OTHER,False,"(41.914, 42.023]","(-87.774, -87.691]",Night,Winter
3,12419052,JE296203,2021-01-01 00:00:00,025XX S CHRISTIANA AVE,1754,OFFENSE INVOLVING CHILDREN,AGGRAVATED SEXUAL ASSAULT OF CHILD BY FAMILY M...,RESIDENCE,True,True,...,1,0,4,False,RESIDENTIAL,False,"(41.806, 41.914]","(-87.774, -87.691]",Night,Winter
4,12442633,JE324967,2021-01-01 00:00:00,026XX W FITCH AVE,2825,OTHER OFFENSE,HARASSMENT BY TELEPHONE,OTHER (SPECIFY),False,False,...,1,0,4,False,OTHER,False,"(41.914, 42.023]","(-87.774, -87.691]",Night,Winter


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Parse the Date column into real datetimes so that sorting is chronological.
df_train["Date"] = pd.to_datetime(df_train["Date"])
df_test["Date"] = pd.to_datetime(df_test["Date"])

# Sort chronologically with a *stable* algorithm. Several rows share the exact
# same timestamp (the first rows of the training data are all
# 2021-01-01 00:00:00), and pandas' default quicksort does not guarantee the
# relative order of such ties. The sliding windows built later depend on row
# order, so mergesort is used to keep tied rows in their original file order.
df_train = df_train.sort_values("Date", kind="mergesort")
df_test = df_test.sort_values("Date", kind="mergesort")

# Columns that make up each time step of the sequence data.
features = [
    "Hour",
    "Day",
    "Month",
    "WeekDay",
    "IsWeekend",
    "IsHoliday",
    "lat_bin",
    "lon_bin",
    "TimeCategory",
    "Season",
    "Location Group",
]

# Subset of `features` that holds categories and must be integer-coded.
categorical_features = [
    "lat_bin",
    "lon_bin",
    "TimeCategory",
    "Season",
    "Location Group",
]

# Fit one encoder per categorical column on the training data and keep every
# fitted encoder, keyed by column name. Without this dict the loop variable is
# overwritten on each iteration and only the last column's mapping survives,
# so the codes of the other columns could not be inspected or inverted.
feature_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    df_train[feature + "_encoded"] = le.fit_transform(df_train[feature])
    # transform() raises ValueError if the test split contains a category
    # that never occurs in the training split.
    df_test[feature + "_encoded"] = le.transform(df_test[feature])
    feature_encoders[feature] = le

# Same order as `features`, with categorical columns swapped for their
# integer-coded "<name>_encoded" counterparts.
encoded_features = [
    f + "_encoded" if f in categorical_features else f for f in features
]

# Encode the target ("Primary Type") as integer class ids. The encoder is fit
# on the training labels only; its `classes_` attribute defines the label space.
label_encoder = LabelEncoder()
df_train["target_encoded"] = label_encoder.fit_transform(df_train["Primary Type"])
df_test["target_encoded"] = label_encoder.transform(df_test["Primary Type"])


# Function to create sequences
def create_sequences(data, features, target, seq_length):
    """Build sliding-window samples from a row-ordered DataFrame.

    Window ``i`` contains rows ``i .. i + seq_length - 1`` of the feature
    columns and is labelled with the target value of row ``i + seq_length``
    (the row immediately after the window). Rows are addressed by position,
    not by index label, so ``data`` must already be in the desired order.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows, already sorted.
    features : list of str
        Column names that form each time step.
    target : str
        Column name holding the label.
    seq_length : int
        Number of consecutive rows per window; must be non-negative.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_windows, seq_length, len(features))`` where
        ``n_windows = max(len(data) - seq_length, 0)``.
    y : numpy.ndarray
        Shape ``(n_windows,)``.

    Raises
    ------
    ValueError
        If ``seq_length`` is negative.
    """
    if seq_length < 0:
        raise ValueError(f"seq_length must be non-negative, got {seq_length}")

    # Pull the columns out once. Selecting `data[features]` inside a loop
    # copies the whole feature frame for every single window.
    feature_matrix = data[features].to_numpy()
    target_values = data[target].to_numpy()

    # Zero windows (rather than a negative count) when the frame is too short,
    # so the result keeps its 3-D shape instead of collapsing to shape (0,).
    n_windows = max(len(feature_matrix) - seq_length, 0)

    # Row i of `window_rows` holds the positions i, i+1, ..., i+seq_length-1.
    window_rows = np.arange(n_windows)[:, None] + np.arange(seq_length)[None, :]

    X = feature_matrix[window_rows]
    # Copy so the returned labels do not alias the DataFrame's storage.
    y = target_values[seq_length:].copy()
    return X, y


# Create sequences with window size of 24
# Each sample is `sequence_length` consecutive rows of features, labelled with
# the encoded crime type of the row that follows the window.
sequence_length = 24
X_train, y_train = create_sequences(
    df_train, encoded_features, "target_encoded", sequence_length
)
X_test, y_test = create_sequences(
    df_test, encoded_features, "target_encoded", sequence_length
)

# The feature set mixes integer columns (e.g. Hour) with boolean ones
# (IsWeekend, IsHoliday). pandas combines int and bool columns into
# dtype=object arrays, so cast to float32 to get plain numeric inputs.
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

# One-hot encode the target variable.
# The number of classes comes from the fitted label encoder, not from the
# labels that happen to appear in the windows: if any class were missing from
# y_train/y_test, counting unique labels would give a width smaller than the
# largest label index and the encoding would fail.
num_classes = len(label_encoder.classes_)

# Plain NumPy row lookup is used here so the cell does not depend on `tf`,
# which is not imported in any of the cells above.
one_hot_lookup = np.eye(num_classes, dtype=np.float32)
y_train_categorical = one_hot_lookup[y_train.astype(np.int64)]
y_test_categorical = one_hot_lookup[y_test.astype(np.int64)]

# Print shapes to confirm
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train_categorical.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test_categorical.shape}")