In [59]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
import pandas as pd

## Target Variables

- **Primary Type**: Crime category classification
- **Arrest**: Binary prediction of whether an arrest will be made
- **Domestic**: Binary prediction of whether the incident is domestic-related
- **Crime Count**: Number of crimes in a specific area/time window
- **Crime Probability**: Likelihood of a specific crime type occurring

## Feature Variables

### Temporal Features

- Year, Month, Day of Month
- Day of Week, Weekday
- Hour of Day, TimeCategory (Morning/Afternoon/Evening/Night)
- Season
- Holiday indicators

### Spatial Features

- Latitude, Longitude
- lat_bin, lon_bin (discretized location)
- Community Area, District, Ward, Beat
- Location Group

### Context Features

- Previous crime patterns in the area
- Aggregated statistics (crime counts by type in preceding time windows)
- Socioeconomic indicators by area (if available)

## LSTM Approach for Crime Prediction

### Sequence Construction

For LSTM models, structure your data as sequences where each element represents a time window (day, shift, or hour) with relevant features:

1. **Time-based sequences**:

   - Fixed areas with sequences of crime activity over time
   - Example: 7-day sequences of crime counts for each community area

2. **Area-based sequences**:

   - Fixed time with sequences of adjacent spatial areas
   - Example: Sequence of crime counts across adjacent beats for a given day

3. **Spatiotemporal grid**:
   - Divide the city into grid cells
   - For each cell, create a time series of crime counts/types

### LSTM-Specific Feature Engineering

- **Temporal aggregations**:

  - Rolling windows of crime counts (previous 1, 3, 7, 30 days)
  - Crime type distribution in previous time periods
  - Trend indicators (increasing/decreasing crime rates)

- **Spatial context**:

  - Crime counts in neighboring areas
  - Distance to known hotspots
  - Point of interest density

- **Crime dynamics**:
  - Time since last similar crime in the area
  - Seasonal crime patterns
  - Weekly patterns (day of week effects)

### Recommended LSTM Implementations

1. **Many-to-One LSTM**:

   - Input: Sequence of past days/shifts for each area
   - Output: Prediction for next time period

2. **Encoder-Decoder LSTM**:

   - Input: Sequence of past crime patterns
   - Output: Sequence of future predictions (multi-day forecast)

3. **ConvLSTM**:

   - Combines CNN for spatial patterns with LSTM for temporal patterns
   - Treats the city as an image where each pixel represents crime intensity

4. **Attention-based LSTM**:
   - Helps the model determine which past time periods are most relevant for prediction
   - Useful for capturing long-term dependencies and seasonal patterns


In [None]:
# Read both splits from the data directory. Only a reproducible 10% sample of
# the training rows is kept (fixed random_state); the test split is kept whole.
data_path = "data/"
df_test = pd.read_csv(f"{data_path}classification_test_data.csv")
df_train = pd.read_csv(f"{data_path}classification_train_data.csv").sample(
    frac=0.1, random_state=42
)

In [61]:
# Per-column profile of both splits, aligned on column name: distinct-value
# counts and null counts first, then the dtypes, so train and test can be
# compared side by side.
splits = {"train": df_train, "test": df_test}

profile_columns = {}
for split_name, frame in splits.items():
    profile_columns[f"{split_name}_nunique"] = frame.nunique()
    profile_columns[f"{split_name}_null"] = frame.isna().sum()
for split_name, frame in splits.items():
    profile_columns[f"{split_name}_dtype"] = frame.dtypes

df_analysis = pd.DataFrame(profile_columns)
df_analysis

Unnamed: 0,train_nunique,train_null,test_nunique,test_null,train_dtype,test_dtype
ID,20000,0,40000,0,int64,int64
Case Number,20000,0,39999,0,object,object
Date,18749,0,31035,0,object,object
Block,11252,0,15938,0,object,object
IUCR,227,0,269,0,object,object
Primary Type,27,0,29,0,object,object
Description,214,0,249,0,object,object
Location Description,17,0,17,0,object,object
Arrest,2,0,2,0,bool,bool
Domestic,2,0,2,0,bool,bool


In [None]:
# Column groups handed to the ColumnTransformer, one list per encoding strategy.

# Binary flag columns (passed through unchanged).
# NOTE(review): confirm the flag column is really named "IsWeekend" in the CSVs.
bin_features = ["IsWeekend"]

# Columns that get standard-scaled: spatial identifiers, then calendar parts.
num_features = ["Beat", "Ward", "Community Area"]
num_features += ["Year", "Month", "Day", "Hour", "WeekDay"]

# Columns that get one-hot encoded.
cat_features = ["Location Group", "lat_bin", "lon_bin"]
cat_features += ["TimeCategory", "District", "Season"]

# Echo each group on its own line as a quick sanity check.
for feature_group in (bin_features, num_features, cat_features):
    print(feature_group)

['IsWeekend']
['Beat', 'Ward', 'Community Area', 'Year', 'Month', 'Day', 'Hour', 'WeekDay']
['Location Group', 'lat_bin', 'lon_bin', 'TimeCategory', 'District', 'Season']


In [None]:
# Build the model inputs for the "Domestic" target.
#
# Categorical columns are one-hot encoded and then compressed with PCA.
# handle_unknown="ignore" is required for robustness: the encoder is fitted on
# a 10% sample of the training data, so a category that occurs only in the test
# split would make `transform` raise with the default handle_unknown="error".
# With "ignore", such a value is encoded as all zeros for that column group;
# rows whose categories were all seen during fit are encoded exactly as before.
# NOTE(review): PCA(n_components=70) presumes the one-hot step yields at least
# 70 columns for the sampled training data - confirm if the features change.
cat_pipe = Pipeline(
    [
        ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
        ("pca", PCA(n_components=70, random_state=42)),
    ]
)

# One transformer per feature group; binary flags are passed through unchanged
# and numeric columns are standardised.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_pipe, cat_features),
        ("bin", "passthrough", bin_features),
        ("num", StandardScaler(), num_features),
    ]
)

# Fit on training data only
X_train = preprocessor.fit_transform(df_train)
y_train = df_train["Domestic"]

# Transform test data using the fitted preprocessor
X_test = preprocessor.transform(df_test)
y_test = df_test["Domestic"]

In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

In [65]:
# Baseline: random forest (100 trees, fixed seed) trained on the preprocessed
# training matrix and scored on the test matrix.
rfc_model = RandomForestClassifier(random_state=42, n_estimators=100)
rfc_model.fit(X_train, y_train)

y_pred = rfc_model.predict(X_test)
rfc_accuracy = accuracy_score(y_test, y_pred)
rfc_report = classification_report(y_test, y_pred)

print("Accuracy:", rfc_accuracy)
print("\nClassification Report:")
print(rfc_report)

Accuracy: 0.794525

Classification Report:
              precision    recall  f1-score   support

       False       0.85      0.91      0.88     32578
        True       0.42      0.28      0.34      7422

    accuracy                           0.79     40000
   macro avg       0.63      0.60      0.61     40000
weighted avg       0.77      0.79      0.78     40000



In [66]:
# Baseline: logistic regression (up to 1000 solver iterations, fixed seed)
# trained on the preprocessed training matrix and scored on the test matrix.
lrc_model = LogisticRegression(random_state=42, max_iter=1000)
lrc_model.fit(X_train, y_train)

y_pred = lrc_model.predict(X_test)
lrc_accuracy = accuracy_score(y_test, y_pred)
lrc_report = classification_report(y_test, y_pred)

print("Accuracy:", lrc_accuracy)
print("\nClassification Report:")
print(lrc_report)

Accuracy: 0.81575

Classification Report:
              precision    recall  f1-score   support

       False       0.82      0.99      0.90     32578
        True       0.53      0.07      0.12      7422

    accuracy                           0.82     40000
   macro avg       0.68      0.53      0.51     40000
weighted avg       0.77      0.82      0.75     40000



In [67]:
# Disabled experiment: gradient-boosting baseline, following the same
# fit / predict / report steps as the other model cells.
# NOTE: despite the `xgb_` prefix this uses scikit-learn's
# GradientBoostingClassifier, not the XGBoost library.
# xgb_model = GradientBoostingClassifier(random_state=42)
# xgb_model.fit(X_train, y_train)
# y_pred = xgb_model.predict(X_test)
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))