In [11]:
################### standard ###################
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

################### allow import from src ###################
import sys, os
sys.path.append(os.path.abspath(".."))

################### pipeline step ###################
import src.data_prep as data
import src.models.linear_reg as linear
import src.config.feature_def as feature_def
import src.models.utils as utils

In [12]:
################ load data ################
# Build the monthly modelling table from the raw crime-indicator CSV.
monthly = data.prepare_monthly_dataset("../data/Major_Crime_Indicators_Open_Data.csv")

################ define features ################
# Column names come from the shared feature-definition module.
target_col = feature_def.TARGET_COL
feature_cols = feature_def.FEATURE_NN_COLS

# Sanity check: show the selected feature/target names next to the columns
# that are actually present in the prepared frame.
print(
    f"Using {len(feature_cols)} features: {feature_cols}",
    f"Target column: {target_col}",
    f"Available columns in monthly: {list(monthly.columns)}",
    sep="\n",
)

Using 9 features: ['NEIGHBOURHOOD_158', 'REPORT_YEAR', 'REPORT_MONTH', 'Prev_Month_NSI', 'NSI_3M_Avg', 'x', 'y', 'Crime_Count', 'TotalCrimeScore']
Target column: NSI
Available columns in monthly: ['NEIGHBOURHOOD_158', 'REPORT_YEAR', 'REPORT_MONTH', 'TotalCrimeScore', 'Crime_Count', 'x', 'y', 'NSI', 'Prev_Month_NSI', 'NSI_3M_Avg']


In [13]:
# Render the prepared frame, report its (rows, columns) shape, and write a CSV
# snapshot (without the index) to the current working directory.
display(monthly)
print(monthly.shape)
monthly.to_csv(path_or_buf="output.csv", index=False)


Unnamed: 0,NEIGHBOURHOOD_158,REPORT_YEAR,REPORT_MONTH,TotalCrimeScore,Crime_Count,x,y,NSI,Prev_Month_NSI,NSI_3M_Avg
0,0,2014,4,42,11,-8.824712e+06,5.435661e+06,0.914405,0.918580,0.885873
8,0,2014,5,29,8,-8.823638e+06,5.436439e+06,0.941545,0.914405,0.901183
6,0,2014,6,63,17,-8.824837e+06,5.435397e+06,0.870564,0.941545,0.924843
5,0,2014,7,52,13,-8.824463e+06,5.435727e+06,0.893528,0.870564,0.908838
1,0,2014,8,62,16,-8.824389e+06,5.435626e+06,0.872651,0.893528,0.901879
...,...,...,...,...,...,...,...,...,...,...
22385,158,2025,5,152,49,-8.844683e+06,5.421844e+06,0.684760,0.805846,0.788448
22383,158,2025,6,107,34,-8.844762e+06,5.421764e+06,0.778706,0.684760,0.727209
22382,158,2025,7,179,52,-8.844881e+06,5.421330e+06,0.628392,0.778706,0.756437
22379,158,2025,8,154,47,-8.844907e+06,5.421547e+06,0.680585,0.628392,0.697286


(21910, 10)


In [14]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear head applied to the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of each LSTM layer's hidden state.
        num_layers: number of stacked LSTM layers.
        output_size: number of values produced per input sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        # Kept as attributes so the configuration can be inspected later.
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # batch_first=True: inputs and outputs are (batch, seq_len, features).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Maps the final hidden representation to the prediction.
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run a batch of sequences through the network.

        Args:
            x: tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, output_size), computed from the LSTM
            output at the last time step of each sequence.
        """
        # nn.LSTM starts from all-zero hidden and cell states when none are
        # supplied, created to match the input's device and dtype. Relying on
        # that avoids allocating the states on the CPU in the default dtype and
        # copying them over on every call, and keeps the module usable after
        # .double()/.half().
        out, _ = self.lstm(x)

        # Only the last time step feeds the linear head.
        return self.fc(out[:, -1, :])

# Hyperparameters shared by the cells below. The LSTM's input_size is not set
# here; it is derived from the generated sequences once they exist.

# -- network --
hidden_size = 64   # width of each LSTM layer's hidden state
num_layers = 2     # stacked LSTM layers (worth experimenting with)
output_size = 1    # values predicted per sequence

# -- data --
seq_length = 12    # each input window covers 12 monthly rows
batch_size = 32    # samples per mini-batch

In [16]:
def train(model, dataloader, criterion, optimizer, epochs, device='cpu'):
    """Train ``model`` in place and print the mean batch loss after each epoch.

    Args:
        model: the ``nn.Module`` to optimise; it is moved to ``device``.
        dataloader: iterable yielding ``(inputs, targets)`` batches. It is
            re-iterated once per epoch and does not need to support ``len()``.
        criterion: loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: optimiser already bound to ``model``'s parameters.
        epochs: number of full passes over ``dataloader``.
        device: device the model and every batch are moved to.

    Returns:
        The same ``model`` object, after training.

    Raises:
        ValueError: if ``dataloader`` yields no batches in an epoch (for
            example ``drop_last=True`` with fewer samples than one batch),
            since there is no loss to average in that case.
    """
    model.to(device)
    model.train()

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0  # counted while iterating, so len(dataloader) is not required

        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Backward pass: clear stale gradients, backpropagate, update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError(
                "dataloader produced no batches; cannot compute an average loss"
            )

        avg_loss = total_loss / num_batches
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}')

    return model

In [20]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
# Module-level scalers. create_sequences_per_neighborhood() rebinds both names
# to freshly fitted instances; the target scaler is used again after training
# to map predictions back to the original target scale.
feature_scaler, target_scaler = MinMaxScaler(), MinMaxScaler()

def create_sequences_per_neighborhood(df, feature_columns, target_column, seq_length,
                                      group_column='NEIGHBOURHOOD_158',
                                      time_columns=('REPORT_YEAR', 'REPORT_MONTH')):
    """Turn a long table into scaled sliding-window sequences, one group at a time.

    Rows are grouped by ``group_column`` and ordered by ``time_columns``, so a
    window never mixes rows from two groups. Each sample pairs ``seq_length``
    consecutive rows of scaled features with the scaled target of the row that
    immediately follows the window.

    Side effect: the module-level ``feature_scaler`` and ``target_scaler`` are
    replaced by new ``MinMaxScaler`` instances fitted on **all** rows of ``df``.
    NOTE(review): if the caller splits into train/test afterwards, the scalers
    have already seen the test rows; fit on the training portion only if that
    leakage matters.

    Args:
        df: frame containing the group, time, feature and target columns.
        feature_columns: column names used as per-step model inputs.
        target_column: name of the column to predict.
        seq_length: number of consecutive rows per input window.
        group_column: column identifying independent series.
        time_columns: column(s) that order rows within a group.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_samples, seq_length, n_features)``
        and ``y`` has shape ``(n_samples, 1)``. When no group has more than
        ``seq_length`` rows, both arrays are empty but keep those trailing
        dimensions.
    """
    global feature_scaler, target_scaler

    # Fit both scalers once on the whole frame so every group shares one scale.
    all_features = df[feature_columns].values
    all_targets = df[target_column].values.reshape(-1, 1)
    feature_scaler = MinMaxScaler().fit(all_features)
    target_scaler = MinMaxScaler().fit(all_targets)

    X_list, y_list = [], []

    # A single groupby pass replaces one boolean-mask scan of the whole frame
    # per group; sort=False keeps groups in order of first appearance.
    for _, group_rows in df.groupby(group_column, sort=False):
        # Chronological order inside the group.
        # Assumes consecutive rows are consecutive periods (no gaps) - TODO confirm.
        group_rows = group_rows.sort_values(list(time_columns))

        features_scaled = feature_scaler.transform(group_rows[feature_columns].values)
        target_scaled = target_scaler.transform(
            group_rows[target_column].values.reshape(-1, 1)
        )

        # Stride-1 windows; the label is the row right after each window.
        for start in range(len(features_scaled) - seq_length):
            stop = start + seq_length
            X_list.append(features_scaled[start:stop])
            y_list.append(target_scaled[stop])

    if not X_list:
        # Keep the documented rank so callers reading X.shape[2] do not fail.
        return np.empty((0, seq_length, all_features.shape[1])), np.empty((0, 1))

    return np.array(X_list), np.array(y_list)






In [21]:
# Build per-neighbourhood sliding-window sequences from the monthly table.
feature_cols = feature_def.FEATURE_NN_COLS
target_col = feature_def.TARGET_COL
X, y = create_sequences_per_neighborhood(
    monthly,
    feature_cols,
    target_col,
    seq_length=seq_length,  # use the configured window length, not a literal
)

# Debug: check actual shapes
print(f"feature_cols length: {len(feature_cols)}")
print(f"X.shape: {X.shape}")
print(f"y.shape: {y.shape}")
print(f"Expected input_size for LSTM: {X.shape[2]}")

# NOTE(review): the windows are built with stride 1, so neighbouring windows
# of one neighbourhood share most of their rows. A random split therefore
# puts near-duplicate sequences on both sides and the test metrics are
# optimistic; consider a chronological (or per-neighbourhood) hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert straight to float32 tensors (the dtype the model's weights use).
X_train_tensor = torch.as_tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.as_tensor(y_train, dtype=torch.float32)

X_test_tensor = torch.as_tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.as_tensor(y_test, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# batch_size comes from the hyperparameter cell so there is one place to tune it.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,     # reshuffle the training windows every epoch
    drop_last=True    # drop the final short batch so every step sees a full batch
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False     # keep evaluation order fixed
)

feature_cols length: 9
X.shape: (20002, 12, 9)
y.shape: (20002, 1)
Expected input_size for LSTM: 9


In [22]:
# Seed torch's RNGs so weight initialisation and the training loader's
# shuffling are repeatable across kernel restarts. (Some GPU kernels may
# still be non-deterministic.)
SEED = 42
torch.manual_seed(SEED)

# Derive input_size from actual data shape
input_size = X.shape[2]  # Number of features per timestep
print(f"Setting input_size = {input_size} based on X.shape = {X.shape}")

model = LSTMModel(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=output_size  # configured value rather than a literal
)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# train() optimises the model in place and returns the same object.
trained_model = train(
    model,
    train_loader,
    criterion,
    optimizer,
    epochs=50,
    device=device
)

# Evaluation: switch to eval mode and skip gradient tracking.
trained_model.eval()
predictions_scaled = []
actuals_scaled = []
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs = inputs.to(device)
        outputs = trained_model(inputs)
        predictions_scaled.append(outputs.cpu().numpy())
        actuals_scaled.append(targets.numpy())  # targets never leave the CPU

predictions_scaled = np.concatenate(predictions_scaled)
actuals_scaled = np.concatenate(actuals_scaled)

# Inverse transform to get actual NSI values (pure NumPy, so it does not need
# to sit inside the no_grad block).
predictions_nsi = target_scaler.inverse_transform(predictions_scaled)
actuals_nsi = target_scaler.inverse_transform(actuals_scaled)
print(predictions_nsi)
print(predictions_scaled)

Setting input_size = 9 based on X.shape = (20002, 12, 9)
Epoch [1/50], Loss: 0.0124
Epoch [1/50], Loss: 0.0124
Epoch [2/50], Loss: 0.0028
Epoch [2/50], Loss: 0.0028
Epoch [3/50], Loss: 0.0027
Epoch [3/50], Loss: 0.0027
Epoch [4/50], Loss: 0.0027
Epoch [4/50], Loss: 0.0027
Epoch [5/50], Loss: 0.0027
Epoch [5/50], Loss: 0.0027
Epoch [6/50], Loss: 0.0027
Epoch [6/50], Loss: 0.0027
Epoch [7/50], Loss: 0.0027
Epoch [7/50], Loss: 0.0027
Epoch [8/50], Loss: 0.0026
Epoch [8/50], Loss: 0.0026
Epoch [9/50], Loss: 0.0026
Epoch [9/50], Loss: 0.0026
Epoch [10/50], Loss: 0.0026
Epoch [10/50], Loss: 0.0026
Epoch [11/50], Loss: 0.0026
Epoch [11/50], Loss: 0.0026
Epoch [12/50], Loss: 0.0026
Epoch [12/50], Loss: 0.0026
Epoch [13/50], Loss: 0.0026
Epoch [13/50], Loss: 0.0026
Epoch [14/50], Loss: 0.0026
Epoch [14/50], Loss: 0.0026
Epoch [15/50], Loss: 0.0026
Epoch [15/50], Loss: 0.0026
Epoch [16/50], Loss: 0.0026
Epoch [16/50], Loss: 0.0026
Epoch [17/50], Loss: 0.0025
Epoch [17/50], Loss: 0.0025
Epoch [18

In [23]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Score the predictions on the original (inverse-scaled) target values.
mse, mae, r2 = (
    metric(actuals_nsi, predictions_nsi)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# One print call, one line per metric, preceded by a blank line and a header.
print(
    f"\nNSI Prediction Results:",
    f"MSE: {mse:.4f}",
    f"MAE: {mae:.4f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)


NSI Prediction Results:
MSE: 0.0023
MAE: 0.0351
R² Score: 0.8352
