In [1]:
import sqlite3
import re
import pandas as pd
import plotly.express as px
from collections import Iterable
from sqlite3 import connect
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
import pmdarima as pm # Make sure to be on geopandas library so that this is properly installed
from pmdarima.model_selection import train_test_split # Make sure to be on geopandas library so that this is properly installed

  """


In [None]:
df_police_force = pd.read_csv('police force dataset grouped and preprocessed.csv', sep=';')
df_police_force_p_month = df_police_force.groupby('Month')['COUNT(*)'].sum()
df_police_force_p_month = df_police_force_p_month.to_frame()
df_police_force_p_month.reset_index(inplace = True)
df_police_force_p_month = df_police_force_p_month.rename(columns = {'COUNT(*)': 'Number of crimes'})
# Change to datetime, in order to make it a valid time series
df_police_force_p_month['Month'] = pd.to_datetime(df_police_force_p_month['Month'], format="%Y-%m", exact=True)

In [None]:
def r2_adj_from_r2(r2,n,k = 1):
    #n is the number of values (samples) in the data
    #k is the number of variables in the data --> normally 1, namely the time
    a1= (1-r2) * (1-n)
    a2= n-k-1
    a3= 1
    r2_adj = a3 - (a1/a2)
    return r2_adj

# Performance measures based on y_true and y_pred
def timeseries_evaluation_metrics_func(y_true, y_pred):
    def mean_absolute_percentage_error(y_true, y_pred): 
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    print(f'Number of months in test data: {len(y_true)}')
    print('')
    print('Evaluation metric results:-')
    print(f'(Mean Squared Error) MSE is : {metrics.mean_squared_error(y_true, y_pred)}')
    print(f'(Mean Absolute Error) MAE is : {metrics.mean_absolute_error(y_true, y_pred)}')
    print(f'(Root Mean Square Error) RMSE is : {np.sqrt(metrics.mean_squared_error(y_true, y_pred))}')
    print(f'(Mean Absolute Percentage Error) MAPE is : {mean_absolute_percentage_error(y_true, y_pred)}')
    # Computing R^2 by hand:
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    mean_true = y_true.mean()
    R2 = 1 - (sum((y_true - y_pred)**2) / sum((y_true - mean_true)**2))
    # Same result with 
    print(f'(R-Squared) R2 is : {R2}')
    adj_R2 = r2_adj_from_r2(R2,len(y_pred))
    print(f'(adjusted R-Squared) adj-R2 is : {adj_R2}',end='\n\n')
    return metrics.mean_squared_error(y_true, y_pred),metrics.mean_absolute_error(y_true, y_pred),np.sqrt(metrics.mean_squared_error(y_true, y_pred)),mean_absolute_percentage_error(y_true, y_pred),R2,adj_R2

In [None]:
# Defining the different prediction methods
def perform_prediction(method, train, test):
    
    preds = []
    
    if method == 'RF':
        months = [x.month for x in train.index]
        years = [x.year for x in train.index]
        # Day is meaningless in our case, they are all on the 1st day of the month because of datetime 
        # convert into one matrix
        x_train = np.array([months, years]).T # Contains the info about the data, so the information, based on which
        # predictions should be made.
        
        months = [x.month for x in test.index]
        years = [x.year for x in test.index]
        # Day is meaningless in our case, they are all on the 1st day of the month because of datetime 
        # convert into one matrix
        x_test = np.array([months, years]).T
    
        # fit the model
        my_rf = RandomForestRegressor()
        my_rf.fit(x_train, train)
        model = my_rf

        # predict on the same period
        preds = my_rf.predict(x_test)
        preds = preds.tolist()
        
    if method == 'ARIMA':
        model = pm.auto_arima(train.values, seasonal=True, m=12, start_p=2, 
                              start_q=2, max_order=5, information_criterion='aic', 
                              test='kpss', seasonal_test='ocsb', stepwise=True,
                              start_params=None, method='lbfgs', trace = True,
                              scoring='mse', with_intercept="auto")
        # influence: info_crit, test, seasonal_test, method, scoring, with_intercept
        preds = model.predict(test.shape[0])

    if method == 'XGB':
        months = [x.month for x in train.index]
        years = [x.year for x in train.index]
        # Day is meaningless in our case, they are all on the 1st day of the month because of datetime 
        # convert into one matrix
        x_train = np.array([months, years]).T # Contains the info about the data, so the information, based on which
        # predictions should be made.
        
        months = [x.month for x in test.index]
        years = [x.year for x in test.index]
        # Day is meaningless in our case, they are all on the 1st day of the month because of datetime 
        # convert into one matrix
        x_test = np.array([months, years]).T
        # fit the model
        my_xgb = xgb.XGBRegressor()
        my_xgb.fit(x_train, train)
        model = my_xgb

        # predict on the same period
        preds = my_xgb.predict(x_test)
        preds = preds.tolist()
    
    return preds, model

In [None]:
def prediction_test(nr_months_in_test, ts, method): # Select number of month to predict
        
    train, test = ts.iloc[:-nr_months_in_test], ts.iloc[-nr_months_in_test:]
    train = train.set_index('Month')
    test = test.set_index('Month')
    
    # Which prediction method to use
    y_pred, model = perform_prediction(method, train, test)
    
    y_true = test['Number of crimes']
    y_true = y_true.values.tolist()
    # test data is the same as y_true
    
    return y_pred, y_true, model

In [None]:
y_pred, y_true, model = prediction_test(39, df_police_force_p_month, 'ARIMA')

In [67]:
timeseries_evaluation_metrics_func(y_pred, y_true)

Number of months in test data: 39

Evaluation metric results:-
(Mean Squared Error) MSE is : 890380135.4100586
(Mean Absolute Error) MAE is : 23057.539890927987
(Root Mean Square Error) RMSE is : 29839.238184143687
(Mean Absolute Percentage Error) MAPE is : 4.272761207684081
(R-Squared) R2 is : -1.121677890757073
(adjusted R-Squared) adj-R2 is : 3.17902053645321



(890380135.4100586,
 23057.539890927987,
 29839.238184143687,
 4.272761207684081,
 -1.121677890757073,
 3.17902053645321)

In [68]:
y_pred, y_true

(array([592885.25534286, 569821.15271515, 583744.29905802, 549852.28049969,
        520456.93859686, 526630.62177312, 501169.77377343, 537912.09840887,
        547651.77683173, 569884.94854803, 569309.92201141, 588508.22282208,
        571128.36221151, 550483.50520801, 565484.90490324, 544431.76016402,
        523247.11449551, 527426.7717589 , 509460.00997839, 531168.57994285,
        536588.67207718, 555183.67479428, 553163.4028745 , 566548.72916762,
        554960.98794805, 546703.26836197, 551110.06093719, 537263.6737518 ,
        525823.51653102, 528260.66145065, 518286.94900166, 533473.43008963,
        537561.19211632, 545701.51925437, 545785.45227655, 553329.58497901,
        546396.47553589, 537101.8732194 , 544161.6605747 ]),
 [580732,
  547800,
  582858,
  548308,
  512179,
  531272,
  509511,
  573725,
  557767,
  576331,
  569265,
  577119,
  548566,
  518887,
  544119,
  513122,
  485918,
  514274,
  490791,
  496445,
  530315,
  573629,
  553903,
  570674,
  575250,
  540

# Finding average scores for 10 runs

Some metrics (RF) are randomized for each time the model is trained. Thus we can run it multiple times and take the average

In [43]:
def average_scores_10_trainings(nr_months_in_test, ts, method):
    list_mse = []
    list_mae = []
    list_rmse = []
    list_mape = []
    list_r2 = []
    list_adj_r2 = []
    for i in range(10):
        y_pred, y_true, model = prediction_test(nr_months_in_test, ts, method)
        mse, mae, rmse, mape, r2, adj_r2 = timeseries_evaluation_metrics_func(y_pred, y_true)
        list_mse.append(mse)
        list_mae.append(mae)
        list_rmse.append(rmse)
        list_mape.append(mape)
        list_r2.append(r2)
        list_adj_r2.append(adj_r2)
        
    list_mse = np.array(list_mse)
    list_mae = np.array(list_mae)
    list_rmse = np.array(list_rmse)
    list_mape = np.array(list_mape)
    list_r2 = np.array(list_r2)
    list_adj_r2 = np.array(list_adj_r2)
    
    mse_mean = list_mse.mean()
    mae_mean = list_mae.mean()
    rmse_mean = list_rmse.mean()
    mape_mean = list_mape.mean()
    r2_mean = list_r2.mean()
    adj_r2_mean = list_adj_r2.mean()
    
    mse_sd = list_mse
    mae_sd = list_mae
    rmse_sd = list_rmse
    mape_sd = list_mape
    r2_sd = list_r2
    adj_r2_sd = list_adj_r2
    
    print(f'Number of months in test data: {len(y_true)}')
    print('')
    print('Evaluation metric results:-')
    print(f'(Average Mean Squared Error) MSE is : {mse_mean}, sd: {mse_sd.mean()}')
    print(f'(Average Mean Absolute Error) MAE is : {mae_mean}, sd: {mae_sd.mean()}')
    print(f'(Average Root Mean Square Error) RMSE is : {rmse_mean}, sd: {rmse_sd.mean()}')
    print(f'(Average Mean Absolute Percentage Error) MAPE is : {mape_mean}, sd: {mape_sd.mean()}')
    # Computing R^2 by hand:
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    mean_true = y_true.mean()
    R2 = 1 - (sum((y_true - y_pred)**2) / sum((y_true - mean_true)**2))
    # Same result with 
    print(f'(Average R-Squared) R2 is : {r2_mean}, sd: {r2_sd.mean()}')
    adj_R2 = r2_adj_from_r2(R2,len(y_pred))
    print(f'(Average adjusted R-Squared) adj-R2 is : {adj_r2_mean}, sd: {adj_r2_sd.mean()}',end='\n\n')

In [48]:
average_scores_10_trainings(18, df_police_force_p_month, 'RF')



Number of months in test data: 18

Evaluation metric results:-
(Mean Squared Error) MSE is : 1020766549.0407445
(Mean Absolute Error) MAE is : 28325.870000000003
(Root Mean Square Error) RMSE is : 31949.437382225442
(Mean Absolute Percentage Error) MAPE is : 5.3470738207520245
(R-Squared) R2 is : -2.1408849448976994
(adjusted R-Squared) adj-R2 is : 4.337190253953805

Number of months in test data: 18

Evaluation metric results:-
(Mean Squared Error) MSE is : 1046872273.064567
(Mean Absolute Error) MAE is : 28757.49666666667
(Root Mean Square Error) RMSE is : 32355.405623551793
(Mean Absolute Percentage Error) MAPE is : 5.425956519012027
(R-Squared) R2 is : -2.0435471687311706
(adjusted R-Squared) adj-R2 is : 4.2337688667768685

Number of months in test data: 18

Evaluation metric results:-




(Mean Squared Error) MSE is : 1098435787.5755668
(Mean Absolute Error) MAE is : 28691.651111111114
(Root Mean Square Error) RMSE is : 33142.65812477277
(Mean Absolute Percentage Error) MAPE is : 5.384847970975674
(R-Squared) R2 is : -1.716121340091214
(adjusted R-Squared) adj-R2 is : 3.885878923846915

Number of months in test data: 18

Evaluation metric results:-
(Mean Squared Error) MSE is : 1079311002.152739
(Mean Absolute Error) MAE is : 28315.626111111105
(Root Mean Square Error) RMSE is : 32852.869009460024
(Mean Absolute Percentage Error) MAPE is : 5.314387198880436
(R-Squared) R2 is : -1.6366679270801416
(adjusted R-Squared) adj-R2 is : 3.8014596725226504

Number of months in test data: 18

Evaluation metric results:-
(Mean Squared Error) MSE is : 992987482.3176665
(Mean Absolute Error) MAE is : 28282.46
(Root Mean Square Error) RMSE is : 31511.703894230577
(Mean Absolute Percentage Error) MAPE is : 5.355087919992299
(R-Squared) R2 is : -2.570579571011339
(adjusted R-Squared) a



Number of months in test data: 18

Evaluation metric results:-
(Mean Squared Error) MSE is : 1056608552.2409055
(Mean Absolute Error) MAE is : 28506.21277777778
(Root Mean Square Error) RMSE is : 32505.51572027285
(Mean Absolute Percentage Error) MAPE is : 5.370407349120124
(R-Squared) R2 is : -2.1571247228322656
(adjusted R-Squared) adj-R2 is : 4.354445018009282

Number of months in test data: 18

Evaluation metric results:-
(Mean Squared Error) MSE is : 1018505890.7480834
(Mean Absolute Error) MAE is : 28691.881666666668
(Root Mean Square Error) RMSE is : 31914.039085457098
(Mean Absolute Percentage Error) MAPE is : 5.422087717055158
(R-Squared) R2 is : -2.2452858868498686
(adjusted R-Squared) adj-R2 is : 4.448116254777986





Number of months in test data: 18

Evaluation metric results:-
(Mean Squared Error) MSE is : 1008050849.3129781
(Mean Absolute Error) MAE is : 28495.061111111114
(Root Mean Square Error) RMSE is : 31749.816524083697
(Mean Absolute Percentage Error) MAPE is : 5.389482349996893
(R-Squared) R2 is : -2.3453328103868016
(adjusted R-Squared) adj-R2 is : 4.554416111035977

Number of months in test data: 18

Evaluation metric results:-
(Mean Squared Error) MSE is : 1053896514.4111161
(Mean Absolute Error) MAE is : 28838.84055555555
(Root Mean Square Error) RMSE is : 32463.772337963375
(Mean Absolute Percentage Error) MAPE is : 5.441675617197126
(R-Squared) R2 is : -2.563350757777497
(adjusted R-Squared) adj-R2 is : 4.786060180138591





Number of months in test data: 18

Evaluation metric results:-
(Mean Squared Error) MSE is : 1006379967.6307223
(Mean Absolute Error) MAE is : 28137.004444444443
(Root Mean Square Error) RMSE is : 31723.49236182426
(Mean Absolute Percentage Error) MAPE is : 5.31457549798195
(R-Squared) R2 is : -1.8353984722193801
(adjusted R-Squared) adj-R2 is : 4.012610876733092

Number of months in test data: 18

Evaluation metric results:-
(Average Mean Squared Error) MSE is : 1038181486.8495089, sd: 1038181486.8495089
(Average Mean Absolute Error) MAE is : 28504.21044444444, sd: 28504.21044444444
(Average Root Mean Square Error) RMSE is : 32216.87100638419, sd: 32216.87100638419
(Average Mean Absolute Percentage Error) MAPE is : 5.37655819609637, sd: 5.37655819609637
(Average R-Squared) R2 is : -2.125429360187738, sd: -2.125429360187738
(Average adjusted R-Squared) adj-R2 is : 4.3207686951994715, sd: 4.3207686951994715

