In [None]:
import math, time 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pmdarima.arima import auto_arima, ADFTest
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error
from mutil import *


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Convert a duration in seconds to the hh:mm:ss format
def seconds_to_time(seconds:float):
    """Format a duration given in seconds as a zero-padded ``hh:mm:ss`` string.

    The fractional part is discarded with ``int()`` before formatting.
    Hours are not wrapped at 24, so long durations yield e.g. ``'100:00:00'``.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        The formatted ``hh:mm:ss`` string.
    """
    whole_seconds = int(seconds)
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f'{hours:02d}:{minutes:02d}:{secs:02d}'

### Cargar los datasets

In [None]:
# Load the incident and camera tables, then run the shared preprocessing step.
# The raw incident table keeps its own name so it is not overwritten by the preprocessed one.
delitos_raw, df_camaras = load_data()
df_delitos = preprocess_data(delitos_raw, df_camaras)

### Filtros

In [None]:
# Count the occurrences of each incident type, most frequent first
incidentes_c4_frecuencia = df_delitos['incidente_c4'].value_counts().sort_values(ascending=False)

# Keep only the 10 most frequent incident types
df_delitos = df_delitos[df_delitos['incidente_c4'].isin(incidentes_c4_frecuencia.index[:10])]
# Drop the cameras whose id starts with 'MC'.
# na=False makes a missing id count as "does not start with 'MC'", so the mask is
# always boolean and the negation cannot fail on NaN values.
df_delitos = df_delitos[~df_delitos['id_camara'].str.startswith('MC', na=False)]
# Keep only the ROMA sector; copy so later column assignments act on an independent frame
df_delitos = df_delitos[df_delitos['sector_inicio']=='ROMA'].copy()

# Build the list of columns used later as grouping keys. Nothing is removed from
# df_delitos here: latitud, longitud, folio, incidente_c4, colonia and sector_inicio
# are only left out of the key list because they conflict with grouping by date and hour.
columns = list(df_delitos.columns)
columns = [x for x in columns if x not in ['latitud', 'longitud', 'folio', 'incidente_c4', 'colonia', 'sector_inicio']]

### Último Preprocesamiento

In [None]:
# Truncate fecha_creacion to the start of its hour: render each timestamp with the
# minutes and seconds forced to 00, then parse the text back into datetime values
hour_labels = df_delitos['fecha_creacion'].dt.strftime('%Y-%m-%d %H:00:00')
df_delitos['fecha_creacion'] = pd.to_datetime(hour_labels, format='%Y-%m-%d %H:%M:%S')

### Revisión de la tabla a procesar

In [None]:
# Group the incidents by camera. From here on df_delitos is a GroupBy object,
# iterated as (id_camara, frame) pairs by the forecasting cell.
# The isinstance guard keeps this cell re-runnable: on a second run df_delitos is
# already a GroupBy, which has no .groupby method.
if isinstance(df_delitos, pd.DataFrame):
    df_delitos = df_delitos.groupby('id_camara')
# On a GroupBy, head() shows the first rows of every group
df_delitos.head()

In [None]:
# Statistical analysis using auto ARIMA.
# For every camera, one model is fitted per (day of week, hour) slot on the history
# of that slot, and the slot is forecast for the test period. One pair of CSV files
# (rounded and raw) is written per day of the week.

# Sentinels for the minimum / maximum error; nothing in this cell reads them
min_value = math.inf
max_value = -math.inf
dayofweek = {0: 'lunes', 1: 'martes', 2: 'miércoles', 3: 'jueves', 4: 'viernes', 5: 'sábado', 6: 'domingo'}

# Forecasts strictly below this threshold are set to 0 (and then dropped from the
# rounded output); everything else is rounded up to the next integer
ROUND_UP_THRESHOLD = 0.10
# Output layout: STV is the camera id, hora the 'h:00-h:59' slot, dia the Spanish day name
RESULT_COLUMNS = ['STV', 'hora', 'dia', 'cantidad']

# Start timer to measure the time of execution
start = time.time()

# Accumulated seconds / number of completed units, used for the running means in the progress line
meantime_camara, meancount_camara = 0, 0
meantime_hour, meancount_hour = 0, 0
meantime_day, meancount_day = 0, 0

# --- Per-camera preparation -------------------------------------------------
# The hourly count series and its train/test split depend only on the camera,
# not on the day or hour being forecast, so they are built once up front.
hourly_index = pd.date_range(start='2022-01-01', end='2023-02-01', freq='H')
splits_by_camara = {}
for id_camara, df in df_delitos:
    # Count the incidents per combination of the grouping columns
    df_delitos_count = df.groupby(columns).size().reset_index(name='count')
    # Expand to one row per hour between 2022-01-01 and 2023-02-01, keeping the
    # existing data and filling the hours without incidents with 0
    df_delitos_count = (
        df_delitos_count.set_index('fecha_creacion')
        .reindex(hourly_index)
        .reset_index()
        .rename(columns={'index': 'fecha_creacion'})
        .fillna(0)
    )
    fecha = df_delitos_count['fecha_creacion']
    iso_week = fecha.dt.isocalendar().week
    # Train: all of 2022 plus any later timestamp whose ISO week is >= 52.
    # Test: timestamps in 2023 that fall in ISO week 1.
    # Element-wise | and & are required here; Python's `and`/`or` are ambiguous on Series.
    train = df_delitos_count[(fecha.dt.year == 2022) | (iso_week >= 52)]
    test = df_delitos_count[(fecha.dt.year == 2023) & (iso_week == 1)]
    splits_by_camara[id_camara] = (train, test)
total_camaras = len(splits_by_camara)

# --- Forecast every (day, hour, camera) combination -------------------------
# Days and hours are visited in descending order (domingo -> lunes, 23 -> 0)
for day in range(6, -1, -1):
    # Day name in Spanish, used in the output rows and file names
    day_name = dayofweek[day]
    # One record per forecast value; turned into a DataFrame once per day
    # instead of growing a frame with pd.concat on every iteration
    day_records = []
    # `or 1` avoids dividing by zero before the first unit has completed
    timeday_log = f'<{day_name} [{seconds_to_time(meantime_day/(meancount_day or 1))} per day]'
    start_day = time.time()
    for hour in range(23, -1, -1):
        timehour_log = f'<{hour}:00 [{seconds_to_time(meantime_hour/(meancount_hour or 1))} per hour]'
        start_hour = time.time()
        for i, (id_camara, (train, test)) in enumerate(splits_by_camara.items(), start=1):
            timecamara_log = f'<{id_camara} ({i}/{total_camaras}) [{seconds_to_time(meantime_camara/(meancount_camara or 1))} per camera]'
            print(f'\r [{seconds_to_time(time.time() - start)} total by now] {timecamara_log} {timehour_log} {timeday_log}', end='\r')
            start_camara = time.time()
            # Keep only the rows of the current hour and day of the week
            train_hour = train[(train.fecha_creacion.dt.hour == hour) & (train.fecha_creacion.dt.dayofweek == day)]
            test_hour = test[(test.fecha_creacion.dt.hour == hour) & (test.fecha_creacion.dt.dayofweek == day)]
            # Nothing to fit or nothing to forecast for this slot: skip it
            # (skipped cameras are not counted in the per-camera mean)
            if len(train_hour) == 0 or len(test_hour) == 0:
                continue
            # Fit on the history of this (day, hour) slot only. Fitting on the whole
            # `train` series would yield the same forecast for every slot of a camera.
            # NOTE(review): stationary=True together with d=1 looks contradictory,
            # and m=0 carried a "daily data" remark; confirm the intended settings.
            model = auto_arima(train_hour['count'].to_numpy(), stationary=True,
                               test='adf', start_p=1, d=1, start_q=0,
                               max_p=7, max_d=7, max_q=7,
                               m=0,
                               seasonal=False,
                               seasonal_test='ch', start_P=0, D=0, start_Q=0,
                               trace=False, stepwise=True,
                               suppress_warnings=True, error_action='ignore',
                               random_state=0)
            # Forecast as many periods as there are test rows for this slot
            forecast = model.predict(n_periods=len(test_hour))
            hora = f'{hour}:00-{hour}:59'
            day_records.extend(
                {'STV': id_camara, 'hora': hora, 'dia': day_name, 'cantidad': float(value)}
                for value in np.asarray(forecast)
            )
            # Only completed fits contribute to the per-camera mean
            meantime_camara += time.time() - start_camara
            meancount_camara += 1
        meantime_hour += time.time() - start_hour
        meancount_hour += 1

    # Raw forecasts, exactly as produced by the models; the explicit column list
    # keeps the layout valid even when no forecast was produced for the day
    df_delitos_count_day_raw = pd.DataFrame(day_records, columns=RESULT_COLUMNS)
    # Rounded copy: values below the threshold (including negative forecasts)
    # become 0, everything else is rounded up
    df_delitos_count_day = df_delitos_count_day_raw.copy()
    df_delitos_count_day['cantidad'] = df_delitos_count_day['cantidad'].apply(
        lambda x: 0 if x < ROUND_UP_THRESHOLD else math.ceil(x)
    )
    # Drop the rows with cantidad = 0
    df_delitos_count_day = df_delitos_count_day[df_delitos_count_day['cantidad'] != 0]
    # Order the rows by dia, hora and STV
    df_delitos_count_day = df_delitos_count_day.sort_values(by=['dia', 'hora', 'STV'])
    df_delitos_count_day_raw = df_delitos_count_day_raw.sort_values(by=['dia', 'hora', 'STV'])
    # Save the results from the forecast
    df_delitos_count_day.to_csv(f'{PATH_DATA}consigas_{day_name}.csv', index=False)
    df_delitos_count_day_raw.to_csv(f'{PATH_DATA}consigas_{day_name}_raw.csv', index=False)
    meantime_day += time.time() - start_day
    meancount_day += 1

# End time for the whole script
end = time.time()
print(f'\rDone! [{seconds_to_time(round((end - start), 2))}s total]')