In [3]:
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import clone


In [4]:
df = pd.read_csv('../data/incidencia_delictiva_limpio.csv', low_memory=False)

In [6]:
df = df.drop(['Unnamed: 0', 'municipio_hecho', 'mes'], axis=1)

In [7]:
df = df[df['anio_inicio'] == 2023]

In [11]:
df = df.sort_values(by=['fecha_hora'], ascending=True)

In [6]:
#profile = ProfileReport(df, title='Incidencia Delictiva Report')

In [7]:
#profile.to_file("incidencia_delictiva.html")

In [8]:
#df.head(20)

In [13]:
df.head(10)

Unnamed: 0,anio_inicio,mes_inicio,delito,categoria_delito,colonia_hecho,alcaldia_hecho,latitud,longitud,fecha_hora
639041,2023,Enero,LESIONES CULPOSAS POR TRANSITO VEHICULAR EN CO...,DELITO DE BAJO IMPACTO,INSURGENTES CUICUILCO,COYOACAN,19.305532,-99.186308,2023-01-01 00:09:45
639042,2023,Enero,ROBO A TRANSEUNTE EN VIA PUBLICA CON VIOLENCIA,ROBO A TRANSEUNTE EN VÍA PÚBLICA CON Y SIN VIO...,MOCTEZUMA 2A SECCIÓN,VENUSTIANO CARRANZA,19.429797,-99.098672,2023-01-01 00:41:26
675406,2023,Enero,USURPACIÓN DE IDENTIDAD,DELITO DE BAJO IMPACTO,AMPLIACIÓN GABRIEL RAMOS MILLÁN,IZTACALCO,19.390206,-99.097801,2023-01-01 00:44:29
639043,2023,Enero,ROBO DE VEHICULO DE SERVICIO PARTICULAR CON VI...,ROBO DE VEHÍCULO CON Y SIN VIOLENCIA,SAN JUAN DE ARAGÓN III SECCIÓN,GUSTAVO A. MADERO,19.451641,-99.076529,2023-01-01 00:44:43
639044,2023,Enero,LESIONES CULPOSAS POR TRANSITO VEHICULAR EN CO...,DELITO DE BAJO IMPACTO,XOTEPINGO,COYOACAN,19.323245,-99.138016,2023-01-01 00:47:23
675407,2023,Enero,ROBO DE VEHICULO DE SERVICIO PARTICULAR SIN VI...,ROBO DE VEHÍCULO CON Y SIN VIOLENCIA,CULHUACÁN CTM SECCIÓN VII,COYOACAN,19.320184,-99.105769,2023-01-01 00:55:25
639045,2023,Enero,ROBO DE OBJETOS,DELITO DE BAJO IMPACTO,CENTRAL DE ABASTO,IZTAPALAPA,19.377017,-99.091991,2023-01-01 01:04:29
639046,2023,Enero,DAÑO EN PROPIEDAD AJENA CULPOSA POR TRÁNSITO V...,DELITO DE BAJO IMPACTO,PUEBLO CULHUACÁN,IZTAPALAPA,19.338649,-99.108793,2023-01-01 01:33:47
639047,2023,Enero,VIOLENCIA FAMILIAR,DELITO DE BAJO IMPACTO,PENSIL SUR,MIGUEL HIDALGO,19.44576,-99.195791,2023-01-01 01:43:31
639048,2023,Enero,PERSONAS EXTRAVIADAS Y AUSENTES,DELITO DE BAJO IMPACTO,SANTA MARIA AZTAHUACAN,IZTAPALAPA,19.347793,-99.01913,2023-01-01 01:48:08


In [15]:
#Transformamos la fecha a datetime
df['fecha_hora'] = pd.to_datetime(df['fecha_hora'])

In [17]:
#Sacamos la hora en una nueva columna
df['hora'] = df['fecha_hora'].dt.hour

In [19]:
df['dia'] = df['fecha_hora'].dt.dayofweek

In [21]:
delitos = df['categoria_delito'].copy()
df_data = df.drop(['categoria_delito'], axis=1)

In [23]:
original_count = len(df)
training_size = 0.60
test_size = (1 - training_size) / 2

training_count = int(original_count * training_size)
test_count = int(original_count * test_size)
validation_count = original_count - training_count - test_count

print(training_count, test_count, validation_count, original_count)

136947 45649 45649 228245


In [25]:
#División de entrenamiento y prueba
#X = df[['hora', 'delito_encoded', 'alcaldia_encoded', 'colonia_encoded', 'dia']] 
#y = df['categoria_delito_encoded']

train_x, rest_x, train_y, rest_y = train_test_split(df_data, delitos, train_size=training_count, random_state=42)

In [27]:
train_x

Unnamed: 0,anio_inicio,mes_inicio,delito,colonia_hecho,alcaldia_hecho,latitud,longitud,fecha_hora,hora,dia
846194,2023,Noviembre,ROBO DE DOCUMENTOS,PEÑÓN DE LOS BAÑOS,VENUSTIANO CARRANZA,19.436740,-99.082682,2023-11-23 17:29:39,17,3
818035,2023,Octubre,DAÑO EN PROPIEDAD AJENA INTENCIONAL,SAN LUCAS XOCHIMANCA,XOCHIMILCO,19.240923,-99.116549,2023-10-08 16:32:16,16,6
656513,2023,Enero,ROBO DE OBJETOS,NARVARTE,BENITO JUAREZ,19.384355,-99.148849,2023-01-30 23:50:41,23,0
736564,2023,Junio,ROBO DE VEHICULO DE PEDALES,LAS AMÉRICAS,IZTAPALAPA,19.360190,-99.062929,2023-06-02 08:06:07,8,4
856183,2023,Diciembre,"NARCOMENUDEO POSESIÓN CON FINES DE VENTA, COME...",SANTA CRUZ MEYEHUALCO,IZTAPALAPA,19.351425,-99.045655,2023-12-10 11:19:27,11,6
...,...,...,...,...,...,...,...,...,...,...
758917,2023,Julio,ROBO A TRANSEUNTE EN VIA PUBLICA CON VIOLENCIA,LOS ÁNGELES,IZTAPALAPA,19.341261,-99.064214,2023-07-05 21:58:29,21,2
742735,2023,Junio,RESPONSABILIDAD PROFESIONAL Y TECNICA,ROMERO RUBIO,VENUSTIANO CARRANZA,19.439793,-99.098049,2023-06-11 16:58:40,16,6
770968,2023,Julio,ROBO DE ACCESORIOS DE AUTO,TLAPECHICO,ALVARO OBREGON,19.376745,-99.245238,2023-07-25 00:45:03,0,1
785911,2023,Agosto,ROBO DE VEHICULO DE SERVICIO PARTICULAR SIN VI...,BONDOJITO,GUSTAVO A. MADERO,19.462401,-99.108640,2023-08-17 19:00:10,19,3


In [29]:
#Test y validación
test_x, val_x, test_y, val_y = train_test_split(rest_x, rest_y, train_size=test_count, random_state=42)

In [31]:
test_x

Unnamed: 0,anio_inicio,mes_inicio,delito,colonia_hecho,alcaldia_hecho,latitud,longitud,fecha_hora,hora,dia
811169,2023,Septiembre,DESPOJO,PARAJE ZACATEPEC,IZTAPALAPA,19.357408,-99.018236,2023-09-27 20:31:59,20,2
795258,2023,Septiembre,AMENAZAS,BARRIO XALTOCAN,XOCHIMILCO,19.250331,-99.094552,2023-09-02 11:29:16,11,5
650149,2023,Enero,FRAUDE,SAN FRANCISCO TLALNEPANTLA,XOCHIMILCO,19.200665,-99.121922,2023-01-20 14:55:37,14,4
806495,2023,Septiembre,"NARCOMENUDEO POSESIÓN CON FINES DE VENTA, COME...",CENTRAL DE ABASTO,IZTAPALAPA,19.370520,-99.101578,2023-09-20 15:59:55,15,2
719535,2023,Mayo,VIOLENCIA FAMILIAR,LAS ARBOLEDAS,TLAHUAC,19.308470,-99.056583,2023-05-06 17:49:57,17,5
...,...,...,...,...,...,...,...,...,...,...
725960,2023,Mayo,LESIONES CULPOSAS POR TRANSITO VEHICULAR,CENTRAL DE ABASTO,IZTAPALAPA,19.379434,-99.095101,2023-05-17 01:44:14,1,2
824646,2023,Octubre,FRAUDE,DEL VALLE CENTRO,BENITO JUAREZ,19.382250,-99.169583,2023-10-18 20:36:49,20,2
858808,2023,Diciembre,DESOBEDIENCIA Y RESISTENCIA DE PARTICULARES,JOSÉ LÓPEZ PORTILLO,IZTAPALAPA,19.307779,-99.080707,2023-12-14 21:32:33,21,3
713605,2023,Abril,ROBO DE ACCESORIOS DE AUTO,BELLAVISTA,IZTAPALAPA,19.330234,-99.074931,2023-04-27 15:25:33,15,3


In [33]:
#Armar pipeline de transformación

one_hot_encoding = ColumnTransformer([
    (
        'one_hot_encode',
        OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        [
            'delito',
            'alcaldia_hecho',
            'colonia_hecho'
        ]
    )
])

In [35]:
passthrough = ColumnTransformer([
    (
        'passthrough',
        'passthrough',
        [
            'hora',
            'dia'
        ]
    )
])

In [37]:
feature_union = FeatureUnion(
    [
        ('categorical', one_hot_encoding),
        ('pass', passthrough)
    ]
)

In [39]:
feauture_engineering_pipeline = Pipeline(
    [
        (
            'features',
            feature_union
        )
    ]
)

In [41]:
feauture_engineering_pipeline

In [43]:
feauture_engineering_pipeline.fit(train_x)

In [45]:
transformed_x = feauture_engineering_pipeline.transform(train_x)

In [47]:
train_x.shape, transformed_x.shape

((136947, 10), (136947, 1889))

In [49]:
transformed_x

array([[ 0.,  0.,  0., ...,  0., 17.,  3.],
       [ 0.,  0.,  0., ...,  0., 16.,  6.],
       [ 0.,  0.,  0., ...,  0., 23.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0., 19.,  3.],
       [ 0.,  0.,  0., ...,  0.,  0.,  6.]])

In [51]:
#Model training
feature_transformer = clone(feauture_engineering_pipeline)

features_train_x = feature_transformer.fit_transform(train_x)
features_val_x = feature_transformer.transform(val_x)

In [53]:
model = RandomForestClassifier(n_estimators=200)
model.fit(features_train_x, train_y)

In [55]:
#Validación del modelo
pred_y = model.predict(features_val_x)

In [59]:
print(accuracy_score(val_y, pred_y))
print(recall_score(val_y, pred_y, average='weighted'))
print(precision_score(val_y, pred_y, average='weighted'))

0.9994742491620846
0.9994742491620846
0.9994745628247104


In [63]:
#Construcción de pipeline final
final_inference_pipeline = Pipeline([
    ("feature_engineering", clone(feauture_engineering_pipeline)),
    ("model", RandomForestClassifier(n_estimators=200))
])

In [73]:
final_inference_pipeline.fit(train_x, train_y)

In [75]:
test_pred_y = final_inference_pipeline.predict(test_x)

In [87]:
print(accuracy_score(val_y, pred_y))
print(recall_score(val_y, pred_y, average='macro'))
print(precision_score(val_y, pred_y, average='macro'))

0.9994742491620846
0.9977586685737467
0.9999627125385304
