In [1]:
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import clone


In [2]:
df = pd.read_csv('../data/incidencia_delictiva_limpio.csv', low_memory=False)

In [3]:
df = df.drop(['Unnamed: 0', 'municipio_hecho', 'mes'], axis=1)

In [4]:
df = df[df['anio_inicio'] == 2023]

In [5]:
df = df.sort_values(by=['fecha_hora'], ascending=True)

In [6]:
#profile = ProfileReport(df, title='Incidencia Delictiva Report')

In [7]:
#profile.to_file("incidencia_delictiva.html")

In [8]:
#df.head(20)

In [9]:
df.head(10)

Unnamed: 0,anio_inicio,mes_inicio,delito,categoria_delito,colonia_hecho,alcaldia_hecho,latitud,longitud,fecha_hora
639041,2023,Enero,LESIONES CULPOSAS POR TRANSITO VEHICULAR EN CO...,DELITO DE BAJO IMPACTO,INSURGENTES CUICUILCO,COYOACAN,19.305532,-99.186308,2023-01-01 00:09:45
639042,2023,Enero,ROBO A TRANSEUNTE EN VIA PUBLICA CON VIOLENCIA,ROBO A TRANSEUNTE EN VÍA PÚBLICA CON Y SIN VIO...,MOCTEZUMA 2A SECCIÓN,VENUSTIANO CARRANZA,19.429797,-99.098672,2023-01-01 00:41:26
675406,2023,Enero,USURPACIÓN DE IDENTIDAD,DELITO DE BAJO IMPACTO,AMPLIACIÓN GABRIEL RAMOS MILLÁN,IZTACALCO,19.390206,-99.097801,2023-01-01 00:44:29
639043,2023,Enero,ROBO DE VEHICULO DE SERVICIO PARTICULAR CON VI...,ROBO DE VEHÍCULO CON Y SIN VIOLENCIA,SAN JUAN DE ARAGÓN III SECCIÓN,GUSTAVO A. MADERO,19.451641,-99.076529,2023-01-01 00:44:43
639044,2023,Enero,LESIONES CULPOSAS POR TRANSITO VEHICULAR EN CO...,DELITO DE BAJO IMPACTO,XOTEPINGO,COYOACAN,19.323245,-99.138016,2023-01-01 00:47:23
675407,2023,Enero,ROBO DE VEHICULO DE SERVICIO PARTICULAR SIN VI...,ROBO DE VEHÍCULO CON Y SIN VIOLENCIA,CULHUACÁN CTM SECCIÓN VII,COYOACAN,19.320184,-99.105769,2023-01-01 00:55:25
639045,2023,Enero,ROBO DE OBJETOS,DELITO DE BAJO IMPACTO,CENTRAL DE ABASTO,IZTAPALAPA,19.377017,-99.091991,2023-01-01 01:04:29
639046,2023,Enero,DAÑO EN PROPIEDAD AJENA CULPOSA POR TRÁNSITO V...,DELITO DE BAJO IMPACTO,PUEBLO CULHUACÁN,IZTAPALAPA,19.338649,-99.108793,2023-01-01 01:33:47
639047,2023,Enero,VIOLENCIA FAMILIAR,DELITO DE BAJO IMPACTO,PENSIL SUR,MIGUEL HIDALGO,19.44576,-99.195791,2023-01-01 01:43:31
639048,2023,Enero,PERSONAS EXTRAVIADAS Y AUSENTES,DELITO DE BAJO IMPACTO,SANTA MARIA AZTAHUACAN,IZTAPALAPA,19.347793,-99.01913,2023-01-01 01:48:08


In [10]:
#Transformamos la fecha a datetime
df['fecha_hora'] = pd.to_datetime(df['fecha_hora'])

In [11]:
#Sacamos la hora en una nueva columna
df['hora'] = df['fecha_hora'].dt.hour

In [12]:
df['dia'] = df['fecha_hora'].dt.dayofweek

In [13]:
delitos = df['categoria_delito'].copy()
df_data = df.drop(['categoria_delito'], axis=1)

In [14]:
original_count = len(df)
training_size = 0.50
test_size = (1 - training_size) / 2

training_count = int(original_count * training_size)
test_count = int(original_count * test_size)
validation_count = original_count - training_count - test_count

print(training_count, test_count, validation_count, original_count)

114122 57061 57062 228245


In [15]:
#División de entrenamiento y prueba
#X = df[['hora', 'delito_encoded', 'alcaldia_encoded', 'colonia_encoded', 'dia']] 
#y = df['categoria_delito_encoded']

train_x, rest_x, train_y, rest_y = train_test_split(df_data, delitos, train_size=training_count, random_state=42)

In [16]:
train_x

Unnamed: 0,anio_inicio,mes_inicio,delito,colonia_hecho,alcaldia_hecho,latitud,longitud,fecha_hora,hora,dia
668572,2023,Febrero,DAÑO EN PROPIEDAD AJENA INTENCIONAL A AUTOMOVIL,EL PARAÍSO,IZTAPALAPA,19.375379,-99.029839,2023-02-18 20:37:44,20,5
801691,2023,Septiembre,FRAUDE,ARGENTINA PONIENTE,MIGUEL HIDALGO,19.460518,-99.213789,2023-09-12 17:20:50,17,1
837364,2023,Noviembre,FRAUDE,SAN JOSÉ DE LOS CEDROS,CUAJIMALPA DE MORELOS,19.364581,-99.292062,2023-11-08 23:52:13,23,2
647906,2023,Enero,USO DE DOCUMENTO FALSO,CENTRO,CUAUHTEMOC,19.427127,-99.149778,2023-01-17 12:21:20,12,1
684620,2023,Marzo,DAÑO EN PROPIEDAD AJENA CULPOSA POR TRÁNSITO V...,SAN ANDRÉS TOMATLÁN,IZTAPALAPA,19.327926,-99.104131,2023-03-14 16:16:23,16,1
...,...,...,...,...,...,...,...,...,...,...
758917,2023,Julio,ROBO A TRANSEUNTE EN VIA PUBLICA CON VIOLENCIA,LOS ÁNGELES,IZTAPALAPA,19.341261,-99.064214,2023-07-05 21:58:29,21,2
742735,2023,Junio,RESPONSABILIDAD PROFESIONAL Y TECNICA,ROMERO RUBIO,VENUSTIANO CARRANZA,19.439793,-99.098049,2023-06-11 16:58:40,16,6
770968,2023,Julio,ROBO DE ACCESORIOS DE AUTO,TLAPECHICO,ALVARO OBREGON,19.376745,-99.245238,2023-07-25 00:45:03,0,1
785911,2023,Agosto,ROBO DE VEHICULO DE SERVICIO PARTICULAR SIN VI...,BONDOJITO,GUSTAVO A. MADERO,19.462401,-99.108640,2023-08-17 19:00:10,19,3


In [17]:
#Test y validación
test_x, val_x, test_y, val_y = train_test_split(rest_x, rest_y, train_size=test_count, random_state=42)

In [18]:
test_x

Unnamed: 0,anio_inicio,mes_inicio,delito,colonia_hecho,alcaldia_hecho,latitud,longitud,fecha_hora,hora,dia
664618,2023,Febrero,ROBO A TRANSEUNTE EN VIA PUBLICA CON VIOLENCIA,PORTALES ORIENTE,BENITO JUAREZ,19.362714,-99.140121,2023-02-13 10:52:09,10,0
719447,2023,Mayo,VIOLENCIA FAMILIAR,SAN NICOLÁS TOTOLAPAN,LA MAGDALENA CONTRERAS,19.297701,-99.233595,2023-05-06 14:30:29,14,5
683846,2023,Marzo,USURPACIÓN DE IDENTIDAD,JUÁREZ,CUAUHTEMOC,19.424399,-99.167906,2023-03-13 16:59:38,16,0
826693,2023,Octubre,PERDIDA DE LA VIDA POR OTRAS CAUSAS,AGRÍCOLA ORIENTAL,IZTACALCO,19.405071,-99.079104,2023-10-21 22:08:17,22,5
716506,2023,Mayo,FRAUDE,MIGUEL HIDALGO 2A SECCIÓN,TLALPAN,19.280734,-99.192541,2023-05-02 14:06:52,14,1
...,...,...,...,...,...,...,...,...,...,...
858808,2023,Diciembre,DESOBEDIENCIA Y RESISTENCIA DE PARTICULARES,JOSÉ LÓPEZ PORTILLO,IZTAPALAPA,19.307779,-99.080707,2023-12-14 21:32:33,21,3
693186,2023,Marzo,ABUSO DE CONFIANZA,PEÑÓN DE LOS BAÑOS,VENUSTIANO CARRANZA,19.442820,-99.076737,2023-03-27 17:02:54,17,0
692310,2023,Marzo,ROBO DE OBJETOS,AMPLIACIÓN ASTURIAS,CUAUHTEMOC,19.407732,-99.133712,2023-03-26 07:41:03,7,6
713605,2023,Abril,ROBO DE ACCESORIOS DE AUTO,BELLAVISTA,IZTAPALAPA,19.330234,-99.074931,2023-04-27 15:25:33,15,3


In [19]:
#Armar pipeline de transformación

one_hot_encoding = ColumnTransformer([
    (
        'one_hot_encode',
        OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        [
            'delito',
            'alcaldia_hecho',
            'colonia_hecho'
        ]
    )
])

In [20]:
passthrough = ColumnTransformer([
    (
        'passthrough',
        'passthrough',
        [
            'hora',
            'dia'
        ]
    )
])

In [21]:
feature_union = FeatureUnion(
    [
        ('categorical', one_hot_encoding),
        ('pass', passthrough)
    ]
)

In [22]:
feauture_engineering_pipeline = Pipeline(
    [
        (
            'features',
            feature_union
        )
    ]
)

In [23]:
feauture_engineering_pipeline

In [24]:
feauture_engineering_pipeline.fit(train_x)

In [25]:
transformed_x = feauture_engineering_pipeline.transform(train_x)

In [26]:
train_x.shape, transformed_x.shape

((114122, 10), (114122, 1877))

In [27]:
transformed_x

array([[ 0.,  0.,  0., ...,  0., 20.,  5.],
       [ 0.,  0.,  0., ...,  0., 17.,  1.],
       [ 0.,  0.,  0., ...,  0., 23.,  2.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0., 19.,  3.],
       [ 0.,  0.,  0., ...,  0.,  0.,  6.]])

In [28]:
#Model training
feature_transformer = clone(feauture_engineering_pipeline)

features_train_x = feature_transformer.fit_transform(train_x)
features_val_x = feature_transformer.transform(val_x)

In [29]:
model = RandomForestClassifier(n_estimators=100)
model.fit(features_train_x, train_y)

In [30]:
#Validación del modelo
pred_y = model.predict(features_val_x)

In [31]:
print(accuracy_score(val_y, pred_y))
print(recall_score(val_y, pred_y, average='weighted'))
print(precision_score(val_y, pred_y, average='weighted'))

0.9995794048578739
0.9995794048578739
0.99957960568718


In [32]:
#Construcción de pipeline final
final_inference_pipeline = Pipeline([
    ("feature_engineering", clone(feauture_engineering_pipeline)),
    ("model", RandomForestClassifier(n_estimators=100))
])

In [33]:
#final_training_dt = pd.concat([train_x, val_x])
#final_training_response = pd.concat([train_y, val_y])

In [34]:
final_inference_pipeline.fit(train_x, train_y)

In [35]:
test_pred_y = final_inference_pipeline.predict(test_x)

In [36]:
print(accuracy_score(test_pred_y, test_y))
print(recall_score(test_pred_y, test_y, average='macro'))
print(precision_score(test_pred_y, test_y, average='macro'))

0.9994041464397749
0.9999577559986482
0.9744903203651205
