In [1]:
import pandas as pd
from ydata_profiling import ProfileReport

In [2]:
# Load the cleaned crime-incidence CSV. low_memory=False makes pandas infer
# each column's dtype from the whole file instead of chunk by chunk.
df = pd.read_csv(
    '../data/incidencia_delictiva_limpio.csv',
    low_memory=False,
)

In [3]:
# Remove the columns that the rest of the notebook does not use
# ('Unnamed: 0' is presumably the index saved with the CSV - TODO confirm).
# errors='ignore' keeps this cell re-runnable: because it reassigns `df`, a
# second execution would otherwise raise KeyError for the already-dropped columns.
df = df.drop(columns=['Unnamed: 0', 'municipio_hecho', 'mes'], errors='ignore')

In [4]:
# Keep only the rows whose `anio_inicio` is 2023.
df = df.loc[df['anio_inicio'].eq(2023)]

In [5]:
# Order the rows by `fecha_hora`, ascending (the default direction).
df = df.sort_values('fecha_hora')

In [6]:
#profile = ProfileReport(df, title='Incidencia Delictiva Report')

In [7]:
#profile.to_file("incidencia_delictiva.html")

In [8]:
#df.head(20)

In [89]:
# Summarize the filtered frame: column names, non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 228245 entries, 639041 to 867294
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   anio_inicio       228245 non-null  int64  
 1   mes_inicio        228245 non-null  object 
 2   delito            228245 non-null  object 
 3   categoria_delito  228245 non-null  object 
 4   colonia_hecho     228245 non-null  object 
 5   alcaldia_hecho    228245 non-null  object 
 6   latitud           228245 non-null  float64
 7   longitud          228245 non-null  float64
 8   fecha_hora        228245 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 17.4+ MB


In [10]:
# Target labels: an independent copy of the `delito` column.
delitos = df.loc[:, 'delito'].copy()

In [11]:
# Feature frame: every column except the target `delito`.
df_data = df.drop(columns=['delito'])

In [12]:
# Compute the row counts for each split: 60% for training, with the remaining
# 40% shared equally between test and validation.
training_size = 0.60
test_size = (1 - training_size) / 2
original_count = len(df)

training_count = int(original_count * training_size)
test_count = int(original_count * test_size)
# Validation receives whatever is left, so the three counts always sum to the total.
validation_count = original_count - (training_count + test_count)

print(training_count, test_count, validation_count, original_count)

136947 45649 45649 228245


In [13]:
from sklearn.model_selection import train_test_split

# Take `training_count` rows for training; `rest_x` / `rest_y` hold the remainder.
# The fixed seed makes the shuffle repeatable.
train_x, rest_x, train_y, rest_y = train_test_split(
    df_data,
    delitos,
    train_size=training_count,
    random_state=42,
)

In [14]:
# Divide the remaining rows: `test_count` go to the test split and everything
# left over becomes the validation split.
test_x, validate_x, test_y, validate_y = train_test_split(
    rest_x,
    rest_y,
    train_size=test_count,
    random_state=42,
)

In [15]:
# Print the number of rows in the train, test and validation splits.
print(*map(len, (train_x, test_x, validate_x)))

136947 45649 45649


In [16]:
#One hot encoding
from sklearn.preprocessing import OneHotEncoder

In [17]:
# Stand-alone encoder that returns a dense array; categories not seen during
# fit are encoded as all zeros instead of raising an error.
one_hot_encoder = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
)

In [18]:
# Display the training feature frame (the target column is not part of it).
train_x

Unnamed: 0,anio_inicio,mes_inicio,categoria_delito,colonia_hecho,alcaldia_hecho,latitud,longitud,fecha_hora
846194,2023,Noviembre,DELITO DE BAJO IMPACTO,PEÑÓN DE LOS BAÑOS,VENUSTIANO CARRANZA,19.436740,-99.082682,2023-11-23 17:29:39
818035,2023,Octubre,DELITO DE BAJO IMPACTO,SAN LUCAS XOCHIMANCA,XOCHIMILCO,19.240923,-99.116549,2023-10-08 16:32:16
656513,2023,Enero,DELITO DE BAJO IMPACTO,NARVARTE,BENITO JUAREZ,19.384355,-99.148849,2023-01-30 23:50:41
736564,2023,Junio,DELITO DE BAJO IMPACTO,LAS AMÉRICAS,IZTAPALAPA,19.360190,-99.062929,2023-06-02 08:06:07
856183,2023,Diciembre,DELITO DE BAJO IMPACTO,SANTA CRUZ MEYEHUALCO,IZTAPALAPA,19.351425,-99.045655,2023-12-10 11:19:27
...,...,...,...,...,...,...,...,...
758917,2023,Julio,ROBO A TRANSEUNTE EN VÍA PÚBLICA CON Y SIN VIO...,LOS ÁNGELES,IZTAPALAPA,19.341261,-99.064214,2023-07-05 21:58:29
742735,2023,Junio,DELITO DE BAJO IMPACTO,ROMERO RUBIO,VENUSTIANO CARRANZA,19.439793,-99.098049,2023-06-11 16:58:40
770968,2023,Julio,DELITO DE BAJO IMPACTO,TLAPECHICO,ALVARO OBREGON,19.376745,-99.245238,2023-07-25 00:45:03
785911,2023,Agosto,ROBO DE VEHÍCULO CON Y SIN VIOLENCIA,BONDOJITO,GUSTAVO A. MADERO,19.462401,-99.108640,2023-08-17 19:00:10


In [19]:
# Try the encoder on the three categorical columns: learn the categories from
# the training split, then show the resulting indicator matrix.
categorical_sample = train_x[['categoria_delito', 'colonia_hecho', 'alcaldia_hecho']]
one_hot_encoder.fit(categorical_sample).transform(categorical_sample)

array([[1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [20]:
#Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline

In [21]:
# One-hot encode the categorical columns; all other columns are dropped by
# this transformer (ColumnTransformer's default remainder).
# NOTE(review): `categoria_delito` looks like a coarser grouping of the target
# `delito` - confirm it is legitimately known at prediction time, otherwise it
# leaks label information into the features.
one_hot_encoding = ColumnTransformer(
    transformers=[
        (
            'one_hot_encode',
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            ["categoria_delito", "colonia_hecho", "alcaldia_hecho"],
        ),
    ],
)

In [22]:
# Forward the coordinate columns unchanged; every other column is dropped by
# this transformer.
passthrough = ColumnTransformer(
    transformers=[
        ("passthrough", "passthrough", ["latitud", "longitud"]),
    ],
)

In [23]:
# Concatenate both feature groups side by side: the one-hot columns first,
# followed by the untouched latitude/longitude columns.
features_union = FeatureUnion(
    transformer_list=[
        ("categorical", one_hot_encoding),
        ("pass", passthrough),
    ],
)

In [24]:
# Single-step pipeline wrapping the combined feature transformers.
feature_engineering_pipeline = Pipeline(
    steps=[
        ("features", features_union),
    ],
)

In [25]:
# Fit the feature pipeline on the training split only. As the cell's last
# expression, the fitted pipeline is also displayed.
feature_engineering_pipeline.fit(train_x)

In [26]:
# Encode the training features with the pipeline fitted in the previous step.
transformed_x = feature_engineering_pipeline.transform(train_x)

In [27]:
# Compare the shapes before and after encoding to see how many columns the
# one-hot expansion produces.
train_x.shape, transformed_x.shape

((136947, 8), (136947, 1627))

In [28]:
# Display the encoded training matrix; latitude and longitude are its last two columns.
transformed_x

array([[  1.        ,   0.        ,   0.        , ...,   0.        ,
         19.43674036, -99.08268236],
       [  1.        ,   0.        ,   0.        , ...,   1.        ,
         19.24092306, -99.11654949],
       [  1.        ,   0.        ,   0.        , ...,   0.        ,
         19.38435522, -99.14884865],
       ...,
       [  1.        ,   0.        ,   0.        , ...,   0.        ,
         19.3767452 , -99.24523838],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
         19.46240072, -99.1086401 ],
       [  1.        ,   0.        ,   0.        , ...,   0.        ,
         19.48418   , -99.20581341]])

In [29]:
#Model
from sklearn.base import clone

# Make a new, unfitted copy of the pipeline with the same parameters, so the
# model's transformer is fitted independently of the exploratory one.
feature_transformer = clone(feature_engineering_pipeline)



In [30]:
# Learn the encoding from the training split and encode that split in one step.
features_train_x = feature_transformer.fit_transform(train_x)


In [31]:
# Encode the validation split with the transformer fitted on the training data
# (transform only, no refit); the encoder's handle_unknown="ignore" turns
# categories unseen in training into all-zero indicator columns.
features_validate_x = feature_transformer.transform(validate_x)

In [32]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the fitted model, and every metric computed from it, is
# reproducible across kernel restarts. 42 matches the seed already used for
# the train/test/validation splits in this notebook.
model = RandomForestClassifier(random_state=42)

# Train on the encoded training features; as the cell's last expression, the
# fitted estimator is also displayed.
model.fit(features_train_x, train_y)

In [33]:
# Display the encoded training matrix that was passed to the model.
features_train_x


array([[  1.        ,   0.        ,   0.        , ...,   0.        ,
         19.43674036, -99.08268236],
       [  1.        ,   0.        ,   0.        , ...,   1.        ,
         19.24092306, -99.11654949],
       [  1.        ,   0.        ,   0.        , ...,   0.        ,
         19.38435522, -99.14884865],
       ...,
       [  1.        ,   0.        ,   0.        , ...,   0.        ,
         19.3767452 , -99.24523838],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
         19.46240072, -99.1086401 ],
       [  1.        ,   0.        ,   0.        , ...,   0.        ,
         19.48418   , -99.20581341]])

In [34]:
# Display the training labels (the `delito` values, one per training row).
train_y

846194                                   ROBO DE DOCUMENTOS
818035                  DAÑO EN PROPIEDAD AJENA INTENCIONAL
656513                                      ROBO DE OBJETOS
736564                          ROBO DE VEHICULO DE PEDALES
856183    NARCOMENUDEO POSESIÓN CON FINES DE VENTA, COME...
                                ...                        
758917       ROBO A TRANSEUNTE EN VIA PUBLICA CON VIOLENCIA
742735                RESPONSABILIDAD PROFESIONAL Y TECNICA
770968                           ROBO DE ACCESORIOS DE AUTO
785911    ROBO DE VEHICULO DE SERVICIO PARTICULAR SIN VI...
760996                    LESIONES INTENCIONALES POR GOLPES
Name: delito, Length: 136947, dtype: object

In [35]:
from sklearn.metrics import accuracy_score, recall_score

# Predict a `delito` label for every row of the encoded validation split.
pred_y = model.predict(features_validate_x)

In [87]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
# Validation metrics for the multiclass `delito` prediction.
# The per-class report and the recall are not printed here. To enable them,
# note that classification_report() accepts no `average` argument, and that
# recall_score() defaults to average='binary', which raises on a multiclass
# target - pass e.g. average='macro' instead.

# Share of validation rows whose predicted label matches exactly.
print(accuracy_score(y_true=validate_y, y_pred=pred_y))

# Macro F1: unweighted mean of the per-class F1 scores, so rare classes count
# as much as frequent ones.
f1 = f1_score(y_true=validate_y, y_pred=pred_y, average='macro')
print(f1)

0.2361278450787531
0.09734475245501525
