In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder

In [8]:
crime_df = pd.read_csv("Crime_Data_from_2020_to_Present.csv")

In [9]:
top_30_crimes = crime_df['Crm Cd Desc'].value_counts().head(30).index

In [15]:
crime_df_top_30 = crime_df[crime_df['Crm Cd Desc'].isin(top_30_crimes)]

In [16]:


crime_df_f = crime_df_top_30[crime_df_top_30['Vict Sex'] == 'F']  # Female
crime_df_x = crime_df_top_30[crime_df_top_30['Vict Sex'] == 'X']  # Other/No binary


In [25]:
crime_df_m = crime_df_top_30[crime_df_top_30['Vict Sex'] == 'M']  # Masculin-{}
# Para el conjunto de datos masculino
print("Preparación y modelo para predecir el tipo de delito en víctimas masculinas")

# Eliminar filas con valores nulos en columnas clave
crime_df_m = crime_df_m.dropna(subset=['Vict Age', 'Crm Cd Desc', 'Weapon Desc', 'Premis Desc', 'DATE OCC', 'TIME OCC', 'Vict Sex'])

# ---- 1. Transformar las columnas de fechas ----
# Convertir DATE OCC a formato de fecha
crime_df_m['DATE OCC'] = pd.to_datetime(crime_df_m['DATE OCC'], errors='coerce')

# Extraer características de la fecha
crime_df_m['Month OCC'] = crime_df_m['DATE OCC'].dt.month
crime_df_m['Day OCC'] = crime_df_m['DATE OCC'].dt.day
crime_df_m['Day of Week OCC'] = crime_df_m['DATE OCC'].dt.dayofweek  # Lunes=0, Domingo=6

# Convertir la hora en cuatro categorías: madrugada, mañana, tarde, noche
crime_df_m['TIME OCC'] = crime_df_m['TIME OCC'].apply(lambda x: 'madrugada' if 0 <= x < 600 else 'mañana' if 600 <= x < 1200 else 'tarde' if 1200 <= x < 1800 else 'noche')

# ---- 2. Seleccionar características relevantes ----
categorical_columns_m = ['Vict Sex', 'Vict Descent', 'Premis Desc', 'Weapon Desc', 'AREA NAME', 'TIME OCC']  # Columnas categóricas
numerical_columns_m = ['Vict Age', 'LAT', 'LON', 'Month OCC', 'Day OCC', 'Day of Week OCC']  # Columnas numéricas

# ---- 3. Codificar características categóricas ----
# Cambia 'sparse' por 'sparse_output'
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_m_categorical = encoder.fit_transform(crime_df_m[categorical_columns_m])

# Convertir las columnas codificadas a un DataFrame
X_m_categorical_df = pd.DataFrame(X_m_categorical, columns=encoder.get_feature_names_out(categorical_columns_m))

# Combinar las columnas categóricas codificadas con las numéricas
X_m = pd.concat([crime_df_m[numerical_columns_m].reset_index(drop=True), X_m_categorical_df.reset_index(drop=True)], axis=1)

# Variable objetivo (lo que estamos intentando predecir: el tipo de delito)
y_m = crime_df_m['Crm Cd Desc']

# ---- 4. Dividir los datos en conjunto de entrenamiento y prueba ----
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_m, y_m, test_size=0.2, random_state=42)

# ---- 5. Crear y entrenar el modelo de Random Forest ----
rf_model_m = RandomForestClassifier(random_state=42)
rf_model_m.fit(X_train_m, y_train_m)

# ---- 6. Hacer predicciones y evaluar el modelo ----
y_pred_m = rf_model_m.predict(X_test_m)
accuracy_m = accuracy_score(y_test_m, y_pred_m)
print(f"Precisión del modelo para víctimas masculinas: {accuracy_m:.2f}")
print(classification_report(y_test_m, y_pred_m))
print(classification_report(y_test_m, y_pred_m))

Preparación y modelo para predecir el tipo de delito en víctimas masculinas


  crime_df_m['DATE OCC'] = pd.to_datetime(crime_df_m['DATE OCC'], errors='coerce')


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'