In [53]:
import pandas as pd

## Import data

In [54]:
# Resource page the data comes from. Kept as a provenance note: none of the
# cells visible in this notebook read this variable.
dataset = "https://data.buenosaires.gob.ar/dataset/delitos/resource/dbec0c29-1ada-40df-b13c-75cf3013ca42"

In [55]:
# Relative path to the local CSV file that is loaded below.
path = "../data/delitos_2023.csv"

In [56]:
# Load the CSV; fields in this file are separated by semicolons, not commas.
df = pd.read_csv(path, sep=";")

In [57]:
# Size of the freshly loaded frame as (rows, columns).
df.shape

(157461, 15)

## Neighborhoods with most theft and robbery claims

In [58]:
# Boolean mask: True for rows whose "tipo" is either 'Hurto' or 'Robo'.
# Series.isin is the vectorized membership test and replaces the per-row lambda.
theft_robbery_bool = df["tipo"].isin(['Hurto', 'Robo'])

In [59]:
# Keep only the rows selected by the mask. This rebinds `df`, so the unfiltered
# frame is no longer available unless the CSV is read again.
df = df[theft_robbery_bool]

In [60]:
# Number of rows per neighborhood, most frequent first. This counts rows; it
# does not sum the `cantidad` column.
df["barrio"].value_counts()

barrio
PALERMO              11942
BALVANERA             8203
FLORES                6634
RECOLETA              6392
CABALLITO             5372
SAN NICOLAS           5134
ALMAGRO               5024
VILLA LUGANO          4430
BELGRANO              4227
BARRACAS              4212
VILLA CRESPO          3734
RETIRO                3493
CONSTITUCION          3385
MONSERRAT             3129
MATADEROS             3080
PARQUE CHACABUCO      2796
NUEVA POMPEYA         2721
VILLA DEVOTO          2650
VILLA SOLDATI         2518
VILLA URQUIZA         2451
LINIERS               2429
SAN TELMO             2117
PARQUE AVELLANEDA     2058
NUNEZ                 1993
PARQUE PATRICIOS      1945
BOCA                  1928
SAN CRISTOBAL         1867
CHACARITA             1777
SAAVEDRA              1708
BOEDO                 1669
COLEGIALES            1623
VILLA DEL PARQUE      1617
FLORESTA              1466
VILLA PUEYRREDON      1143
VILLA LURO            1120
VILLA SANTA RITA      1057
VELEZ SARSFIELD      

## Data filtering

I will filter the data to:
* Velez Sarsfield neighborhood (this is my current neighborhood)
* Theft and Robbery claims (already applied in the previous section)

I live in the "Velez Sarsfield" neighborhood.

In [43]:
# Boolean mask marking the rows that belong to the VELEZ SARSFIELD neighborhood.
velez_bool = df["barrio"].eq("VELEZ SARSFIELD")

In [44]:
# Narrow the frame to the rows selected by the neighborhood mask. `df` is
# rebound again, so from here on it holds only this neighborhood's rows.
df = df[velez_bool]

In [45]:
# (rows, columns) remaining after the neighborhood filter.
df.shape

(1052, 15)

## Removing missing values

The neighborhood does not seem to have missing values.

In [46]:
# Count of missing (NaN) entries in each column.
df.isna().sum()

id-sum      0
anio        0
mes         0
dia         0
fecha       0
franja      0
tipo        0
subtipo     0
uso_arma    0
uso_moto    0
barrio      0
comuna      0
latitud     0
longitud    0
cantidad    0
dtype: int64

## Data Formatting

In [47]:
# Convert the "fecha" strings to datetime values. `assign` returns a new frame
# rather than writing a column into `df`, which at this point is the result of
# boolean filtering; assigning into such a frame is the pattern pandas flags
# with SettingWithCopyWarning.
# NOTE(review): the saved output shows a warning raised on this line, but its
# message is cut off. If it was a date-parsing warning instead, pass an explicit
# format= to pd.to_datetime -- confirm against the CSV contents.
df = df.assign(fecha=pd.to_datetime(df["fecha"]))

  df["fecha"] = pd.to_datetime(df["fecha"])


In [48]:
# Column dtypes after the conversion above; "fecha" should now be a datetime column.
df.dtypes

id-sum               int64
anio                 int64
mes                 object
dia                 object
fecha       datetime64[ns]
franja               int64
tipo                object
subtipo             object
uso_arma            object
uso_moto            object
barrio              object
comuna              object
latitud             object
longitud            object
cantidad             int64
dtype: object

## Import-ready data

Exporting the dataset so that it can be uploaded to the MySQL database.

In [49]:
# Rename the "id-sum" column to "claim_id"; all other columns keep their names.
df = df.rename({"id-sum": "claim_id"}, axis="columns")

In [50]:
# Write the filtered frame to rawdata.csv in the current working directory,
# leaving the DataFrame index out of the file.
df.to_csv("rawdata.csv", index=False)

In [51]:
# (rows, columns) of the frame that was just exported.
df.shape

(1052, 15)

In [52]:
# Number of rows per crime subtype in the filtered data, most frequent first.
df["subtipo"].value_counts()

subtipo
Hurto total        527
Robo total         451
Hurto automotor     64
Robo automotor      10
Name: count, dtype: int64