# Data delinquance

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import matplotlib.pyplot as plt

## Delinquance

In [3]:
path="/content/drive/MyDrive/Cours_EPSI/MSPR/MSPR_TPRE813_1/data/Criminalite/donnee-comm-data.gouv-parquet-2023-geographie2024-produit-le2024-07-05.parquet"
delinquance_brut = pd.read_parquet(path, engine='pyarrow')
#filtrage par rapport à une valeur de code commune pour verifier la bonne presence de celle-ci
delinquance_brut[delinquance_brut['CODGEO_2024'] == '13206']

Unnamed: 0,CODGEO_2024,annee,classe,unité.de.compte,valeur.publiée,faits,tauxpourmille,complementinfoval,complementinfotaux,POP,millPOP,LOG,millLOG
61768,13206,16,Coups et blessures volontaires,victime,diff,289.0,6.710007,,,43070,16,26958.736388,16
61769,13206,16,Coups et blessures volontaires intrafamiliaux,victime,diff,100.0,2.321802,,,43070,16,26958.736388,16
61770,13206,16,Autres coups et blessures volontaires,victime,diff,189.0,4.388205,,,43070,16,26958.736388,16
61771,13206,16,Violences sexuelles,victime,diff,47.0,1.091247,,,43070,16,26958.736388,16
61772,13206,16,Vols avec armes,infraction,diff,25.0,0.580450,,,43070,16,26958.736388,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3489817,13206,23,Vols dans les véhicules,véhicule,diff,977.0,24.642470,,,39647,21,27651.762234,21
3489818,13206,23,Vols d'accessoires sur véhicules,véhicule,diff,108.0,2.724040,,,39647,21,27651.762234,21
3489819,13206,23,Destructions et dégradations volontaires,infraction,diff,537.0,13.544530,,,39647,21,27651.762234,21
3489820,13206,23,Trafic de stupéfiants,Mis en cause,diff,86.0,2.169143,,,39647,21,27651.762234,21


In [4]:
delinquance_drop = delinquance_brut.drop(columns=['millPOP','millLOG','classe', 'unité.de.compte',	'valeur.publiée',	'complementinfoval',	'complementinfotaux',	'POP', 'LOG'])
delinquance_drop.tail()

Unnamed: 0,CODGEO_2024,annee,faits,tauxpourmille
3917755,97617,23,19.0,1.363571
3917756,97617,23,,
3917757,97617,23,87.0,6.24372
3917758,97617,23,,
3917759,97617,23,,


In [9]:
delinquance_rename = delinquance_drop.rename(columns={'CODGEO_2024' : 'codecommune','LIBGEO' : 'nomcommune','DEP' : 'dep', 'REG' : 'reg'})
delinquance_rename

Unnamed: 0,codecommune,annee,faits,tauxpourmille
0,01001,16,,
1,01001,16,0.0,0.000000
2,01001,16,,
3,01001,16,0.0,0.000000
4,01001,16,0.0,0.000000
...,...,...,...,...
3917755,97617,23,19.0,1.363571
3917756,97617,23,,
3917757,97617,23,87.0,6.243720
3917758,97617,23,,


In [12]:
delinquance_drop_na = delinquance_rename.dropna(subset=['annee'])
delinquance_drop_na

Unnamed: 0,codecommune,annee,faits,tauxpourmille
0,01001,16,,
1,01001,16,0.0,0.000000
2,01001,16,,
3,01001,16,0.0,0.000000
4,01001,16,0.0,0.000000
...,...,...,...,...
3917755,97617,23,19.0,1.363571
3917756,97617,23,,
3917757,97617,23,87.0,6.243720
3917758,97617,23,,


In [13]:
# conversion de string à int avec regex pour eviter les problèmes de changement de valeurs de base 10 (ex: 2A001 au lieu de 1001 par exemple)
delinquance_drop_na['codecommune'] = delinquance_drop_na['codecommune'].str.replace('[^0-9]', '', regex=True).astype('int64')

# retire les premiers 0 pour mettre au même format que les DF de mes camarades
delinquance_drop_na['annee'] = delinquance_drop_na['annee'].astype(str)
delinquance_drop_na['annee'] = delinquance_drop_na['annee'].str.replace('.0','')
delinquance_drop_na['annee'] = '20' + delinquance_drop_na['annee']
delinquance_drop_na['annee'] = delinquance_drop_na['annee'].str.replace('[^0-9]', '', regex=True).astype('int64')
delinquance_drop_na

Unnamed: 0,codecommune,annee,faits,tauxpourmille
0,1001,2016,,
1,1001,2016,0.0,0.000000
2,1001,2016,,
3,1001,2016,0.0,0.000000
4,1001,2016,0.0,0.000000
...,...,...,...,...
3917755,97617,2023,19.0,1.363571
3917756,97617,2023,,
3917757,97617,2023,87.0,6.243720
3917758,97617,2023,,


In [17]:
# On groupe les colonnes et on somme la colonne 'faits'
df_grouped = delinquance_drop_na.groupby(['codecommune','annee'])['faits'].sum().reset_index()

df_grouped.head(50)

Unnamed: 0,codecommune,annee,faits
0,1001,2016,0.0
1,1001,2017,0.0
2,1001,2018,0.0
3,1001,2019,0.0
4,1001,2020,0.0
5,1001,2021,0.0
6,1001,2022,0.0
7,1001,2023,0.0
8,1002,2016,0.0
9,1002,2017,0.0


In [19]:
df_grouped['faits']=df_grouped['faits'].astype('int64')
df_grouped.head(50)

Unnamed: 0,codecommune,annee,faits
0,1001,2016,0
1,1001,2017,0
2,1001,2018,0
3,1001,2019,0
4,1001,2020,0
5,1001,2021,0
6,1001,2022,0
7,1001,2023,0
8,1002,2016,0
9,1002,2017,0
