# Merging Data for 2019 & 2020
## Import libraries & packages

In [1]:
import pandas as pd
import numpy as np

## Load CSV files : 2019-2020

In [2]:
# Every source file is a semicolon-separated CSV, so the separator is fixed
# once in a small reader instead of being repeated on each call.
def _read_semicolon_csv(path):
    """Read one semicolon-delimited CSV file into a DataFrame."""
    return pd.read_csv(path, sep=';')

# 2020 tables: characteristics, vehicles, places, users
carac_20 = _read_semicolon_csv('data/2020/caracteristiques-2020.csv')
veh_20 = _read_semicolon_csv('data/2020/vehicules-2020.csv')
lieux_20 = _read_semicolon_csv('data/2020/lieux-2020.csv')
user_20 = _read_semicolon_csv('data/2020/usagers-2020.csv')

# 2019 tables: same four files, one year earlier
carac_19 = _read_semicolon_csv('data/2019/caracteristiques-2019.csv')
veh_19 = _read_semicolon_csv('data/2019/vehicules-2019.csv')
lieux_19 = _read_semicolon_csv('data/2019/lieux-2019.csv')
user_19 = _read_semicolon_csv('data/2019/usagers-2019.csv')

## Merging files by year

In [3]:
# Stack the 2019 rows on top of the 2020 rows for each table.
# No ignore_index is passed, so every yearly frame keeps its own row labels
# and the stacked frames therefore contain repeated labels.
u_19_20 = pd.concat((user_19, user_20))
v_19_20 = pd.concat((veh_19, veh_20))
c_19_20 = pd.concat((carac_19, carac_20))

# 'Lieux': drop 'larrout' and 'lartpc' right after stacking
# (most of their values are absent).
l_19_20 = pd.concat((lieux_19, lieux_20)).drop(columns=['larrout', 'lartpc'])

- Convert *(an, mois, jour, hrmn)* columns to one **datetime** column :

In [4]:
# Split 'hrmn' (two ':'-separated integer parts: hour, minute).
# The split must come from the concatenated table itself. Taking it from the
# 2019 frame only would, through index alignment on the repeated row labels,
# stamp every 2020 accident with the time of the 2019 row sharing its label.
time_parts = c_19_20['hrmn'].str.split(":", expand=True).astype('int')

# Gather the components under the column names pd.to_datetime expects
# (year / month / day / hour / minute). All pieces come from c_19_20, so
# their indexes are equal and nothing is re-aligned.
date_parts = (
    c_19_20[['an', 'mois', 'jour']]
    .rename(columns={'an': 'year', 'mois': 'month', 'jour': 'day'})
    .assign(hour=time_parts[0], minute=time_parts[1])
)

# create the new datetime column
c_19_20['date'] = pd.to_datetime(date_parts)

# drop the old columns now that 'date' carries the same information
c_19_20.drop(['an', 'mois', 'jour', 'hrmn'], axis=1, inplace=True)

## General Infos about the tables

- Display the columns of each table:

In [5]:
# Print the column names of each concatenated table, one line per table.
for label, table in (
    ("Carac columns     : ", c_19_20),
    ("Lieux columns     : ", l_19_20),
    ("Vehicules columns : ", v_19_20),
    ("Usagers columns   : ", u_19_20),
):
    print(label, list(table.columns))

Carac columns     :  ['Num_Acc', 'lum', 'dep', 'com', 'agg', 'int', 'atm', 'col', 'adr', 'lat', 'long', 'date']
Lieux columns     :  ['Num_Acc', 'catr', 'voie', 'v1', 'v2', 'circ', 'nbv', 'vosp', 'prof', 'pr', 'pr1', 'plan', 'surf', 'infra', 'situ', 'vma']
Vehicules columns :  ['Num_Acc', 'id_vehicule', 'num_veh', 'senc', 'catv', 'obs', 'obsm', 'choc', 'manv', 'motor', 'occutc']
Usagers columns   :  ['Num_Acc', 'id_vehicule', 'num_veh', 'place', 'catu', 'grav', 'sexe', 'an_nais', 'trajet', 'secu1', 'secu2', 'secu3', 'locp', 'actp', 'etatp']


- Display the number of rows of each table:

In [6]:
# Print the number of rows of each concatenated table, one line per table.
for label, table in (
    ("'User' rows                : ", u_19_20),
    ("'Vehicule' rows            : ", v_19_20),
    ("'Lieux' rows               : ", l_19_20),
    ("'Caracteristiques' rows    : ", c_19_20),
):
    print(label, len(table))

'User' rows                :  238272
'Vehicule' rows            :  181776
'Lieux' rows               :  106584
'Caracteristiques' rows    :  106584


- Remarks about the identification of each table:
    - Usagers : Num_Acc is not unique; id_vehicule is not unique
    - Vehicule : Num_Acc is not unique; id_vehicule is unique
    - Lieux : Num_Acc is unique
    - Caracteristiques : Num_Acc is unique
    - num_veh should be dropped
    > **1 Accident** *('Caracteristiques' + 'Lieux')* contains **multiple** *'Vehicule'* which contains **multiple** *'Usager'*.

## Merging tables

- Merging 'Lieux' & 'Caractéristiques' : Num_Acc is unique between the two, we consider that 'Lieux' is an extension for 'Caractéristiques' :

In [7]:
# Attach the 'Lieux' columns to 'Caracteristiques' through the shared accident id.
cl_19_20 = c_19_20.merge(l_19_20, on='Num_Acc')

- Merging 'Vehicule' & 'Caracteristiques'+'Lieux' :

In [8]:
# Give every vehicle row the accident-level columns (characteristics + places).
vcl_19_20 = v_19_20.merge(cl_19_20, on='Num_Acc')

- Merged dataset, which represents road accident victims in 2019-2020 by vehicles, places and circumstances of the accident:

In [9]:
# Join each user row to its vehicle (and, through it, to the accident data).
uvcl_19_20 = u_19_20.merge(vcl_19_20, on=['id_vehicule', 'num_veh', 'Num_Acc'])

# Use the row index of the merged frame as the victim identifier and put it
# directly in first position, instead of appending it and reordering columns.
uvcl_19_20.insert(0, 'victim_id', uvcl_19_20.index)

# Replace (-1) values with NaN :
uvcl_19_20 = uvcl_19_20.replace(-1, np.nan)

- Dimensions verification:

In [10]:
# (rows, columns) of the merged dataset
merged_shape = uvcl_19_20.shape
print ("Dimensions of the Merged Dataset : ", merged_shape)

# column names of the merged dataset, in order
merged_columns = uvcl_19_20.columns.tolist()
print("Dataset columns : \n", merged_columns)

Dimensions of the Merged Dataset :  (238272, 50)
Dataset columns : 
 ['victim_id', 'Num_Acc', 'id_vehicule', 'num_veh', 'place', 'catu', 'grav', 'sexe', 'an_nais', 'trajet', 'secu1', 'secu2', 'secu3', 'locp', 'actp', 'etatp', 'senc', 'catv', 'obs', 'obsm', 'choc', 'manv', 'motor', 'occutc', 'lum', 'dep', 'com', 'agg', 'int', 'atm', 'col', 'adr', 'lat', 'long', 'date', 'catr', 'voie', 'v1', 'v2', 'circ', 'nbv', 'vosp', 'prof', 'pr', 'pr1', 'plan', 'surf', 'infra', 'situ', 'vma']


## Save the generated dataset to a new file :

- The generated dataset is saved to *'donnes_19_20_fusionnes.csv'*  in the ***data/*** folder :

In [11]:
# Write the merged dataset as a semicolon-separated CSV; the row index is
# written as well, since to_csv is left at its default index setting.
uvcl_19_20.to_csv('data/donnes_19_20_fusionnes.csv', sep=';')