## Merging Caracteristiques, Lieux, Usagers (2019-2023) datasets

In [5]:
import pandas as pd

pd.set_option('display.max_columns', None)

In [6]:
# Read CSVs into df

details2019 = pd.read_csv('../data/caracteristiques-2019.csv', sep=';')
details2020 = pd.read_csv('../data/caracteristiques-2020.csv', sep=';')
details2021 = pd.read_csv('../data/carcteristiques-2021.csv', sep=';')
details2022 = pd.read_csv('../data/carcteristiques-2022.csv', sep=';')
details2023 = pd.read_csv('../data/caracteristiques-2023.csv', sep=';')
place2019 = pd.read_csv('../data/lieux-2019.csv', sep=';')
place2020 = pd.read_csv('../data/lieux-2020.csv', sep=';')
place2021 = pd.read_csv('../data/lieux-2021.csv', sep=';')
place2022 = pd.read_csv('../data/lieux-2022.csv', sep=';')
place2023 = pd.read_csv('../data/lieux-2023.csv', sep=';')
users2019 = pd.read_csv('../data/usagers-2019.csv', sep=';')
users2020 = pd.read_csv('../data/usagers-2020.csv', sep=';')
users2021 = pd.read_csv('../data/usagers-2021.csv', sep=';')
users2022 = pd.read_csv('../data/usagers-2022.csv', sep=';')
users2023 = pd.read_csv('../data/usagers-2023.csv', sep=';')


  place2022 = pd.read_csv('../data/lieux-2022.csv', sep=';')
  place2023 = pd.read_csv('../data/lieux-2023.csv', sep=';')


In [7]:
# Combine details dfs

details_combined = pd.concat(
    [details2019, details2020, details2021, details2022, details2023],
    ignore_index=True
)

print(f'df shape: {details_combined.shape}')

df shape: (273226, 16)


In [9]:
# Combine place dfs

places_combined = pd.concat(
    [place2019, place2020, place2021, place2022, place2023],
    ignore_index=True
)

print(f'df shape: {places_combined.shape}')

df shape: (289264, 18)


In [10]:
# Combine users dfs

users_combined = pd.concat(
    [users2019, users2020, users2021, users2022, users2023],
    ignore_index=True
)

print(f'df shape: {users_combined.shape}')

df shape: (619971, 16)


In [11]:
# Create a new column showing how many users involved per accident
users_combined['users_involved'] = (
    users_combined.groupby('Num_Acc')['Num_Acc'].transform('count')
)

In [12]:
# Group users dataset to only show the max grave row for each accident ('Num_Acc')
users_max_grav = users_combined.loc[users_combined.groupby('Num_Acc')['grav'].idxmax()]

* Details df: There are 55k rows with no Num_Acc. These will be dropped when joining as we don't know which accident to attribute these to.
* Users df: Each accident can involve multiple users. Only keeping the row from each accident with the highest severity.
* Places df: Each accident can occur on more than one road. Will keep them all in for now before deciding on how to treat these.

In [13]:
# Use places_combined as main df. Left joins on users and details dfs
df = (
    places_combined
    .merge(users_max_grav, on='Num_Acc', how='left')
    .merge(details_combined, on='Num_Acc', how='left')
)

In [14]:
df.shape

(289264, 49)

In [15]:
# Rename columns

df.rename(columns={
    'Num_Acc': 'accident_number',
    'jour': 'day',
    'mois': 'month',
    'an': 'year',
    'hrmn': 'hour_minute',
    'lum': 'light_conditions',
    'dep': 'department',
    'com': 'commune',
    'agg': 'urban_area',
    'int': 'intersection_type',
    'atm': 'weather',
    'col': 'collision_type',
    'adr': 'road_address',
    'lat': 'latitude',
    'long': 'longitude',
    'Accident_Id': 'accident_uid',
    'catr': 'road_category',
    'voie': 'lane_type', # Actually 'road number'
    'v1': 'numerical_index_road',
    'v2': 'alphanumeric_index_road',
    'circ': 'road_layout',
    'nbv': 'num_lanes',
    'vosp': 'reserved_lane',
    'prof': 'road_profile',
    'pr': 'road_ref_1',
    'pr1': 'road_ref_2',
    'plan': 'road_shape',
    'lartpc': 'width_central_reservation',
    'larrout': 'width_carriageway',
    'surf': 'surface_condition',
    'infra': 'infrastructure',
    'situ': 'road_location',
    'vma': 'speed_limit',
    'id_vehicule': 'vehicle_id',
    'num_veh': 'vehicle_number',
    'place': 'seat_position',
    'catu': 'user_category',
    'grav': 'injury_severity',
    'sexe': 'sex',
    'an_nais': 'birth_year',
    'trajet': 'trip_purpose',
    'secu1': 'safety_device_1',
    'secu2': 'safety_device_2',
    'secu3': 'safety_device_3',
    'locp': 'pedestrian_location',
    'actp': 'pedestrian_action',
    'etatp': 'pedestrian_alone',
    'id_usager': 'user_id'
}, inplace=True)

In [16]:
# Combine day, month, year columns
df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

# Create day of the week column
df['day_of_week'] = df['date'].dt.day_name()

# Drop old columns
df.drop(columns=['year', 'month', 'day'], inplace=True)

# Move new columns to the front
cols = ['date', 'day_of_week'] + [c for c in df.columns if c not in ['date', 'day_of_week']]
df = df[cols]

In [17]:
df.head()

Unnamed: 0,date,day_of_week,accident_number,road_category,lane_type,numerical_index_road,alphanumeric_index_road,road_layout,num_lanes,reserved_lane,road_profile,road_ref_1,road_ref_2,road_shape,width_central_reservation,width_carriageway,surface_condition,infrastructure,road_location,speed_limit,vehicle_id,vehicle_number,seat_position,user_category,injury_severity,sex,birth_year,trip_purpose,safety_device_1,safety_device_2,safety_device_3,pedestrian_location,pedestrian_action,pedestrian_alone,user_id,users_involved,hour_minute,light_conditions,department,commune,urban_area,intersection_type,weather,collision_type,road_address,latitude,longitude,accident_uid
0,2019-11-30,Saturday,201900000001,1,3,0.0,,3,10,0,1,6,900,2,,,1,2,1,70,138 306 524,B01,2,2,4,2,2002.0,0,1,0,-1,-1,-1,-1,,3,01:30,4.0,93,93053,1.0,1.0,1.0,2.0,AUTOROUTE A3,488962100,24701200,
1,2019-11-30,Saturday,201900000002,1,1,0.0,,1,2,0,4,3,845,2,,,1,0,1,70,138 306 523,A01,1,1,4,2,1994.0,0,1,0,-1,-1,-1,-1,,1,02:50,3.0,93,93066,1.0,1.0,1.0,6.0,AUTOROUTE A1,489307000,23688000,
2,2019-11-28,Thursday,201900000003,1,86,0.0,,3,8,0,1,10,500,3,,,1,0,1,90,138 306 520,A01,2,2,4,2,1930.0,9,1,0,-1,-1,0,-1,,4,15:15,1.0,92,92036,1.0,1.0,1.0,4.0,AUTOROUTE A86,489358718,23191744,
3,2019-11-30,Saturday,201900000004,1,4,0.0,,3,5,0,1,2,299,1,,,1,0,1,90,138 306 518,B01,2,2,4,2,1978.0,5,1,8,-1,-1,0,-1,,4,20:20,5.0,94,94069,1.0,1.0,1.0,4.0,A4,488173295,24281502,
4,2019-11-30,Saturday,201900000005,1,86,0.0,INT,1,3,0,1,41,0,3,,,1,2,1,90,138 306 516,A01,2,2,4,1,1999.0,5,1,0,-1,-1,0,-1,,3,04:00,3.0,94,94028,1.0,1.0,1.0,2.0,A86 INT,487763620,24332540,


In [18]:
# Null values

# Absolute number of missing values per column
null_counts = df.isnull().sum()

# Percentage of missing values per column
null_percentage = df.isnull().mean() * 100

# Combine null stats
missing_summary = pd.DataFrame({
    'null_count': null_counts,
    'null_percentage': null_percentage
}).sort_values(by='null_count', ascending=False)

missing_summary[missing_summary['null_percentage']>0]

Unnamed: 0,null_count,null_percentage
accident_uid,289264,100.0
width_central_reservation,288737,99.817813
alphanumeric_index_road,266104,91.993473
user_id,106584,36.846618
width_carriageway,58468,20.212678
road_address,58299,20.154254
day_of_week,55302,19.118176
date,55302,19.118176
longitude,55302,19.118176
urban_area,55302,19.118176


In [19]:
# Export df to CSV
df.to_csv('../data/df.csv', index=False)