## Merging Caracteristiques, Lieux, Usagers (2019-2023) datasets

In [None]:
import pandas as pd

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

In [None]:
# Load each yearly CSV (semicolon-separated) into its own DataFrame.
# NOTE(review): the 2021 and 2022 details paths are spelled 'carcteristiques';
# presumably this mirrors the file names on disk -- confirm before "fixing".

# Accident details, one frame per year
details2019, details2020, details2021, details2022, details2023 = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        '../data/caracteristiques-2019.csv',
        '../data/caracteristiques-2020.csv',
        '../data/carcteristiques-2021.csv',
        '../data/carcteristiques-2022.csv',
        '../data/caracteristiques-2023.csv',
    )
)

# Places (lieux), one frame per year
place2019, place2020, place2021, place2022, place2023 = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        '../data/lieux-2019.csv',
        '../data/lieux-2020.csv',
        '../data/lieux-2021.csv',
        '../data/lieux-2022.csv',
        '../data/lieux-2023.csv',
    )
)

# Users (usagers), one frame per year
users2019, users2020, users2021, users2022, users2023 = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        '../data/usagers-2019.csv',
        '../data/usagers-2020.csv',
        '../data/usagers-2021.csv',
        '../data/usagers-2022.csv',
        '../data/usagers-2023.csv',
    )
)


In [None]:
# Stack the yearly details frames into one frame with a fresh 0..n-1 index
details_by_year = [details2019, details2020, details2021, details2022, details2023]
details_combined = pd.concat(details_by_year, ignore_index=True)

print(f'df shape: {details_combined.shape}')

In [None]:
# Combine place dfs: stack the yearly frames into one frame with a fresh 0..n-1 index

places_combined = pd.concat(
    [place2019, place2020, place2021, place2022, place2023],
    ignore_index=True
)

# Report the shape of the frame just built (the variable is `places_combined`;
# the previous `place_combined` reference raised NameError on a fresh kernel).
print(f'df shape: {places_combined.shape}')

In [None]:
# Stack the yearly users frames into one frame with a fresh 0..n-1 index
users_by_year = [users2019, users2020, users2021, users2022, users2023]
users_combined = pd.concat(users_by_year, ignore_index=True)

print(f'df shape: {users_combined.shape}')

In [None]:
# Add 'users_involved': the number of user rows sharing each accident number,
# broadcast back onto every row of that accident.
users_per_accident = users_combined.groupby('Num_Acc')['Num_Acc']
users_combined['users_involved'] = users_per_accident.transform('count')

In [None]:
# Keep one row per accident ('Num_Acc'): the most severely injured user.
#
# The 'grav' codes are not ordinal. Per the ONISR/BAAC data description:
#   1 = unharmed, 2 = killed, 3 = injured & hospitalised, 4 = lightly injured
# so taking the numeric max of 'grav' would keep a lightly injured user (4)
# ahead of a fatality (2). Rank the codes by actual severity first.
GRAV_SEVERITY_RANK = {1: 0, 4: 1, 3: 2, 2: 3}

# Unknown / missing codes rank lowest (-1) so they never win over a known
# severity, and so a group never consists solely of NaN ranks.
severity_rank = users_combined['grav'].map(GRAV_SEVERITY_RANK).fillna(-1)

# idxmax returns the first row label with the highest rank in each accident;
# the selected rows keep their original 'grav' code.
users_max_grav = users_combined.loc[
    severity_rank.groupby(users_combined['Num_Acc']).idxmax()
]

* Details df: There are 55k rows with no Num_Acc. These will be dropped when joining, as we don't know which accident to attribute them to.
* Users df: Each accident can involve multiple users. Only the row with the highest severity is kept for each accident.
* Places df: Each accident can occur on more than one road. All rows are kept for now; how to treat them will be decided later.

In [None]:
# places_combined is the base table: every place row is kept (left joins),
# first enriched with the per-accident user row, then with the accident details.
df = places_combined.merge(users_max_grav, on='Num_Acc', how='left')
df = df.merge(details_combined, on='Num_Acc', how='left')

In [None]:
# Display (rows, columns) of the merged frame
df.shape

In [None]:
# Rename the original French column codes to descriptive English names.
# Keys that are not present in df are ignored by rename.
column_renames = {
    'Num_Acc': 'accident_number',
    'jour': 'day',
    'mois': 'month',
    'an': 'year',
    'hrmn': 'hour_minute',
    'lum': 'light_conditions',
    'dep': 'department',
    'com': 'commune',
    'agg': 'urban_area',
    'int': 'intersection_type',
    'atm': 'weather',
    'col': 'collision_type',
    'adr': 'road_address',
    'lat': 'latitude',
    'long': 'longitude',
    'Accident_Id': 'accident_uid',
    'catr': 'road_category',
    # NOTE: 'voie' actually holds the road number, despite the 'lane_type' label
    'voie': 'lane_type',
    'v1': 'numerical_index_road',
    'v2': 'alphanumeric_index_road',
    'circ': 'road_layout',
    'nbv': 'num_lanes',
    'vosp': 'reserved_lane',
    'prof': 'road_profile',
    'pr': 'road_ref_1',
    'pr1': 'road_ref_2',
    'plan': 'road_shape',
    'lartpc': 'width_central_reservation',
    'larrout': 'width_carriageway',
    'surf': 'surface_condition',
    'infra': 'infrastructure',
    'situ': 'road_location',
    'vma': 'speed_limit',
    'id_vehicule': 'vehicle_id',
    'num_veh': 'vehicle_number',
    'place': 'seat_position',
    'catu': 'user_category',
    'grav': 'injury_severity',
    'sexe': 'sex',
    'an_nais': 'birth_year',
    'trajet': 'trip_purpose',
    'secu1': 'safety_device_1',
    'secu2': 'safety_device_2',
    'secu3': 'safety_device_3',
    'locp': 'pedestrian_location',
    'actp': 'pedestrian_action',
    'etatp': 'pedestrian_alone',
    'id_usager': 'user_id',
}

df = df.rename(columns=column_renames)

In [None]:
# Build a single datetime from the separate year / month / day columns
date_parts = ['year', 'month', 'day']
accident_dates = pd.to_datetime(df[date_parts])

# The component columns are no longer needed once the date exists
df = df.drop(columns=date_parts)

# Put the date and its weekday name at the front of the frame
df.insert(0, 'date', accident_dates)
df.insert(1, 'day_of_week', accident_dates.dt.day_name())

In [None]:
# Preview the first rows of the renamed, date-enriched frame
df.head()

In [None]:
# Missing-value overview per column

# Boolean mask of missing cells, computed once and reused below
is_missing = df.isnull()

null_counts = is_missing.sum()            # absolute number of nulls per column
null_percentage = is_missing.mean() * 100  # share of nulls per column, in percent

# One row per column, most-missing first
missing_summary = pd.DataFrame(
    {'null_count': null_counts, 'null_percentage': null_percentage}
)
missing_summary = missing_summary.sort_values(by='null_count', ascending=False)

# Show only the columns that have at least one missing value
has_missing = missing_summary['null_percentage'] > 0
missing_summary[has_missing]

In [None]:
# Write the merged frame to disk as CSV, without the positional index column
output_csv = '../data/df.csv'
df.to_csv(output_csv, index=False)