# Cleaning

In [1]:
import time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
def count_unique_combinations(df, columns):
    """Count the distinct value combinations found across ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read: no column is added, overwritten
        or removed.
    columns : list of str
        Labels of the columns whose values are compared jointly, row by row.

    Returns
    -------
    int
        Number of distinct rows in ``df[columns]``. A missing value counts
        as a value of its own, and two missing values compare equal.
    """
    # Compare whole rows instead of joining the values into one string:
    # a joined key cannot tell ("x_y", "z") from ("x", "y_z"), and building
    # it needs a scratch column in the caller's frame.
    distinct_rows = df[columns].drop_duplicates()
    return len(distinct_rows)

In [3]:
# Time the load so the cost of this cell is visible in its output.
# perf_counter is a monotonic clock meant for measuring intervals.
start = time.perf_counter()

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of
# numbers and strings. NOTE(review): the saved output of this cell shows a
# warning raised on this line - presumably the mixed-dtype DtypeWarning; confirm
# it is gone after re-running.
df = pd.read_csv("./data/accidents.csv", low_memory=False)

end = time.perf_counter()

# Elapsed wall time of the read, in seconds
total_time = end - start
print(f"Loaded dataset in {round(total_time, 2)}s")


# Column order used for the rest of the notebook; selecting with this list
# also raises a KeyError early if an expected column is absent from the file.
order = ["Num_Acc", "id_vehicule", "num_veh", "an", "mois", "jour", "hrmn",
         "lat", "long", "col", "com", "adr", "gps",
         "lum", "agg", "int", "atm", "dep", "catr", "voie", "v1",
         "v2", "circ", "nbv", "vosp", "prof", "pr", "pr1", "plan", "lartpc",
         "larrout", "surf", "infra", "situ", "vma",
         "senc", "catv", "obs", "obsm", "choc", "manv", "motor", "occutc",
         "place", "catu", "sexe", "an_nais", "trajet", "secu", "secu1",
         "secu2", "secu3", "locp", "actp", "etatp", "grav"]

df = df[order]

# Headline counts before any cleaning. One row is reported as one person,
# so the people count and the row count are both len(df).
print(f"Number of accidents: {df['Num_Acc'].nunique():,}")
print(f"Number of vehicles involved: {df['id_vehicule'].nunique():,}")
print(f"Number of people involved: {len(df):,}")
print(f"Number of years covered: {df['an'].nunique():,}")
print(f"Dataset has {len(df):,} rows before cleaning.")

  df = pd.read_csv("./data/accidents.csv")


Loaded dataset in 29.24s
Number of accidents: 1,121,571
Number of vehicles involved: 275,146
Number of people involved: 2,509,598
Number of years covered: 17
Dataset has 2,509,598 rows before cleaning.


## Encode all years in YYYY format

In [4]:
def format_years(year):
    """Return ``year`` as a ``YYYY`` string.

    Values up to 18 are treated as two-digit years of the 2000s
    (``5`` -> ``"2005"``); anything larger is returned as its integer text.

    Parameters
    ----------
    year : int, float or str
        Year as stored in the raw data. It is converted with ``int()`` first,
        so ``5``, ``5.0`` and ``"5"`` all give the same result.

    Returns
    -------
    str
        The formatted year.

    Raises
    ------
    ValueError
        If ``year`` cannot be converted to an integer (for example NaN).
    """
    # Convert once and format the integer: applying the zero-padding spec to
    # the raw value would pad a string on the right ("5" -> "50") and leave
    # a float's decimals in place (5.0 -> "5.0").
    year_number = int(year)
    if year_number <= 18:
        return f"20{year_number:02d}"
    return str(year_number)

# Pass every entry of the year column through format_years
df["an"] = df["an"].map(format_years)

## Encode all hrmn in military format

In [5]:
# Normalise every time of day to a 4-character "HHMM" string.
hrmn_raw = df["hrmn"]
hrmn_text = (
    hrmn_raw.astype(str)
    .str.replace(":", "", regex=False)  # "19:30" -> "1930"; literal match, not a regex
    .str.zfill(4)                       # left-pad shorter values with zeros: "130" -> "0130"
)
# astype(str) turns a missing value into the literal text "nan";
# put the missing marker back so it is not mistaken for a time.
df["hrmn"] = hrmn_text.where(hrmn_raw.notna())

## Encode -1 as missing

In [6]:
# Columns in which the code -1 is to be treated as a missing value
columns_to_replace = [
    "atm", "col", "circ", "vosp", "prof", "pr", "pr1", "plan", "surf", "infra", "situ",
    "senc", "obs", "obsm", "choc", "manv", "motor", "trajet",
    "secu1", "secu2", "secu3", "locp", "actp", "etatp", "grav",
]

# Swap the -1 code for NaN across all of those columns in one pass
without_minus_one = df[columns_to_replace].replace(to_replace=-1, value=np.nan)
df[columns_to_replace] = without_minus_one

# 'trajet' additionally uses 0 as a missing-value code
df["trajet"] = df["trajet"].replace(to_replace=0, value=np.nan)

## Encode categories

In [7]:
# Every column handled in the -1 step is categorical too, so that list is
# appended to the columns named here.
columns_to_convert = [
    "an", "lum", "dep", "com", "agg", "int", "catr", "pr", "catv", "place", "catu", "sexe",
    *columns_to_replace,
]

# One astype call with a column -> dtype mapping; a name that appears twice
# in the list collapses to a single mapping entry.
df = df.astype({name: "category" for name in columns_to_convert})

## Encode numbers

In [8]:
# Columns whose numbers may be written with a decimal comma
columns_to_convert = ["lartpc", "larrout"]

for col in columns_to_convert:
    raw = df[col]
    # Cast to text before touching the separator: on a column holding both
    # strings and real numbers, .str.replace returns NaN for every non-string
    # element (silently discarding it), and on a purely numeric column the
    # .str accessor raises.
    as_text = raw.astype(str).str.replace(",", ".", regex=False)
    # astype(str) turned missing values into the text "nan"; mask them back to
    # missing so only genuine values reach the parser.
    df[col] = pd.to_numeric(as_text.where(raw.notna()))

## Encode latitude and longitude

In [9]:
# Convert latitude and longitude to numeric values.
for coord in ("lat", "long"):
    raw_coord = df[coord]
    # Cast to text first: .str.replace returns NaN for non-string elements of a
    # mixed column (discarding coordinates already parsed as numbers) and
    # raises on a purely numeric column.
    coord_text = raw_coord.astype(str).str.replace(",", ".", regex=False)
    # Keep originally-missing entries missing, and coerce anything that still
    # does not parse as a number to NaN.
    df[coord] = pd.to_numeric(coord_text.where(raw_coord.notna()), errors="coerce")

## Drop ID, sparse and irrelevant columns

Dropping columns:
- "Num_Acc": ID column, irrelevant for modeling
- "id_vehicule": ID column, irrelevant for modeling
- "gps": 56% missing values
- "v1": 56.45% missing values
- "v2": 95.25% missing values
- "adr": too many different categorical values
- "voie": too many different categorical values?
- "pr": irrelevant
- "pr1": irrelevant
- "lat": 84.90% missing values
- "long": 81.75% missing values
- "com": irrelevant
- "dep": irrelevant
- "num_veh": identifies the vehicle if more than one vehicle is involved in an accident
- "lartpc": 99.98% missing values
- "larrout": 90.64% missing values
- "vma": 85.35% missing values
- "motor": 85.38% missing values
- "occutc": 14.48% missing values, only for public transport
- "secu": 16.90% missing values, only for some observations
- "secu1": 85.47% missing values, only for some observations
- "secu2": 90.72% missing values, only for some observations
- "secu3": 99.83% missing values, only for some observations
- "locp": few missing values, but only for pedestrians
- "actp": few missing values, but only for pedestrians
- "etatp": few missing values, but only for pedestrians

In [10]:
# Columns removed before modelling; the reason for each is given in the
# markdown list preceding this cell.
columns_to_drop = [
    "Num_Acc", "id_vehicule", "num_veh",
    "gps", "lat", "long", "adr", "voie", "v1", "v2", "pr", "pr1", "com", "dep",
    "lartpc", "larrout", "vma", "motor", "occutc",
    "secu", "secu1", "secu2", "secu3",
    "locp", "actp", "etatp",
]

df = df.drop(columns=columns_to_drop)

In [11]:
# Print the columns still present, the dtype of each and the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509598 entries, 0 to 2509597
Data columns (total 30 columns):
 #   Column   Dtype   
---  ------   -----   
 0   an       category
 1   mois     int64   
 2   jour     int64   
 3   hrmn     object  
 4   col      category
 5   lum      category
 6   agg      category
 7   int      category
 8   atm      category
 9   catr     category
 10  circ     category
 11  nbv      float64 
 12  vosp     category
 13  prof     category
 14  plan     category
 15  surf     category
 16  infra    category
 17  situ     category
 18  senc     category
 19  catv     category
 20  obs      category
 21  obsm     category
 22  choc     category
 23  manv     category
 24  place    category
 25  catu     category
 26  sexe     category
 27  an_nais  float64 
 28  trajet   category
 29  grav     category
dtypes: category(25), float64(2), int64(2), object(1)
memory usage: 155.6+ MB


## Drop duplicates

In [12]:
# Keep the first occurrence of each fully identical row and discard the repeats
is_repeat = df.duplicated(keep="first")
df = df[~is_repeat]

## Drop columns without predictive power

- Columns 'dep', 'id_vehicule' and 'num_veh' don't convey any predictive power, or duplicate information found elsewhere. They are already removed in the column-drop step above, so this cell only acts as a safeguard.

In [13]:
# errors="ignore" lets this run even when some of these columns are already gone
df = df.drop(columns=["dep", "id_vehicule", "num_veh"], errors="ignore")

## Drop rows with missing predicted variable

In [14]:
# Keep only the rows where 'grav' is present
df = df.dropna(subset=["grav"])

In [15]:
# Write the cleaned frame to disk and report how long the write took
write_started = time.time()
df.to_csv("./data/accidents_before_eda_cleaned.csv", index=False)
write_seconds = time.time() - write_started

print(f"Wrote cleaned dataset in {round(write_seconds, 2)}s")
print(f"Dataset has {len(df):,} rows after cleaning.")

Wrote cleaned dataset in 92.84s
Dataset has 2,505,924 rows after cleaning.


In [16]:
# Stamp the notebook with the local wall-clock time of this run (HH:MM:SS)
run_timestamp = datetime.now().strftime("%H:%M:%S")
print(f"Last run at {run_timestamp}")

Last run at 02:01:21
