In [1]:
import psycopg2 # type: ignore
import pandas as pd # type: ignore

In [2]:
db_params = {
    'dbname': 'dataplatform',
    'user': 'postgres',
    'password': 'postgrespassword',
    'host': 'localhost',
    'port': 5434
}
table_name = "motor_vehicle_crashes"

In [3]:
def get_conn():
    try:
        print("Connecting to the database...")
        conn = psycopg2.connect(**db_params, options="-c client_encoding=UTF8")
        print("Connection successful!")
        return conn
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError: {e}")
        print("Please check db_params for invalid characters or database encoding.")
    except Exception as e:
        print(f"Unexpected error: {e}")

In [4]:
conn = get_conn()

Connecting to the database...
Connection successful!


In [5]:
query = f"SELECT * FROM {table_name}"

#### Collecter la données depuis la source

In [6]:
df = pd.read_sql(query, conn)

  df = pd.read_sql(query, conn)


In [None]:
df.head()

Unnamed: 0,year,casevehicleid,vehiclebodytype,registrationclass,actionpriortoaccident,truckbustypeaxles,directionoftravel,fueltype,vehicleyear,stateofregistration,numberofoccupants,enginecylinders,vehiclemake,contributingfactor1,contributingfactor1description,contributingfactor2,contributingfactor2description,eventtype,partialvin
0,2019,16400905,SUBURBAN,Not Entered,Going Straight Ahead,Not Entered,North,Not Entered,2014.0,VA,3.0,,GMC,HUMAN,Unsafe Speed,ENVMT,Pavement Slippery,,1GKKVRKD5EJ23OO86
1,2019,16400908,SUBURBAN,Not Entered,Entering Parked Position,Not Entered,East,Not Entered,2007.0,VA,1.0,,TOYT,HUMAN,Not Applicable,HUMAN,Not Applicable,Not Applicable,JTMBK31V276016643
2,2019,16400907,4 DOOR SEDAN,PASSENGER OR SUBURBAN,Backing,Not Entered,South,Gas,2008.0,NY,1.0,4.0,MAZDA,HUMAN,Backing Unsafely,HUMAN,Not Applicable,Not Applicable,1YVHP80C785M18961
3,2019,16400910,SUBURBAN,PASSENGER OR SUBURBAN,Making Left Turn,Not Entered,North,Gas,2017.0,NY,1.0,6.0,CHRYS,HUMAN,Not Applicable,HUMAN,Not Applicable,Not Applicable,2C4RC1BGXHR832560
4,2019,16400909,SUBURBAN,Not Entered,Slowing or Stopping,Not Entered,East,Not Entered,2019.0,AR,2.0,,SUBA,HUMAN,Unsafe Speed,ENVMT,Pavement Slippery,Not Applicable,JF2GTAEC8KH217364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2974595,2023,20104936,4 DOOR SEDAN,PASSENGER OR SUBURBAN,Going Straight Ahead,Not Entered,East,Gas,2007.0,NY,1.0,4.0,HONDA,HUMAN,Not Applicable,HUMAN,Alcohol Involvement,Not Entered,1HGFA16547L034999
2974596,2023,20104935,SUBURBAN,Not Entered,Parked,Not Entered,East,Not Entered,,NJ,0.0,,,HUMAN,Not Applicable,HUMAN,Not Applicable,Not Entered,
2974597,2023,20104942,4 DOOR SEDAN,Not Entered,Changing Lanes,Not Entered,East,Not Entered,,FL,1.0,,,HUMAN,Not Entered,HUMAN,Not Entered,Not Entered,
2974598,2023,20104941,SUBURBAN,PASSENGER OR SUBURBAN,Going Straight Ahead,Not Entered,East,Gas,2019.0,NY,1.0,4.0,SUBAR,HUMAN,Not Entered,HUMAN,Not Entered,Not Entered,JF2GTAEC6KH332898


#### Afficher les info globales sur ces données

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2974600 entries, 0 to 2974599
Data columns (total 19 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   year                            int64  
 1   casevehicleid                   int64  
 2   vehiclebodytype                 object 
 3   registrationclass               object 
 4   actionpriortoaccident           object 
 5   truckbustypeaxles               object 
 6   directionoftravel               object 
 7   fueltype                        object 
 8   vehicleyear                     float64
 9   stateofregistration             object 
 10  numberofoccupants               float64
 11  enginecylinders                 float64
 12  vehiclemake                     object 
 13  contributingfactor1             object 
 14  contributingfactor1description  object 
 15  contributingfactor2             object 
 16  contributingfactor2description  object 
 17  eventtype                  

## Chargement des données dans polars

In [9]:
import polars as pl # type: ignore


# Execute query and fetch data
with conn.cursor() as cursor:
    cursor.execute(query)
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]  # Get column names

# Convert fetched data to Polars DataFrame
df = pl.DataFrame(rows, schema=columns)

# Preview the data
df.head()


  df = pl.DataFrame(rows, schema=columns)


year,casevehicleid,vehiclebodytype,registrationclass,actionpriortoaccident,truckbustypeaxles,directionoftravel,fueltype,vehicleyear,stateofregistration,numberofoccupants,enginecylinders,vehiclemake,contributingfactor1,contributingfactor1description,contributingfactor2,contributingfactor2description,eventtype,partialvin
i64,i64,str,str,str,str,str,str,i64,str,i64,i64,str,str,str,str,str,str,str
2019,16400905,"""SUBURBAN""","""Not Entered""","""Going Straight Ahead""","""Not Entered""","""North""","""Not Entered""",2014,"""VA""",3,,"""GMC""","""HUMAN""","""Unsafe Speed""","""ENVMT""","""Pavement Slippery""",,"""1GKKVRKD5EJ23OO86"""
2019,16400908,"""SUBURBAN""","""Not Entered""","""Entering Parked Position""","""Not Entered""","""East""","""Not Entered""",2007,"""VA""",1,,"""TOYT""","""HUMAN""","""Not Applicable""","""HUMAN""","""Not Applicable""","""Not Applicable""","""JTMBK31V276016643"""
2019,16400907,"""4 DOOR SEDAN""","""PASSENGER OR SUBURBAN""","""Backing""","""Not Entered""","""South""","""Gas""",2008,"""NY""",1,4.0,"""MAZDA""","""HUMAN""","""Backing Unsafely""","""HUMAN""","""Not Applicable""","""Not Applicable""","""1YVHP80C785M18961"""
2019,16400910,"""SUBURBAN""","""PASSENGER OR SUBURBAN""","""Making Left Turn""","""Not Entered""","""North""","""Gas""",2017,"""NY""",1,6.0,"""CHRYS""","""HUMAN""","""Not Applicable""","""HUMAN""","""Not Applicable""","""Not Applicable""","""2C4RC1BGXHR832560"""
2019,16400909,"""SUBURBAN""","""Not Entered""","""Slowing or Stopping""","""Not Entered""","""East""","""Not Entered""",2019,"""AR""",2,,"""SUBA""","""HUMAN""","""Unsafe Speed""","""ENVMT""","""Pavement Slippery""","""Not Applicable""","""JF2GTAEC8KH217364"""


In [10]:
df.shape

(2974600, 19)

## Examiner les colonnes et types :

In [11]:
# Afficher les noms des colonnes et leurs types
print("Noms des colonnes et leurs types de données :")
df.schema

Noms des colonnes et leurs types de données :


Schema([('year', Int64),
        ('casevehicleid', Int64),
        ('vehiclebodytype', String),
        ('registrationclass', String),
        ('actionpriortoaccident', String),
        ('truckbustypeaxles', String),
        ('directionoftravel', String),
        ('fueltype', String),
        ('vehicleyear', Int64),
        ('stateofregistration', String),
        ('numberofoccupants', Int64),
        ('enginecylinders', Int64),
        ('vehiclemake', String),
        ('contributingfactor1', String),
        ('contributingfactor1description', String),
        ('contributingfactor2', String),
        ('contributingfactor2description', String),
        ('eventtype', String),
        ('partialvin', String)])

## Résumé statistique :

In [12]:
df.describe()

statistic,year,casevehicleid,vehiclebodytype,registrationclass,actionpriortoaccident,truckbustypeaxles,directionoftravel,fueltype,vehicleyear,stateofregistration,numberofoccupants,enginecylinders,vehiclemake,contributingfactor1,contributingfactor1description,contributingfactor2,contributingfactor2description,eventtype,partialvin
str,f64,f64,str,str,str,str,str,str,f64,str,f64,f64,str,str,str,str,str,str,str
"""count""",2974600.0,2974600.0,"""2974600""","""2974600""","""2974600""","""2974600""","""2974600""","""2974600""",2437414.0,"""2718085""",2675316.0,2268707.0,"""2440820""","""2974600""","""2974600""","""2974600""","""2974600""","""2915416""","""2376288"""
"""null_count""",0.0,0.0,"""0""","""0""","""0""","""0""","""0""","""0""",537186.0,"""256515""",299284.0,705893.0,"""533780""","""0""","""0""","""0""","""0""","""59184""","""598312"""
"""mean""",2020.564611,18275000.0,,,,,,,2012.954348,,1.229771,5.131923,,,,,,,
"""std""",1.244921,1087900.0,,,,,,,6.346895,,1.245335,1.456954,,,,,,,
"""min""",2019.0,15350252.0,"""2 DOOR SEDAN""","""AGRICULTURAL COMMERCIAL""","""Avoiding Object in Roadway""","""2 axle box trailer and 3 axle …","""East""","""Compressed Natural Gas""",20.0,"""AB""",0.0,0.0,"""A""","""ENVMT""","""Accelerator Defective""","""ENVMT""","""Accelerator Defective""","""Animal, Collision With""","""0"""
"""25%""",2019.0,17336531.0,,,,,,,2009.0,,1.0,4.0,,,,,,,
"""50%""",2021.0,18286656.0,,,,,,,2014.0,,1.0,4.0,,,,,,,
"""75%""",2022.0,19222960.0,,,,,,,2018.0,,1.0,6.0,,,,,,,
"""max""",2023.0,21284225.0,"""WELL SERVICING RIG""","""VAS VOLUNTEER AMBULANCE""","""Unknown""","""Unknown""","""West""","""Propane""",2024.0,"""ZS""",687.0,16.0,"""ZZ""","""VEHICLE""","""Windshield Inadequate""","""VEHICLE""","""Windshield Inadequate""","""Unknown""","""yv1ah99x81054962"""


## Nettoyage des Données avec polars

### 1- Identifier et gérer les valeurs manquantes :

In [13]:
# Vérifier la présence de valeurs nulles dans chaque colonne
missing_values = df.select([pl.col(c).is_null().sum().alias(c) for c in df.columns])
print("Nombre de valeurs manquantes par colonne :")
missing_values

Nombre de valeurs manquantes par colonne :


year,casevehicleid,vehiclebodytype,registrationclass,actionpriortoaccident,truckbustypeaxles,directionoftravel,fueltype,vehicleyear,stateofregistration,numberofoccupants,enginecylinders,vehiclemake,contributingfactor1,contributingfactor1description,contributingfactor2,contributingfactor2description,eventtype,partialvin
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,537186,256515,299284,705893,533780,0,0,0,0,59184,598312


### 2- Supprimer les valeurs manquantes

In [14]:
# Supprimer les lignes avec des valeurs nulles
df_cleaned = df.drop_nulls()

# Vérifier les données nettoyées
print("DataFrame après suppression des valeurs nulles :")
df_cleaned.head()

DataFrame après suppression des valeurs nulles :


year,casevehicleid,vehiclebodytype,registrationclass,actionpriortoaccident,truckbustypeaxles,directionoftravel,fueltype,vehicleyear,stateofregistration,numberofoccupants,enginecylinders,vehiclemake,contributingfactor1,contributingfactor1description,contributingfactor2,contributingfactor2description,eventtype,partialvin
i64,i64,str,str,str,str,str,str,i64,str,i64,i64,str,str,str,str,str,str,str
2019,16400907,"""4 DOOR SEDAN""","""PASSENGER OR SUBURBAN""","""Backing""","""Not Entered""","""South""","""Gas""",2008,"""NY""",1,4,"""MAZDA""","""HUMAN""","""Backing Unsafely""","""HUMAN""","""Not Applicable""","""Not Applicable""","""1YVHP80C785M18961"""
2019,16400910,"""SUBURBAN""","""PASSENGER OR SUBURBAN""","""Making Left Turn""","""Not Entered""","""North""","""Gas""",2017,"""NY""",1,6,"""CHRYS""","""HUMAN""","""Not Applicable""","""HUMAN""","""Not Applicable""","""Not Applicable""","""2C4RC1BGXHR832560"""
2019,16400912,"""SUBURBAN""","""PASSENGER OR SUBURBAN""","""Going Straight Ahead""","""Not Entered""","""East""","""Gas""",2012,"""NY""",2,8,"""FORD""","""ENVMT""","""Pavement Slippery""","""VEHICLE""","""Oversized Vehicle""","""Other*, Non-Collision""","""1FBNE3BL9CDA77581"""
2019,16400911,"""4 DOOR SEDAN""","""PASSENGER OR SUBURBAN""","""Going Straight Ahead""","""Not Entered""","""West""","""Gas""",2015,"""NY""",2,4,"""CHEVR""","""HUMAN""","""Unsafe Speed""","""ENVMT""","""Pavement Slippery""","""Not Applicable""","""1G11D5SL4FU120214"""
2019,16400915,"""PICKUP TRUCK""","""PASSENGER OR SUBURBAN""","""Stopped in Traffic""","""Not Entered""","""East""","""Gas""",2001,"""NY""",1,8,"""CHEVR""","""HUMAN""","""Not Applicable""","""HUMAN""","""Not Applicable""","""Not Applicable""","""1GCHC23U41F108012"""


### vérifier si les valeurs manquantes ont été reellement supprimer

In [15]:
# Vérifier la présence de valeurs nulles dans chaque colonne
missing_values = df_cleaned.select([pl.col(c).is_null().sum().alias(c) for c in df.columns])
print("Nombre de valeurs manquantes par colonne :")
missing_values

Nombre de valeurs manquantes par colonne :


year,casevehicleid,vehiclebodytype,registrationclass,actionpriortoaccident,truckbustypeaxles,directionoftravel,fueltype,vehicleyear,stateofregistration,numberofoccupants,enginecylinders,vehiclemake,contributingfactor1,contributingfactor1description,contributingfactor2,contributingfactor2description,eventtype,partialvin
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
df_cleaned.shape

(2163960, 19)

## Identification des doublons :

In [17]:
# Identifier les doublons dans le DataFrame
duplicate_count = df.shape[0] - df.unique().shape[0]
print(f"Nombre de doublons dans le DataFrame : {duplicate_count}")

Nombre de doublons dans le DataFrame : 0


## Distribution des types de véhicules : Quantifier les différents types de véhicules impliqués.

In [18]:
vehicle_count = (
    df_cleaned
    .group_by("vehiclebodytype")
    .agg(pl.count("vehiclebodytype").alias("Count"))
    .sort("Count", descending=True)
)
vehicle_count

vehiclebodytype,Count
str,u32
"""SUBURBAN""",975416
"""4 DOOR SEDAN""",802769
"""PICKUP TRUCK""",179245
"""2 DOOR SEDAN""",57276
"""VAN TRUCK""",42298
…,…
"""ROAD ROLLER""",1
"""BICYCLIST""",1
"""DLR/TRANSPORTER""",1
"""OFF ROAD VEHICLE""",1


## Répartition par année : Identifier les tendances annuelles sur les accidents

In [19]:
year_distribution = (
    df_cleaned
    .group_by("year")
    .agg(pl.count("year").alias("Year_Count"))
    .sort("year")
)
year_distribution

year,Year_Count
i64,u32
2019,633315
2020,430157
2021,493932
2022,513396
2023,93160


## Nombre d'occupants : Étudier la corrélation entre le nombre d'occupants et les accidents.

In [20]:
occupants_distribution = df_cleaned.group_by("numberofoccupants").agg(pl.count("numberofoccupants").alias('count')).sort("count", descending=True)
occupants_distribution

numberofoccupants,count
i64,u32
1,1520314
2,312253
0,200152
3,81648
4,31695
…,…
88,1
82,1
687,1
71,1


## Facteur contributif principal : Identifier les facteurs contribuant le plus souvent aux accidents (ex. : "Contributing Factor 1").

In [21]:
contributing_factors = (
    df_cleaned
    .group_by("contributingfactor1")
    .agg(pl.count("contributingfactor1").alias("Count"))
    .sort("Count", descending=True)
)
contributing_factors

contributingfactor1,Count
str,u32
"""HUMAN""",2097323
"""ENVMT""",52015
"""VEHICLE""",14622


## Facteurs multiples : Examiner les interactions entre plusieurs facteurs.

In [22]:
factor_interactions = (
    df_cleaned
    .group_by(["contributingfactor1", "contributingfactor2"])
    .agg(pl.count("contributingfactor1").alias("Count"))
    .sort("Count", descending=True)
)
factor_interactions

contributingfactor1,contributingfactor2,Count
str,str,u32
"""HUMAN""","""HUMAN""",1878562
"""HUMAN""","""ENVMT""",189076
"""ENVMT""","""HUMAN""",47208
"""HUMAN""","""VEHICLE""",29685
"""VEHICLE""","""HUMAN""",8984
"""ENVMT""","""ENVMT""",4512
"""VEHICLE""","""VEHICLE""",3631
"""VEHICLE""","""ENVMT""",2007
"""ENVMT""","""VEHICLE""",295


## État d'enregistrement : Comparer le nombre d'accidents par État.

In [23]:
state_distribution = (
    df_cleaned
    .group_by("stateofregistration")
    .agg(pl.count("stateofregistration").alias("Count"))
    .sort("Count", descending=True)
)

state_distribution

stateofregistration,Count
str,u32
"""NY""",2163929
"""OK""",6
"""PA""",5
"""NJ""",4
"""NM""",3
…,…
"""TX""",1
"""GL""",1
"""NH""",1
"""MI""",1


## Direction de déplacement : Étudier si certains trajets sont plus risqués.

In [24]:
direction_travel = (
    df_cleaned
    .group_by("directionoftravel")
    .agg(pl.count("directionoftravel").alias("Count"))
    .sort("Count", descending=True)
)
direction_travel

directionoftravel,Count
str,u32
"""East""",483670
"""West""",475499
"""North""",470411
"""South""",465968
"""Unknown""",85477
"""Northeast""",47618
"""Southwest""",45329
"""Northwest""",45083
"""Southeast""",44621
"""Not Applicable""",284


## Âge des véhicules : Étudier la corrélation entre l'année des véhicules et les accidents.

In [25]:
df_clean = df_cleaned.with_columns(
    (2025 - pl.col("vehicleyear")).alias("Vehicle Age")
)

vehicle_age = (
    df_clean
    .group_by("vehicleyear")
    .agg(pl.count("Vehicle Age").alias("Count"))
    .sort("Count", descending=True)
)

vehicle_age

vehicleyear,Count
i64,u32
2018,183457
2017,181162
2019,169914
2016,157709
2015,150865
…,…
1935,2
1938,2
1947,2
1917,1


## Carburant utilisé : Examiner si certains types de carburants sont liés à des accidents.

In [26]:
fuel_analysis = (
    df_cleaned
    .group_by("fueltype")
    .agg(pl.count("fueltype").alias("Count"))
    .sort("Count", descending=True)
)
fuel_analysis

fueltype,Count
str,u32
"""Gas""",2065076
"""Diesel""",85105
"""Electric""",8123
"""Compressed Natural Gas""",1860
"""None""",1826
"""Flex""",1679
"""Propane""",147
"""Other""",144


## Cylindres du moteur : Vérifier si les caractéristiques du moteur influencent les collisions.

In [27]:
engine_analysis = (
    df_cleaned
    .group_by("enginecylinders")
    .agg(pl.count("enginecylinders").alias("Count"))
    .sort("Count", descending=True)
)
engine_analysis

enginecylinders,Count
i64,u32
4,1136875
6,732068
8,248036
5,14550
0,8666
…,…
7,7
9,4
11,2
14,2


## Actions avant l'accident : Étudier les comportements des conducteurs avant l'incident.

In [28]:
action_analysis = (
    df_cleaned
    .group_by("actionpriortoaccident")
    .agg(pl.count("actionpriortoaccident").alias("Count"))
    .sort("Count", descending=True)
)
action_analysis

actionpriortoaccident,Count
str,u32
"""Going Straight Ahead""",1090181
"""Parked""",224105
"""Making Left Turn""",169871
"""Stopped in Traffic""",168059
"""Slowing or Stopping""",137535
…,…
"""Avoiding Object in Roadway""",5276
"""Not Applicable""",957
"""Making Right Turn on Red""",650
"""Police Pursuit""",461


## Type d'événement : Identifier les événements les plus fréquents (collision, dérapage, etc.).

In [29]:
event_type_analysis = (
    df_cleaned
    .group_by("eventtype")
    .agg(pl.count("eventtype").alias("Count"))
    .sort("Count", descending=True)
)
event_type_analysis

eventtype,Count
str,u32
"""Not Applicable""",1314650
"""Not Entered""",695847
"""Other Motor Vehicle, Collision…",83055
"""Tree, Collision With Fixed Obj…",9361
"""Earth Embankment/Rock Cut/Ditc…",8840
…,…
"""Crash Cushion, Collision With …",165
"""Submersion, Non-Collision""",163
"""Median - End, Collision With F…",103
"""Ran Off Roadway Only, Non-Coll…",44


### Stockage de notre jeu de données après analyse dans un bucket minio

In [45]:
from minio import Minio # type: ignore
import pandas as pd # type: ignore


In [46]:
# Initialisation de MinIO
MINIO_ACCESS_KEY = "miniouser"
MINIO_SECRET_KEY = "miniopassword"
MINIO_ENDPOINT_URL = "http://localhost:9000"
BUCKET_NAME = "motor-vehicle-data-bucket"

In [47]:
storage_options={
   'key': MINIO_ACCESS_KEY,
   'secret': MINIO_SECRET_KEY,
   'endpoint_url': MINIO_ENDPOINT_URL,
}

#### Initialisons le client minio

In [48]:
client = Minio(
    "localhost:9000",
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    secure=False  # Mettez True si vous utilisez HTTPS
)

#### Créer un bucket

In [49]:
bucket_name = "motor-vehicle-data-bucket"
if not client.bucket_exists(bucket_name):
    client.make_bucket(bucket_name)


#### Liste des buckets

In [50]:
client.list_buckets()

[Bucket('moto-vehicles-bucket'),
 Bucket('motor-vehicle-data-bucket'),
 Bucket('motor-vehicles-bucket')]

#### écrire le dataframe dans minio dans le bucket que je vien de créer

In [58]:
# Étape 1 : Sauvegarder localement en format Parquet
local_file = "df_cleaned_minio.parquet"
df_cleaned.write_parquet(local_file, compression="snappy")


In [59]:
# Étape 2 : Vérifier si le bucket existe et le créer si nécessaire
if not client.bucket_exists(BUCKET_NAME):
    client.make_bucket(BUCKET_NAME)

In [62]:
# Étape 3 : Upload du fichier dans le bucket
object_name = "raw-data/df_cleaned_minio.parquet"  # Chemin de stockage dans le bucket
with open(local_file, "rb") as f:
    client.put_object(
        BUCKET_NAME,
        object_name,
        data=f,
        length=-1,
        part_size=10 * 1024 * 1024,  # Taille de chaque partie : 10MB
    )

print(f"Le fichier a été sauvegardé sur le bucket '{BUCKET_NAME}' sous '{object_name}'.")


Le fichier a été sauvegardé sur le bucket 'motor-vehicle-data-bucket' sous 'raw-data/df_cleaned_minio.parquet'.


In [69]:
OBJECT_NAME = "raw-data/df_cleaned_minio.parquet"

In [70]:
# Étape 1 : Téléchargement du fichier Parquet localement
local_file = "df_cleaned_downloaded.parquet"
client.fget_object(BUCKET_NAME, OBJECT_NAME, local_file)

<minio.datatypes.Object at 0x207376e5ca0>

In [68]:
# Étape 2 : Lecture du fichier Parquet avec Polars
df_loaded = pl.read_parquet(local_file)

df_loaded.head()

year,casevehicleid,vehiclebodytype,registrationclass,actionpriortoaccident,truckbustypeaxles,directionoftravel,fueltype,vehicleyear,stateofregistration,numberofoccupants,enginecylinders,vehiclemake,contributingfactor1,contributingfactor1description,contributingfactor2,contributingfactor2description,eventtype,partialvin
i64,i64,str,str,str,str,str,str,i64,str,i64,i64,str,str,str,str,str,str,str
2019,16400907,"""4 DOOR SEDAN""","""PASSENGER OR SUBURBAN""","""Backing""","""Not Entered""","""South""","""Gas""",2008,"""NY""",1,4,"""MAZDA""","""HUMAN""","""Backing Unsafely""","""HUMAN""","""Not Applicable""","""Not Applicable""","""1YVHP80C785M18961"""
2019,16400910,"""SUBURBAN""","""PASSENGER OR SUBURBAN""","""Making Left Turn""","""Not Entered""","""North""","""Gas""",2017,"""NY""",1,6,"""CHRYS""","""HUMAN""","""Not Applicable""","""HUMAN""","""Not Applicable""","""Not Applicable""","""2C4RC1BGXHR832560"""
2019,16400912,"""SUBURBAN""","""PASSENGER OR SUBURBAN""","""Going Straight Ahead""","""Not Entered""","""East""","""Gas""",2012,"""NY""",2,8,"""FORD""","""ENVMT""","""Pavement Slippery""","""VEHICLE""","""Oversized Vehicle""","""Other*, Non-Collision""","""1FBNE3BL9CDA77581"""
2019,16400911,"""4 DOOR SEDAN""","""PASSENGER OR SUBURBAN""","""Going Straight Ahead""","""Not Entered""","""West""","""Gas""",2015,"""NY""",2,4,"""CHEVR""","""HUMAN""","""Unsafe Speed""","""ENVMT""","""Pavement Slippery""","""Not Applicable""","""1G11D5SL4FU120214"""
2019,16400915,"""PICKUP TRUCK""","""PASSENGER OR SUBURBAN""","""Stopped in Traffic""","""Not Entered""","""East""","""Gas""",2001,"""NY""",1,8,"""CHEVR""","""HUMAN""","""Not Applicable""","""HUMAN""","""Not Applicable""","""Not Applicable""","""1GCHC23U41F108012"""
