# Ingesting

In [1]:
import chardet
import os
import time
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
def get_file_encoding(path):
    file = open(path, "rb").read()
    encoding = chardet.detect(file)["encoding"]
    confidence = chardet.detect(file)["confidence"]
    print(encoding)
    print(confidence)
    return encoding

def extract_characteristics(start=2005, end=2021):
    columns=["Num_Acc", "an", "mois", "jour", "hrmn", "lum", "agg", "int", "atm", "col", "com", "adr", "gps", "lat", "long", "dep"]
    df = pd.DataFrame(columns=columns)
    for year in range(start, end+1):
        for file in os.listdir(data_dir):
            if file.endswith(".csv") and ("caracteristique" in file) and (str(year) in file):
                print(f"Ingesting file <{file}>...")
                path = f"{data_dir}/{file}"
                temp = pd.read_csv(path,
                                   sep="\s\\t+|;|:,",
                                   columns=columns,
                                   encoding="ISO-8859-1",
                                   on_bad_lines="skip"
                                  )
                #print(f"Table {table} has {len(temp.columns)} columns for year {year}:\n{temp.columns}")
                temp = temp.drop(["dep", "id_vehicule", "num_veh"], axis=1, errors="ignore")

def stack_csvs(data_dir, tables, n_columns):
    dfs = {}
    for table in tables:
        print(table)
        df = pd.DataFrame()
        for year in range(2005, 2022):
            for file in os.listdir(data_dir):
                if file.endswith(".csv") and (table in file) and (str(year) in file):
                    print(f"\nIngesting file <{file}>...")
                    path = f"{data_dir}/{file}"
                    # encoding = get_file_encoding(path)
                    temp = pd.read_csv(path,
                                       sep=",",
                                       encoding="ISO-8859-1",
                                       on_bad_lines="skip"
                                      )
                    print(df.columns)
                    #print(f"Table {table} has {len(temp.columns)} columns for year {year}:\n{temp.columns}")
                    #temp = temp.drop(["dep", "id_vehicule", "num_veh"], axis=1, errors="ignore")
                    if (table == "caracteristiques") and (year in [2019, 2020, 2021]):
                        temp = pd.read_csv(path,
                                         sep=";", 
                                         encoding="ISO-8859-1",
                                         on_bad_lines="skip")
                        print(temp.columns)
                        temp["gps"]= np.nan
                        print(temp.columns)
                        cols_order = ["Num_Acc", "an", "mois", "jour", "hrmn", "lum", "agg", "int", "atm", 
                                      "col", "com", "adr", "gps", "lat", "long", "dep"]
                        print(temp.columns)
                        temp = temp[cols_order]
                        print(temp.columns)

                    elif (table == "lieux") and (year not in [2019, 2020, 2021]):
                        temp = pd.read_csv(path,
                                         sep=",", 
                                         encoding="ISO-8859-1",
                                         on_bad_lines="skip")
                        temp["vma"] = np.nan
                        cols_order = ["Num_Acc", "catr", "voie", "v1", "v2", "circ", "nbv", "vosp", "prof",
                                      "pr", "pr1", "plan", "lartpc", "larrout", "surf", "infra", "situ", "vma"]
                        temp = temp[cols_order]
                    
                    elif (table == "usagers") and (year not in [2019, 2020, 2021]):
                        temp = pd.read_csv(path,
                                         sep=",", 
                                         encoding="ISO-8859-1",
                                         on_bad_lines="skip")
                        temp[["secu1", "secu2", "secu3", "id_vehicule"]] = np.nan
                        cols_order = ["Num_Acc", "id_vehicule", "num_veh", "place", "catu", "grav", "sexe",
                                      "an_nais", "trajet", "secu", "secu1", "secu2", "secu3", "locp", "actp", "etatp"]
                        temp = temp[cols_order]

                    elif (table == "usagers") and (year in [2019, 2020, 2021]):
                        temp = pd.read_csv(path,
                                         sep=";", 
                                         encoding="ISO-8859-1",
                                         on_bad_lines="skip")
                        temp[["secu"]] = np.nan
                        cols_order = ["Num_Acc", "id_vehicule", "num_veh", "place", "catu", "grav", "sexe",
                                      "an_nais", "trajet", "secu", "secu1", "secu2", "secu3", "locp", "actp", "etatp"]
                        temp = temp[cols_order]

                    elif (table == "vehicules") and (year not in [2019, 2020, 2021]):
                        temp = pd.read_csv(path,
                                         sep=",", 
                                         encoding="ISO-8859-1",
                                         on_bad_lines="skip")
                        temp[["id_vehicule", "motor"]] = np.nan
                        cols_order = ["Num_Acc", "id_vehicule", "num_veh", "senc", "catv", "obs", "obsm",
                                      "choc", "manv", "motor", "occutc"]
                        temp = temp[cols_order]

                    elif (table == "vehicules") and (year in [2019, 2020, 2021]):
                        temp = pd.read_csv(path,
                                         sep=";", 
                                         encoding="ISO-8859-1",
                                         on_bad_lines="skip")
                        cols_order = ["Num_Acc", "id_vehicule", "num_veh", "senc", "catv", "obs", "obsm",
                                      "choc", "manv", "motor", "occutc"]
                        temp = temp[cols_order]
                    
                    elif len(temp.columns) == n_columns[table]:
                        pass
                    else:
                        temp = pd.read_csv(path,
                                           sep=";", 
                                           encoding="ISO-8859-1",
                                           on_bad_lines="skip"
                                          )
                        if len(temp.columns) != n_columns[table]:
                            print(f"Year: {year}")
                            temp = pd.read_csv(path,
                                               sep="\t",
                                               encoding="ISO-8859-1",
                                               on_bad_lines="skip",
                                               engine="python"
                                              )

                    df = pd.concat([df, temp], axis=0)
                    print(df.columns)
                    print(f"Concatenated file <{file}> with {len(df.columns)} columns.")#:\n{df.columns}...")

        print(f"Saving dataframe for {table} with columns:\n{df.columns}")
        print("____")
        dfs[table] = df
    return dfs
                
def ingest_data():
    characteristics, locations, users, vehicles = stack_csvs().values()
    return 

def common_columns(dataframes):
    if len(dataframes) == 0:
        raise ValueError("At least one dataframe must be provided.")

    # Get the set of column names for the first dataframe
    common_cols = set(dataframes[0].columns)

    # Iterate through the remaining dataframes and find common columns
    for df in dataframes[1:]:
        common_cols.intersection_update(df.columns)

    return list(common_cols)


In [3]:
data_dir = "./data"
tables = ["caracteristiques", "lieux", "usagers", "vehicules"]
n_columns = {"caracteristiques": 16,
             "lieux": 18,
             "usagers": 13,
             "vehicules": 9,
            }

characteristics, locations, users, vehicles = stack_csvs(data_dir, tables, n_columns).values()

caracteristiques

Ingesting file <caracteristiques_2005.csv>...
Index([], dtype='object')
Index(['Num_Acc', 'an', 'mois', 'jour', 'hrmn', 'lum', 'agg', 'int', 'atm',
       'col', 'com', 'adr', 'gps', 'lat', 'long', 'dep'],
      dtype='object')
Concatenated file <caracteristiques_2005.csv> with 16 columns.

Ingesting file <caracteristiques_2006.csv>...
Index(['Num_Acc', 'an', 'mois', 'jour', 'hrmn', 'lum', 'agg', 'int', 'atm',
       'col', 'com', 'adr', 'gps', 'lat', 'long', 'dep'],
      dtype='object')
Index(['Num_Acc', 'an', 'mois', 'jour', 'hrmn', 'lum', 'agg', 'int', 'atm',
       'col', 'com', 'adr', 'gps', 'lat', 'long', 'dep'],
      dtype='object')
Concatenated file <caracteristiques_2006.csv> with 16 columns.

Ingesting file <caracteristiques_2007.csv>...
Index(['Num_Acc', 'an', 'mois', 'jour', 'hrmn', 'lum', 'agg', 'int', 'atm',
       'col', 'com', 'adr', 'gps', 'lat', 'long', 'dep'],
      dtype='object')
Index(['Num_Acc', 'an', 'mois', 'jour', 'hrmn', 'lum', 'agg', 'int

  temp = pd.read_csv(path,
  temp = pd.read_csv(path,


Index(['Num_Acc', 'catr', 'voie', 'v1', 'v2', 'circ', 'nbv', 'vosp', 'prof',
       'pr', 'pr1', 'plan', 'lartpc', 'larrout', 'surf', 'infra', 'situ',
       'vma'],
      dtype='object')
Index(['Num_Acc', 'catr', 'voie', 'v1', 'v2', 'circ', 'nbv', 'vosp', 'prof',
       'pr', 'pr1', 'plan', 'lartpc', 'larrout', 'surf', 'infra', 'situ',
       'vma'],
      dtype='object')
Concatenated file <lieux_2016.csv> with 18 columns.

Ingesting file <lieux-2017.csv>...


  temp = pd.read_csv(path,
  temp = pd.read_csv(path,


Index(['Num_Acc', 'catr', 'voie', 'v1', 'v2', 'circ', 'nbv', 'vosp', 'prof',
       'pr', 'pr1', 'plan', 'lartpc', 'larrout', 'surf', 'infra', 'situ',
       'vma'],
      dtype='object')
Index(['Num_Acc', 'catr', 'voie', 'v1', 'v2', 'circ', 'nbv', 'vosp', 'prof',
       'pr', 'pr1', 'plan', 'lartpc', 'larrout', 'surf', 'infra', 'situ',
       'vma'],
      dtype='object')
Concatenated file <lieux-2017.csv> with 18 columns.

Ingesting file <lieux-2018.csv>...


  temp = pd.read_csv(path,
  temp = pd.read_csv(path,


Index(['Num_Acc', 'catr', 'voie', 'v1', 'v2', 'circ', 'nbv', 'vosp', 'prof',
       'pr', 'pr1', 'plan', 'lartpc', 'larrout', 'surf', 'infra', 'situ',
       'vma'],
      dtype='object')
Index(['Num_Acc', 'catr', 'voie', 'v1', 'v2', 'circ', 'nbv', 'vosp', 'prof',
       'pr', 'pr1', 'plan', 'lartpc', 'larrout', 'surf', 'infra', 'situ',
       'vma'],
      dtype='object')
Concatenated file <lieux-2018.csv> with 18 columns.

Ingesting file <lieux-2019.csv>...
Index(['Num_Acc', 'catr', 'voie', 'v1', 'v2', 'circ', 'nbv', 'vosp', 'prof',
       'pr', 'pr1', 'plan', 'lartpc', 'larrout', 'surf', 'infra', 'situ',
       'vma'],
      dtype='object')
Index(['Num_Acc', 'catr', 'voie', 'v1', 'v2', 'circ', 'nbv', 'vosp', 'prof',
       'pr', 'pr1', 'plan', 'lartpc', 'larrout', 'surf', 'infra', 'situ',
       'vma'],
      dtype='object')
Concatenated file <lieux-2019.csv> with 18 columns.

Ingesting file <lieux-2020.csv>...
Index(['Num_Acc', 'catr', 'voie', 'v1', 'v2', 'circ', 'nbv', 'vosp', 'pr

## II - JOIN

In [4]:
def common_columns(dataframes):
    if len(dataframes) == 0:
        raise ValueError("At least one dataframe must be provided.")

    # Get the set of column names for the first dataframe
    common_cols = set(dataframes[0].columns)

    # Iterate through the remaining dataframes and find common columns
    for df in dataframes[1:]:
        common_cols.intersection_update(df.columns)

    return list(common_cols)

In [5]:
start = time.time()

characteristics.to_csv("./data/characteristics.csv", index=False)
locations.to_csv("./data/locations.csv", index=False)
users.to_csv("./data/users.csv", index=False)
vehicles.to_csv("./data/vehicles.csv", index=False)

end = time.time()

#Subtract Start Time from The End Time
total_time = end - start
print(f"Wrote data in {round(total_time, 2)}s\nDataset has {len(users):,} rows.")

Wrote data in 137.39s
Dataset has 2,509,620 rows.


In [6]:
dfs = [characteristics, locations, users, vehicles]
columns = common_columns(dataframes=dfs)
columns

['Num_Acc']

In [7]:
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print(f"Last run at {current_time}")

Last run at 01:46:59
