# Nouveau Feature Engineering

## Packages

In [108]:
import pandas as pd

# Réflexion

Afin de pouvoir construire nos différents modèles, il est nécessaire de traiter les données et de s'assurer à la fois que les colonnes conservées sont exploitables au sein du jeu de données et que le jeu de test dispose d'assez de valeurs pertinentes. Ainsi, il est nécessaire de traiter toutes les années.

# Fonctions utiles

In [109]:
def get_info(data_frame: pd.DataFrame) -> dict:
    """
    Summarise every column of a dataframe
    :param data_frame: dataframe to inspect
    :return: dict keyed by column name; each entry holds the distinct values
        ('possible_values'), the number of missing entries ('na_count') and the
        share of missing entries in percent ('rep_na')
    """
    def summarise(column: pd.Series) -> dict:
        # Missing entries are counted once and reused for the percentage
        missing = column.isna().sum()
        return {
            'possible_values': column.unique().tolist(),
            'na_count': missing,
            'rep_na': missing / data_frame.shape[0] * 100,
        }

    return {name: summarise(data_frame[name]) for name in data_frame.columns}

def display_info(data_frame_test: pd.DataFrame, data_frame_train: pd.DataFrame) -> None:
    """
    Print, for every column of the test set, its distinct values and its share
    of missing values, followed by the same figures for the train set.
    Columns are listed by decreasing share of missing values in the test set.
    :param data_frame_test: test dataframe; it drives which columns are listed
    :param data_frame_train: train dataframe the test figures are compared with
    """
    dict_info_test = get_info(data_frame_test)
    dict_info_train = get_info(data_frame_train)

    # Only the test side needs ordering; the train side is looked up by name
    sorted_info_test = sorted(dict_info_test.items(), key=lambda x: x[1]['rep_na'], reverse=True)

    for col, info in sorted_info_test:
        train_info = dict_info_train.get(col)
        print('*******')
        print('TEST')
        print(f"- {col}: {info['possible_values']}.")
        print(f"- {col}: {len(info['possible_values'])} valeurs uniques.")
        print(f"- {col}: {info['rep_na']:.2f}%.")
        print('TRAIN')
        if train_info is None:
            # The column only exists in the test set: report it instead of
            # failing on a None lookup
            print(f"- {col}: colonne absente du jeu d'entraînement.")
        else:
            print(f"- {col}: {train_info['rep_na']:.2f}%.")
            print(f"- {col}: {len(train_info['possible_values'])} valeurs uniques.")
            print(f"- {col}: {train_info['possible_values']}.")
        print('')
        
def convert_to_int(time_str: str) -> int:
    """
    Turn a textual time into an integer so that it can be compared with intervals
    :param time_str: 'XX:XX', 'X:XX', 'XXX', 'X', X, XXX, ...
    :return: hours * 100 + minutes for 'H:MM' strings, int(time_str) for other
        strings; anything that is not a string is returned unchanged
    """
    # Non-string inputs (already numeric, missing, ...) pass through untouched
    if not isinstance(time_str, str):
        return time_str
    if ':' not in time_str:
        return int(time_str)
    hours, minutes = (int(part) for part in time_str.split(':'))
    return hours * 100 + minutes
    
def generate_intervals(start: int, end: int, step: int) -> list[tuple[int, int]]:
    """
    Generate consecutive intervals of width step, starting at start
    :param start: lower bound of the first interval
    :param end: exclusive limit for the lower bounds (same role as in range())
    :param step: width of each interval
    :return: list of (lower, lower + step) tuples; the upper bound of the last
        tuple can exceed end when (end - start) is not a multiple of step
    """
    return [(lower, lower + step) for lower in range(start, end, step)]

def map_to_simple_value(value: int, list_cat: list[tuple[int, int]], simple_values=None) -> "int | None":
    """
    Map a value to the simple value of the interval that contains it
    :param value: value to map
    :param list_cat: list of the intervals, each one a (lower, upper) pair with
        the lower bound included and the upper bound excluded
    :param simple_values: optional sequence giving the mapped value of each
        interval, aligned with list_cat; when omitted the position of the
        interval in list_cat is returned
    :return: mapped value, or None when no interval contains value
    """
    for i, interval in enumerate(list_cat):
        if interval[0] <= value < interval[1]:
            # Previously this read an undefined module-level name; the interval
            # position is the default mapping
            return i if simple_values is None else simple_values[i]
    return None

def convert_float_to_int(data_frame: pd.DataFrame, col_name: str) -> None:
    """
    Cast one column of the dataframe to int, in place
    :param data_frame: dataframe holding the column; it is modified in place
    :param col_name: name of the column to cast
    """
    as_integers = data_frame[col_name].astype(int)
    data_frame[col_name] = as_integers

# 2012/2013/2014/2015 

In [112]:
def feature_treatment_2012_to_2018(list_df: list[pd.DataFrame]) -> None:
    """
    Clean each dataframe of the list in place: drop unused columns, recode a
    set of categorical columns, cap nbv and bucket hrmn by hour.
    :param list_df: dataframes to clean; every one is modified in place
    """
    # One slot per hundred of the HHMM value: (0, 100), (100, 200), ..., (2300, 2400)
    hour_slots = generate_intervals(0, 2400, 100)
    slot_ids = list(range(len(hour_slots)))

    def to_hour_slot(hhmm: int) -> int:
        """Return the id of the slot containing hhmm, or None when no slot does."""
        for position, (lower, upper) in enumerate(hour_slots):
            if lower <= hhmm < upper:
                return slot_ids[position]
        return None

    for df in list_df:
        # Columns that are not kept
        df.drop(columns=['adr', 'locp', 'num_veh', 'senc'], inplace=True)

        # place: missing -> 0
        df['place'] = df['place'].fillna(0)

        # lartpc / larrout: values of 100 or more are divided by 100 (the
        # original note reads this as a cm -> m conversion), others are kept
        for width_col in ('lartpc', 'larrout'):
            df.loc[df[width_col] >= 100, width_col] /= 100

        # nbv: 4 or more -> 4
        df.loc[df['nbv'] >= 4, 'nbv'] = 4

        # Codes where 0 is folded into 1
        for code_col in ('prof', 'plan', 'surf', 'situ', 'circ', 'etatp', 'manv', 'occutc', 'int', 'vosp'):
            df.loc[df[code_col] == 0, code_col] = 1

        # env1: 99 -> 2, then 3 -> 1
        for old_code, new_code in ((99, 2), (3, 1)):
            df.loc[df['env1'] == old_code, 'env1'] = new_code

        # trajet: 0 -> 9
        df.loc[df['trajet'] == 0, 'trajet'] = 9

        # sexe / agg: 2 -> 0
        for flag_col in ('sexe', 'agg'):
            df.loc[df[flag_col] == 2, flag_col] = 0

        # nbv: zero or negative -> missing
        df.loc[df['nbv'] <= 0, 'nbv'] = pd.NA

        # an is overwritten with the content of Four_Digits
        df['an'] = df['Four_Digits']

        # hrmn: parse to an HHMM integer, then replace it by its hour slot id
        df['hrmn'] = df['hrmn'].apply(convert_to_int)
        convert_float_to_int(df, 'hrmn')
        df['hrmn'] = df['hrmn'].apply(to_hour_slot)

        # Four_Digits is no longer needed once copied into an
        df.drop(columns=['Four_Digits'], inplace=True)

In [113]:
# Combined 2012-2015 extract: load the train and test splits, then clean both in place
train_data_2012_2015, test_data_2012_2015 = (
    pd.read_csv(csv_path, sep=',', low_memory=False)
    for csv_path in (
        'Filtered_Data/TRAIN/DRAFT/train_data_filtered_2012_13_14_15.csv',
        'Filtered_Data/TEST/DRAFT/test_data_filtered_2012_13_14_15.csv',
    )
)

dfs = [train_data_2012_2015, test_data_2012_2015]
feature_treatment_2012_to_2018(dfs)

In [147]:
# Per-year extracts: each year is loaded (train, then test) and cleaned in place

# 2012
train_data_2012, test_data_2012 = (
    pd.read_csv(csv_path, sep=',', low_memory=False)
    for csv_path in (
        'Filtered_Data/TRAIN/DRAFT/train_data_filtered_2012.csv',
        'Filtered_Data/TEST/DRAFT/test_data_filtered_2012.csv',
    )
)

dfs = [train_data_2012, test_data_2012]
feature_treatment_2012_to_2018(dfs)

# 2013
train_data_2013, test_data_2013 = (
    pd.read_csv(csv_path, sep=',', low_memory=False)
    for csv_path in (
        'Filtered_Data/TRAIN/DRAFT/train_data_filtered_2013.csv',
        'Filtered_Data/TEST/DRAFT/test_data_filtered_2013.csv',
    )
)

dfs = [train_data_2013, test_data_2013]
feature_treatment_2012_to_2018(dfs)

# 2014
train_data_2014, test_data_2014 = (
    pd.read_csv(csv_path, sep=',', low_memory=False)
    for csv_path in (
        'Filtered_Data/TRAIN/DRAFT/train_data_filtered_2014.csv',
        'Filtered_Data/TEST/DRAFT/test_data_filtered_2014.csv',
    )
)

dfs = [train_data_2014, test_data_2014]
feature_treatment_2012_to_2018(dfs)

# 2015
train_data_2015, test_data_2015 = (
    pd.read_csv(csv_path, sep=',', low_memory=False)
    for csv_path in (
        'Filtered_Data/TRAIN/DRAFT/train_data_filtered_2015.csv',
        'Filtered_Data/TEST/DRAFT/test_data_filtered_2015.csv',
    )
)

dfs = [train_data_2015, test_data_2015]
feature_treatment_2012_to_2018(dfs)

# 2016

In [116]:
def feature_treatment_2016(list_df: list[pd.DataFrame]) -> None:
    """
    Clean each 2016 dataframe of the list in place: drop unused columns,
    recode a set of categorical columns, cap nbv and bucket hrmn by hour.
    :param list_df: dataframes to clean; every one is modified in place
    """
    # One slot per hundred of the HHMM value: (0, 100), (100, 200), ..., (2300, 2400)
    hour_slots = generate_intervals(0, 2400, 100)
    slot_ids = list(range(len(hour_slots)))

    def to_hour_slot(hhmm: int) -> int:
        """Return the id of the slot containing hhmm, or None when no slot does."""
        for position, (lower, upper) in enumerate(hour_slots):
            if lower <= hhmm < upper:
                return slot_ids[position]
        return None

    for df in list_df:
        # Columns that are not kept
        df.drop(columns=['adr', 'locp', 'num_veh'], inplace=True)

        # place: missing -> 0
        df['place'] = df['place'].fillna(0)

        # lartpc / larrout: values of 100 or more are divided by 100 (the
        # original note reads this as a cm -> m conversion), others are kept
        for width_col in ('lartpc', 'larrout'):
            df.loc[df[width_col] >= 100, width_col] /= 100

        # nbv: 4 or more -> 4
        df.loc[df['nbv'] >= 4, 'nbv'] = 4

        # Codes where 0 is folded into 1
        for code_col in ('prof', 'plan', 'surf', 'situ', 'circ', 'etatp', 'manv', 'occutc', 'int', 'vosp'):
            df.loc[df[code_col] == 0, code_col] = 1

        # env1: 99 -> 2, then 3 -> 1
        for old_code, new_code in ((99, 2), (3, 1)):
            df.loc[df['env1'] == old_code, 'env1'] = new_code

        # trajet: 0 -> 9
        df.loc[df['trajet'] == 0, 'trajet'] = 9

        # senc / circ: 0 -> missing
        # NOTE(review): circ zeros were already turned into 1 by the loop above,
        # so the circ mask selects nothing here - confirm which rule is intended
        for code_col in ('senc', 'circ'):
            df.loc[df[code_col] == 0, code_col] = pd.NA

        # nbv: zero or negative -> missing
        df.loc[df['nbv'] <= 0, 'nbv'] = pd.NA

        # sexe / agg: 2 -> 0
        for flag_col in ('sexe', 'agg'):
            df.loc[df[flag_col] == 2, flag_col] = 0

        # an is overwritten with the content of Four_Digits
        df['an'] = df['Four_Digits']

        # hrmn: parse to an HHMM integer, then replace it by its hour slot id
        df['hrmn'] = df['hrmn'].apply(convert_to_int)
        convert_float_to_int(df, 'hrmn')
        df['hrmn'] = df['hrmn'].apply(to_hour_slot)

        # Four_Digits is no longer needed once copied into an
        df.drop(columns=['Four_Digits'], inplace=True)

In [117]:
# 2016 extract: load the train and test splits, then clean both in place
train_data_2016, test_data_2016 = (
    pd.read_csv(csv_path, sep=',', low_memory=False)
    for csv_path in (
        'Filtered_Data/TRAIN/DRAFT/train_data_filtered_2016.csv',
        'Filtered_Data/TEST/DRAFT/test_data_filtered_2016.csv',
    )
)

dfs = [train_data_2016, test_data_2016]
feature_treatment_2016(dfs)

# 2017

In [118]:
def feature_treatment_2017(list_df: list[pd.DataFrame]) -> None:
    """
    Clean each 2017 dataframe of the list in place: drop unused columns,
    recode a set of categorical columns, cap nbv, bucket hrmn by hour and
    rescale lat / long.
    :param list_df: dataframes to clean; every one is modified in place
    """
    # One slot per hundred of the HHMM value: (0, 100), (100, 200), ..., (2300, 2400)
    hour_slots = generate_intervals(0, 2400, 100)
    slot_ids = list(range(len(hour_slots)))

    def to_hour_slot(hhmm: int) -> int:
        """Return the id of the slot containing hhmm, or None when no slot does."""
        for position, (lower, upper) in enumerate(hour_slots):
            if lower <= hhmm < upper:
                return slot_ids[position]
        return None

    for df in list_df:
        # Columns that are not kept
        df.drop(columns=['adr', 'num_veh', 'actp', 'locp', 'etatp'], inplace=True)

        # place: missing -> 0
        df['place'] = df['place'].fillna(0)

        # lartpc / larrout: values of 100 or more are divided by 100 (the
        # original note reads this as a cm -> m conversion), others are kept
        for width_col in ('lartpc', 'larrout'):
            df.loc[df[width_col] >= 100, width_col] /= 100

        # nbv: 4 or more -> 4, zero or negative -> missing
        df.loc[df['nbv'] >= 4, 'nbv'] = 4
        df.loc[df['nbv'] <= 0, 'nbv'] = pd.NA

        # Codes where 0 is folded into 1
        for code_col in ('prof', 'plan', 'surf', 'situ', 'circ', 'manv', 'occutc', 'int', 'vosp'):
            df.loc[df[code_col] == 0, code_col] = 1

        # env1: 99 -> 2, then 3 -> 1
        for old_code, new_code in ((99, 2), (3, 1)):
            df.loc[df['env1'] == old_code, 'env1'] = new_code

        # trajet: 0 -> 9
        df.loc[df['trajet'] == 0, 'trajet'] = 9

        # senc / circ: 0 -> missing
        # NOTE(review): circ zeros were already turned into 1 by the loop above,
        # so the circ mask selects nothing here - confirm which rule is intended
        for code_col in ('senc', 'circ'):
            df.loc[df[code_col] == 0, code_col] = pd.NA

        # sexe / agg: 2 -> 0
        for flag_col in ('sexe', 'agg'):
            df.loc[df[flag_col] == 2, flag_col] = 0

        # an is overwritten with the content of Four_Digits
        df['an'] = df['Four_Digits']

        # hrmn: parse to an HHMM integer, then replace it by its hour slot id
        df['hrmn'] = df['hrmn'].apply(convert_to_int)
        convert_float_to_int(df, 'hrmn')
        df['hrmn'] = df['hrmn'].apply(to_hour_slot)

        # lat / long: cast to float, then divide the non-missing values by 100000
        for coord_col in ('lat', 'long'):
            df[coord_col] = df[coord_col].astype(float)
        for coord_col in ('lat', 'long'):
            df.loc[df[coord_col].notna(), coord_col] /= 100000

        # Four_Digits is no longer needed once copied into an
        df.drop(columns=['Four_Digits'], inplace=True)

In [119]:
# 2017 extract: load the train and test splits, then clean both in place
train_data_2017, test_data_2017 = (
    pd.read_csv(csv_path, sep=',', low_memory=False)
    for csv_path in (
        'Filtered_Data/TRAIN/DRAFT/train_data_filtered_2017.csv',
        'Filtered_Data/TEST/DRAFT/test_data_filtered_2017.csv',
    )
)

dfs = [train_data_2017, test_data_2017]
feature_treatment_2017(dfs)

# 2018

In [121]:
def feature_treatment_2018(list_df: list[pd.DataFrame]) -> None:
    """
    Clean each 2018 dataframe of the list in place: drop unused columns,
    recode a set of categorical columns, cap nbv and bucket hrmn by hour.
    :param list_df: dataframes to clean; every one is modified in place
    """
    # One slot per hundred of the HHMM value: (0, 100), (100, 200), ..., (2300, 2400)
    hour_slots = generate_intervals(0, 2400, 100)
    slot_ids = list(range(len(hour_slots)))

    def to_hour_slot(hhmm: int) -> int:
        """Return the id of the slot containing hhmm, or None when no slot does."""
        for position, (lower, upper) in enumerate(hour_slots):
            if lower <= hhmm < upper:
                return slot_ids[position]
        return None

    for df in list_df:
        # Columns that are not kept
        df.drop(columns=['adr', 'num_veh', 'locp', 'etatp'], inplace=True)

        # place: missing -> 0
        df['place'] = df['place'].fillna(0)

        # lartpc / larrout: values of 100 or more are divided by 100 (the
        # original note reads this as a cm -> m conversion), others are kept
        for width_col in ('lartpc', 'larrout'):
            df.loc[df[width_col] >= 100, width_col] /= 100

        # nbv: 4 or more -> 4, zero or negative -> missing
        df.loc[df['nbv'] >= 4, 'nbv'] = 4
        df.loc[df['nbv'] <= 0, 'nbv'] = pd.NA

        # Codes where 0 is folded into 1
        for code_col in ('prof', 'plan', 'surf', 'situ', 'circ', 'manv', 'occutc', 'int', 'vosp'):
            df.loc[df[code_col] == 0, code_col] = 1

        # env1: 99 -> 2, then 3 -> 1
        for old_code, new_code in ((99, 2), (3, 1)):
            df.loc[df['env1'] == old_code, 'env1'] = new_code

        # trajet: 0 -> 9
        df.loc[df['trajet'] == 0, 'trajet'] = 9

        # senc / circ: 0 -> missing
        # NOTE(review): circ zeros were already turned into 1 by the loop above,
        # so the circ mask selects nothing here - confirm which rule is intended
        for code_col in ('senc', 'circ'):
            df.loc[df[code_col] == 0, code_col] = pd.NA

        # sexe / agg: 2 -> 0
        for flag_col in ('sexe', 'agg'):
            df.loc[df[flag_col] == 2, flag_col] = 0

        # an is overwritten with the content of Four_Digits
        df['an'] = df['Four_Digits']

        # hrmn: parse to an HHMM integer, then replace it by its hour slot id
        df['hrmn'] = df['hrmn'].apply(convert_to_int)
        convert_float_to_int(df, 'hrmn')
        df['hrmn'] = df['hrmn'].apply(to_hour_slot)

        # Four_Digits is no longer needed once copied into an
        df.drop(columns=['Four_Digits'], inplace=True)

In [122]:
# 2018 extract: load the train and test splits, then clean both in place.
# The paths previously pointed at the 2016 files, so the frames named 2018
# (and the files later exported under that name) held 2016 data.
# NOTE(review): assumes the 2018 drafts follow the same naming as the other
# years - confirm the files exist.
train_data_2018 = pd.read_csv('Filtered_Data/TRAIN/DRAFT/train_data_filtered_2018.csv', sep=',', low_memory=False)
test_data_2018 = pd.read_csv('Filtered_Data/TEST/DRAFT/test_data_filtered_2018.csv', sep=',', low_memory=False)

dfs = [train_data_2018, test_data_2018]
feature_treatment_2018(dfs)

# 2019

In [123]:
def feature_treatment_2019(list_df: list[pd.DataFrame]) -> None:
    """
    Clean each 2019 dataframe of the list in place: drop unused columns, turn
    the 0 / -1 placeholder codes into missing values, normalise vma, rescale
    lat / long, cap nbv, strip parentheses from pr / pr1 and bucket hrmn by hour.
    :param list_df: dataframes to clean; every one is modified in place
    """
    # One slot per hundred of the HHMM value: (0, 100), (100, 200), ..., (2300, 2400)
    hour_slots = generate_intervals(0, 2400, 100)
    slot_ids = list(range(len(hour_slots)))

    def to_hour_slot(hhmm: int) -> int:
        """Return the id of the slot containing hhmm, or None when no slot does."""
        for position, (lower, upper) in enumerate(hour_slots):
            if lower <= hhmm < upper:
                return slot_ids[position]
        return None

    for df in list_df:
        # Columns that are not kept
        df.drop(columns=['adr', 'num_veh', 'id_vehicule_x', 'id_vehicule_y', 'voie', 'v1', 'locp'], inplace=True)

        # sexe / agg: 2 -> 0
        for flag_col in ('sexe', 'agg'):
            df.loc[df[flag_col] == 2, flag_col] = 0

        # place: missing -> 10
        df['place'] = df['place'].fillna(10)

        # 0 and -1 -> missing
        for code_col in ('trajet', 'actp', 'etatp', 'senc', 'manv', 'motor'):
            df[code_col] = df[code_col].replace({0: pd.NA, -1: pd.NA})

        # vma: explicit recoding table, 0 and -1 -> missing
        df['vma'] = df['vma'].replace({
            560: 50, 700: 70, 800: 80, 600: 60, -1: pd.NA, 5: 50, 3: 30,
            4: 40, 1: 10, 0: pd.NA, 2: 20, 12: 120, 6: 60,
        })

        # -1 -> missing
        for code_col in ('choc', 'obs', 'obsm', 'secu1', 'secu2', 'secu3', 'circ', 'atm',
                         'col', 'vosp', 'prof', 'plan', 'surf', 'infra', 'situ'):
            df[code_col] = df[code_col].replace({-1: pd.NA})

        # catv: 0 -> missing
        df['catv'] = df['catv'].replace({0: pd.NA})

        # lat / long: decimal comma -> point, cast to float, then divide the
        # non-missing values by 100. The comma is a literal, so regex is off.
        for coord_col in ('lat', 'long'):
            df[coord_col] = df[coord_col].str.replace(',', '.', regex=False).astype(float)
        for coord_col in ('lat', 'long'):
            df.loc[df[coord_col].notna(), coord_col] /= 100

        # nbv: 4 or more -> 4, zero or negative -> missing
        df.loc[df['nbv'] >= 4, 'nbv'] = 4
        df.loc[df['nbv'] <= 0, 'nbv'] = pd.NA

        # pr / pr1: remove parentheses, then '-1' -> missing.
        # regex=True is required: pandas >= 2.0 treats the pattern literally by
        # default, which leaves the parentheses in place and keeps values such
        # as '(-1)' from matching the '-1' replacement below.
        for pr_col in ('pr', 'pr1'):
            df[pr_col] = df[pr_col].str.replace(r'\(|\)', '', regex=True)
        for pr_col in ('pr', 'pr1'):
            df[pr_col] = df[pr_col].replace('-1', pd.NA)

        # an is overwritten with the content of Four_Digits
        df['an'] = df['Four_Digits']

        # hrmn: parse to an HHMM integer, then replace it by its hour slot id
        df['hrmn'] = df['hrmn'].apply(convert_to_int)
        convert_float_to_int(df, 'hrmn')
        df['hrmn'] = df['hrmn'].apply(to_hour_slot)

        # Four_Digits is no longer needed once copied into an
        df.drop(columns=['Four_Digits'], inplace=True)

In [124]:
# 2019 extract: load the train and test splits, then clean both in place
train_data_2019, test_data_2019 = (
    pd.read_csv(csv_path, sep=',', low_memory=False)
    for csv_path in (
        'Filtered_Data/TRAIN/DRAFT/train_data_filtered_2019.csv',
        'Filtered_Data/TEST/DRAFT/test_data_filtered_2019.csv',
    )
)

dfs = [train_data_2019, test_data_2019]
feature_treatment_2019(dfs)

# 2020 - 2021

In [131]:
def feature_treatment_2020_to_2021(list_df: list[pd.DataFrame]) -> None:
    """
    Clean each 2020-2021 dataframe of the list in place: drop unused columns,
    normalise vma and larrout, turn the 0 / -1 placeholder codes into missing
    values, bucket hrmn by hour, rescale lat / long, cap nbv and strip
    parentheses from pr / pr1.
    :param list_df: dataframes to clean; every one is modified in place
    """
    # One slot per hundred of the HHMM value: (0, 100), (100, 200), ..., (2300, 2400)
    hour_slots = generate_intervals(0, 2400, 100)
    slot_ids = list(range(len(hour_slots)))

    def to_hour_slot(hhmm: int) -> int:
        """Return the id of the slot containing hhmm, or None when no slot does."""
        for position, (lower, upper) in enumerate(hour_slots):
            if lower <= hhmm < upper:
                return slot_ids[position]
        return None

    for df in list_df:
        # Columns that are not kept
        df.drop(columns=['adr', 'num_veh', 'id_vehicule_x', 'id_vehicule_y', 'locp', 'voie', 'v1'], inplace=True)

        # vma: explicit recoding table, -1 and the listed odd codes -> missing
        df['vma'] = df['vma'].replace({
            -1: pd.NA, 1: 10, 2: 20, 5: 50, 3: 30, 6: 60, 300: 30, 7: 70,
            700: 70, 8: 80, 900: 90, 520: pd.NA, 901: pd.NA, 9: 90,
            501: pd.NA, 770: pd.NA, 12: 120,
        })

        # sexe / agg: 2 -> 0
        for flag_col in ('sexe', 'agg'):
            df.loc[df[flag_col] == 2, flag_col] = 0

        # an is overwritten with the content of Four_Digits
        df['an'] = df['Four_Digits']

        # larrout: decimal comma -> point, cast to float, then values of 10 or
        # more are divided by 10. The comma is a literal, so regex is off.
        df['larrout'] = df['larrout'].str.replace(',', '.', regex=False).astype(float)
        df.loc[df['larrout'] >= 10, 'larrout'] /= 10

        # -1 -> missing
        for code_col in ('place', 'sexe', 'obs', 'obsm', 'choc', 'lum', 'int', 'atm', 'col', 'circ',
                         'vosp', 'prof', 'plan', 'surf', 'infra', 'situ', 'secu1', 'secu2', 'secu3'):
            df[code_col] = df[code_col].replace({-1: pd.NA})

        # 0 and -1 -> missing
        for code_col in ('trajet', 'larrout', 'actp', 'etatp', 'senc', 'catv', 'manv', 'com', 'dep', 'motor'):
            df[code_col] = df[code_col].replace({0: pd.NA, -1: pd.NA})

        # hrmn: 0 / -1 -> missing, parse to an HHMM integer, then replace it by
        # its hour slot id
        # NOTE(review): any value actually replaced by pd.NA here would reach
        # the int cast below - confirm hrmn never holds numeric 0 / -1
        df['hrmn'] = df['hrmn'].replace({0: pd.NA, -1: pd.NA})
        df['hrmn'] = df['hrmn'].apply(convert_to_int)
        convert_float_to_int(df, 'hrmn')
        df['hrmn'] = df['hrmn'].apply(to_hour_slot)

        # lat / long: decimal comma -> point, cast to float, then divide the
        # non-missing values by 100
        for coord_col in ('lat', 'long'):
            df[coord_col] = df[coord_col].str.replace(',', '.', regex=False).astype(float)
        for coord_col in ('lat', 'long'):
            df.loc[df[coord_col].notna(), coord_col] /= 100

        # nbv: 4 or more -> 4, zero or negative -> missing
        df.loc[df['nbv'] >= 4, 'nbv'] = 4
        df.loc[df['nbv'] <= 0, 'nbv'] = pd.NA

        # pr / pr1: remove parentheses, then '-1' -> missing.
        # regex=True is required: pandas >= 2.0 treats the pattern literally by
        # default, which leaves the parentheses in place and keeps values such
        # as '(-1)' from matching the '-1' replacement below.
        for pr_col in ('pr', 'pr1'):
            df[pr_col] = df[pr_col].str.replace(r'\(|\)', '', regex=True)
        for pr_col in ('pr', 'pr1'):
            df[pr_col] = df[pr_col].replace('-1', pd.NA)

        # Four_Digits is no longer needed once copied into an
        df.drop(columns=['Four_Digits'], inplace=True)

In [132]:
# 2020-2021 extract: load the train and test splits, then clean both in place
train_data_2020_21, test_data_2020_21 = (
    pd.read_csv(csv_path, sep=',', low_memory=False)
    for csv_path in (
        'Filtered_Data/TRAIN/DRAFT/train_data_filtered_2020_21.csv',
        'Filtered_Data/TEST/DRAFT/test_data_filtered_2020_21.csv',
    )
)

# The test frame is cleaned first, then the train frame
dfs = [test_data_2020_21, train_data_2020_21]
feature_treatment_2020_to_2021(dfs)

# 2022

In [144]:
def feature_treatment_2022(list_df: list[pd.DataFrame]) -> None:
    """
    Clean each 2022 dataframe of the list in place: drop unused columns,
    normalise vma and larrout, turn the 0 / -1 placeholder codes into missing
    values, bucket hrmn by hour, rescale lat / long, repair and cap nbv and
    strip parentheses from pr / pr1.
    :param list_df: dataframes to clean; every one is modified in place
    """
    # One slot per hundred of the HHMM value: (0, 100), (100, 200), ..., (2300, 2400)
    hour_slots = generate_intervals(0, 2400, 100)
    slot_ids = list(range(len(hour_slots)))

    def to_hour_slot(hhmm: int) -> int:
        """Return the id of the slot containing hhmm, or None when no slot does."""
        for position, (lower, upper) in enumerate(hour_slots):
            if lower <= hhmm < upper:
                return slot_ids[position]
        return None

    for df in list_df:
        # Columns that are not kept
        df.drop(columns=['adr', 'num_veh', 'id_vehicule_x', 'id_vehicule_y', 'locp', 'voie', 'v1'], inplace=True)

        # vma: explicit recoding table, -1 and the listed odd codes -> missing
        df['vma'] = df['vma'].replace({
            -1: pd.NA, 1: 10, 2: 20, 5: 50, 3: 30, 6: 60, 300: 30, 7: 70,
            700: 70, 8: 80, 900: 90, 520: pd.NA, 901: pd.NA, 9: 90,
            501: pd.NA, 770: pd.NA, 12: 120,
        })

        # sexe / agg: 2 -> 0
        for flag_col in ('sexe', 'agg'):
            df.loc[df[flag_col] == 2, flag_col] = 0

        # an is overwritten with the content of Four_Digits
        df['an'] = df['Four_Digits']

        # larrout: decimal comma -> point, cast to float, then values of 10 or
        # more are divided by 10. The comma is a literal, so regex is off.
        df['larrout'] = df['larrout'].str.replace(',', '.', regex=False).astype(float)
        df.loc[df['larrout'] >= 10, 'larrout'] /= 10

        # -1 -> missing
        for code_col in ('place', 'sexe', 'obs', 'obsm', 'choc', 'lum', 'int', 'atm', 'col', 'circ',
                         'vosp', 'prof', 'plan', 'surf', 'infra', 'situ', 'secu1', 'secu2', 'secu3'):
            df[code_col] = df[code_col].replace({-1: pd.NA})

        # 0 and -1 -> missing
        for code_col in ('trajet', 'larrout', 'actp', 'etatp', 'senc', 'catv', 'manv', 'com', 'dep', 'motor'):
            df[code_col] = df[code_col].replace({0: pd.NA, -1: pd.NA})

        # hrmn: 0 / -1 -> missing, parse to an HHMM integer, then replace it by
        # its hour slot id
        # NOTE(review): any value actually replaced by pd.NA here would reach
        # the int cast below - confirm hrmn never holds numeric 0 / -1
        df['hrmn'] = df['hrmn'].replace({0: pd.NA, -1: pd.NA})
        df['hrmn'] = df['hrmn'].apply(convert_to_int)
        convert_float_to_int(df, 'hrmn')
        df['hrmn'] = df['hrmn'].apply(to_hour_slot)

        # lat / long: decimal comma -> point, cast to float, then divide the
        # non-missing values by 100
        for coord_col in ('lat', 'long'):
            df[coord_col] = df[coord_col].str.replace(',', '.', regex=False).astype(float)
        for coord_col in ('lat', 'long'):
            df.loc[df[coord_col].notna(), coord_col] /= 100

        # nbv: the "#ERREUR" marker becomes 0 so the column can be cast to
        # float; then 4 or more -> 4 and zero or negative -> missing
        df['nbv'] = df['nbv'].replace({"#ERREUR": 0})
        df['nbv'] = df['nbv'].astype(float)
        df.loc[df['nbv'] >= 4, 'nbv'] = 4
        df.loc[df['nbv'] <= 0, 'nbv'] = pd.NA

        # pr / pr1: remove parentheses, then '-1' -> missing.
        # regex=True is required: pandas >= 2.0 treats the pattern literally by
        # default, which leaves the parentheses in place and keeps values such
        # as '(-1)' from matching the '-1' replacement below.
        for pr_col in ('pr', 'pr1'):
            df[pr_col] = df[pr_col].str.replace(r'\(|\)', '', regex=True)
        for pr_col in ('pr', 'pr1'):
            df[pr_col] = df[pr_col].replace('-1', pd.NA)

        # Four_Digits is no longer needed once copied into an
        df.drop(columns=['Four_Digits'], inplace=True)

In [145]:
# 2022 extract: load the train and test splits, then clean both in place
train_data_2022, test_data_2022 = (
    pd.read_csv(csv_path, sep=',', low_memory=False)
    for csv_path in (
        'Filtered_Data/TRAIN/DRAFT/train_data_filtered_2022.csv',
        'Filtered_Data/TEST/DRAFT/test_data_filtered_2022.csv',
    )
)

# The test frame is cleaned first, then the train frame
dfs = [test_data_2022, train_data_2022]
feature_treatment_2022(dfs)

In [148]:
# Export every per-year frame to the DRAFT_2 folders.
# Frames are looked up by name (test_data_<year> / train_data_<year>) among the
# module-level variables; a year with no such variable is skipped silently, so
# the combined frames (e.g. 2012_2015, 2020_21) are not written by this loop.
for year in range(2012, 2023):
    test_data_var_name = f"test_data_{year}"
    train_data_var_name = f"train_data_{year}"
    
    # The output file is named after the variable it comes from
    if test_data_var_name in globals():
        test_data = globals()[test_data_var_name]
        test_data.to_csv(f"Filtered_Data/TEST/DRAFT_2/{test_data_var_name}.csv", index=False)
        
    if train_data_var_name in globals():
        train_data = globals()[train_data_var_name]
        train_data.to_csv(f"Filtered_Data/TRAIN/DRAFT_2/{train_data_var_name}.csv", index=False)

In [149]:
# Combined 2012-2015 frames: train, then test (row index not written)
train_data_2012_2015.to_csv(
    path_or_buf='Filtered_Data/TRAIN/DRAFT_2/train_data_2012_to_2015.csv',
    index=False,
)
test_data_2012_2015.to_csv(
    path_or_buf='Filtered_Data/TEST/DRAFT_2/test_data_2012_2015.csv',
    index=False,
)

# Combined 2020-2021 frames: train, then test (row index not written)
train_data_2020_21.to_csv(
    path_or_buf='Filtered_Data/TRAIN/DRAFT_2/train_data_2020_2021.csv',
    index=False,
)
test_data_2020_21.to_csv(
    path_or_buf='Filtered_Data/TEST/DRAFT_2/test_data_2020_2021.csv',
    index=False,
)