# Preprocessing

In [1]:
import time
import pandas as pd
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [2]:
def evaluate_model(predictions, true_labels):
    """Score a set of predictions against the ground-truth labels.

    Parameters
    ----------
    predictions : array-like
        Labels predicted by a model.
    true_labels : array-like
        Reference labels the predictions are compared to.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"confusion_matrix"``. Precision, recall and F1 are computed with
        ``average="weighted"``.
    """
    # Scorers that share the same weighted-average call signature.
    weighted_scorers = {
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }

    report = {"accuracy": accuracy_score(true_labels, predictions)}
    for metric_name, scorer in weighted_scorers.items():
        report[metric_name] = scorer(true_labels, predictions, average="weighted")
    report["confusion_matrix"] = confusion_matrix(true_labels, predictions)

    return report

In [3]:
# Time the CSV read so the cost of loading shows up in the cell output.
start = time.time()
df = pd.read_csv("./data/accidents_engineered_ordinal.csv")
total_time = time.time() - start  # elapsed wall-clock seconds

print(f"Loaded dataset in {round(total_time, 2)}s")

Loaded dataset in 12.07s


### Standardization (StandardScaler())
Candidate columns (the cell below currently standardizes only `nbv` and `age`):
- `nbv`
- `lartpc`
- `larrout`
- `vma`
- `occutc`?
- `age`?

In [4]:
# Numeric columns to rescale to zero mean and unit variance.
columns_to_standardize = ["nbv", "age"]

# NOTE(review): the MinMaxScaler cell that follows rescales these same
# columns. Min-max scaling is unaffected by the shift/scale applied here,
# so this step does not change the final values -- confirm which scaler
# is actually intended for these columns.
scaler = StandardScaler()

# Learn the per-column statistics and overwrite the columns in one step.
df[columns_to_standardize] = scaler.fit_transform(df[columns_to_standardize])

## Scaling (MinMaxScaler())

In [5]:
# Numeric columns to rescale with MinMaxScaler's default feature range.
columns_to_scale = ["nbv", "age"]

# NOTE(review): these are the same columns the StandardScaler cell above
# already transformed; this cell overwrites that result. `scaler` is also
# rebound here, so the fitted StandardScaler is no longer reachable.
scaler = MinMaxScaler()

# Learn the per-column min/max and overwrite the columns in one step.
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

## Drop rows with missing values

In [6]:
# Remove every row that has at least one missing value, then renumber the
# index from 0 so it is contiguous again.
# NOTE(review): the scalers in the earlier cells were fit before this drop,
# so any rows removed here were still part of the data they were fit on.
df = (
    df
    .dropna()
    .reset_index(drop=True)
)

## Encode final variable (`grav`)

In [7]:
# Encoding mode for the `grav` column: "ordinal" or "binary".
mapping = "ordinal"

if mapping == "ordinal":
    # Ordered categorical with the explicit level order 1 < 2 < 3 < 4.
    # NOTE(review): pd.Categorical replaces values that are not listed in
    # `categories` with NaN, and this cell runs after the dropna cell --
    # confirm that `grav` only contains the values 1-4.
    grav_order = [1, 2, 3, 4]
    df["grav"] = pd.Categorical(df["grav"], categories=grav_order, ordered=True)
elif mapping == "binary":
    # Unordered categorical; the categories are inferred from the values
    # already present in the column.
    df["grav"] = pd.Categorical(df["grav"])
else:
    # Fail loudly instead of silently leaving `grav` unencoded when the
    # mode is misspelled or unsupported.
    raise ValueError(
        f'Unknown mapping {mapping!r}; expected "ordinal" or "binary".'
    )

## Convert columns to categorical

In [8]:
# Columns to store with pandas' `category` dtype.
# NOTE: CSV carries no dtype metadata, so the `category` dtype assigned here
# is not preserved by the `to_csv` call in the Save cell; code that reads
# that file back has to re-apply it.
cat_columns = [
    "col", "prof", "plan", "situ", "obs", "choc", "manv",
    "lum", "agg", "catr", "circ", "catv", "catu", "sexe",
]

for column_name in cat_columns:
    df[column_name] = df[column_name].astype("category")

## Save

In [9]:
# Time the CSV write so the cost of saving shows up in the cell output.
start = time.time()
df.to_csv("./data/accidents_preprocessed.csv", index=False)
total_time = time.time() - start  # elapsed wall-clock seconds

# Report the write duration and the size of the frame that was saved.
print(f"Wrote engineered dataset in {round(total_time, 2)}s")
print(f"There are {len(df):,} rows and {len(df.columns)} columns in the final dataset that will be used for modeling.")

Wrote engineered dataset in 105.31s
There are 2,355,812 rows and 23 columns in the final dataset that will be used for modeling.


In [10]:
# Print a summary of the frame: index range, per-column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2355812 entries, 0 to 2355811
Data columns (total 23 columns):
 #   Column         Dtype   
---  ------         -----   
 0   col            category
 1   lum            category
 2   agg            category
 3   catr           category
 4   circ           category
 5   nbv            float64 
 6   prof           category
 7   plan           category
 8   situ           category
 9   catv           category
 10  obs            category
 11  choc           category
 12  manv           category
 13  catu           category
 14  sexe           category
 15  grav           category
 16  age            float64 
 17  month_sin      float64 
 18  month_cos      float64 
 19  DayOfWeek_sin  float64 
 20  DayOfWeek_cos  float64 
 21  TimeOfDay_sin  float64 
 22  TimeOfDay_cos  float64 
dtypes: category(15), float64(8)
memory usage: 177.5 MB


In [11]:
# Stamp the notebook with the local wall-clock time (HH:MM:SS) of this run.
current_time = datetime.now().strftime("%H:%M:%S")
print(f"Last run at {current_time}")

Last run at 18:39:06
