In [1]:
import pandas as pd
import numpy as np
import os
from utils import mapping
from sklearn.model_selection import train_test_split

In [2]:
# Columns retained from the raw accident CSVs; everything else is discarded.
# The concatenation order below is the column order of the filtered frame.

# Fields that describe the accident as a whole.
_accident_fields = [
    "發生日期",                # date
    "發生時間",                # time
    "事故類別名稱",            # category
    "天候名稱",                # weather
    "光線名稱",                # lighting
    "速限-第1當事者",          # speed limit
    "路面狀況-路面狀態名稱",   # road condition
    "號誌-號誌種類名稱",       # traffic signal
]

# Fields that describe one involved party.
_party_fields = [
    "當事者區分-類別-大類別名稱-車種",  # vehicle type
    "當事者屬-性-別名稱",               # gender
    "當事者事故發生時年齡",             # age
]

cols_to_keep = _accident_fields + _party_fields

In [None]:

# Build one row per accident from the per-party records in A1 and A2.
#
# For each raw file: keep the columns of interest, drop incomplete rows, apply
# the project `mapping` helper, turn each party's vehicle type / gender / age
# decade into indicator columns, sum those indicators per accident, and write
# the result to raw_data/A{i}.csv with English column names.

vehicle_types = ["Scooter", "Bus", "Pedestrian", "Truck", "Car", "Slow", "Small_Truck", "Full_trailer", "Tractor", "Semi_trailer", "Other", "Special", "Military"]
vehicle_columns = [f"veh_{vt}" for vt in vehicle_types]
genders = ["男", "女"]

# Age decades are the same for every file, so compute them once.
# Intervals are right-closed: 1~10, 11~20, ..., 111~120. Because of
# include_lowest=True an age of exactly 0 also lands in "1~10".
# NOTE(review): ages outside 0..120 get no bin, so such a party is counted in
# the gender/vehicle totals but in none of the age_* columns.
bins = range(0, 121, 10)
labels = [f"{lower+1}~{lower+10}" for lower in bins[:-1]]

# Columns taken from the first record of each accident when grouping.
accident_level_cols = [
    "天候名稱",
    "光線名稱",
    "速限-第1當事者",
    "路面狀況-路面狀態名稱",
    "號誌-號誌種類名稱",
]

# Chinese -> English column names. `rename` ignores keys that are absent, so
# entries for columns dropped by the aggregation below are harmless.
column_names_en = {
    "發生日期": "date",
    "發生時間": "time",
    "事故類別名稱": "category",
    "天候名稱": "weather",
    "光線名稱": "lighting",
    "速限-第1當事者": "speed_limit",
    "路面狀況-路面狀態名稱": "road_condition",
    "號誌-號誌種類名稱": "traffic_signal",
    "當事者區分-類別-大類別名稱-車種": "vehicle_type",
    "當事者屬-性-別名稱": "gender",
    "當事者事故發生時年齡": "age",
}

for i in [1, 2]:

    df = pd.read_csv(
        f"raw_data/A{i}_raw.csv",
        encoding="utf-8",
        low_memory=False)

    # Keep only complete records of the selected columns.
    df = df[cols_to_keep].dropna(how="any")

    # Project helper from utils. Presumably it recodes the categorical columns
    # (e.g. vehicle type -> the English labels in `vehicle_types`); the filter
    # below relies on that -- verify against utils.mapping.
    df = mapping(df)

    # Drop parties whose vehicle type or gender is not one of the known values.
    df = df[
        df["當事者區分-類別-大類別名稱-車種"].isin(vehicle_types)
        & df["當事者屬-性-別名稱"].isin(genders)
    ]

    # One-hot encode vehicle type. dtype=int keeps the indicators numeric on
    # every pandas version; reindex guarantees one column per known type even
    # when a type does not occur in this file.
    vehicle_dummies = pd.get_dummies(
        df["當事者區分-類別-大類別名稱-車種"], prefix="veh", dtype=int
    ).reindex(columns=vehicle_columns, fill_value=0)

    # Gender indicators and age decade, added via `assign` (returns a new
    # frame, so nothing is written into a filtered slice) and computed with
    # vectorized comparisons rather than a per-row lambda.
    gender = df["當事者屬-性-別名稱"]
    df = df.assign(
        male=gender.eq("男").astype(int),
        female=gender.eq("女").astype(int),
        age_bin=pd.cut(
            df["當事者事故發生時年齡"],
            bins=bins,
            labels=labels,
            right=True,
            include_lowest=True,
        ),
    )

    # `age_bin` is categorical, so every decade gets a column even if empty.
    age_dummies = pd.get_dummies(df["age_bin"], prefix="age", dtype=int)

    df = pd.concat([df, vehicle_dummies, age_dummies], axis=1)

    # Group the per-party rows of the same accident: accident-level fields are
    # taken from the first row, party indicators are summed into counts.
    # NOTE(review): "事故類別名稱" is in cols_to_keep but is not aggregated
    # here, so it does not appear in the output -- confirm this is intended.
    agg_dict = {col: "first" for col in accident_level_cols}
    agg_dict.update(
        {
            col: "sum"
            for col in ["male", "female", *vehicle_dummies.columns, *age_dummies.columns]
        }
    )

    # NOTE(review): accidents are identified by (date, time) only; two distinct
    # accidents with the same timestamp would be merged into one row.
    df_grouped = (
        df.groupby(["發生日期", "發生時間"], as_index=False)
        .agg(agg_dict)
        .rename(columns=column_names_en)
    )

    df_grouped.to_csv(f"raw_data/A{i}.csv", index=False, encoding="utf-8-sig")


safdsaffdsaf
         發生日期      發生時間  天候名稱  光線名稱  速限-第1當事者  路面狀況-路面狀態名稱  號誌-號誌種類名稱  male  \
0  20190101.0   12800.0     0     4      60.0            0          0     1   
1  20190101.0   15400.0     2     4      40.0            1          0     1   
2  20190101.0   93800.0     1     3      60.0            0          0     1   
3  20190101.0  161254.0     2     3      50.0            1          0     1   
4  20190101.0  221600.0     0     4      50.0            1          0     1   

   female  veh_Scooter  ...  age_21~30  age_31~40  age_41~50  age_51~60  \
0       1            1  ...          0          0          0          2   
1       0            1  ...          1          0          0          0   
2       0            1  ...          0          0          1          0   
3       0            0  ...          1          0          0          0   
4       0            1  ...          0          1          0          0   

   age_61~70  age_71~80  age_81~90  age_91~100  age_101~110  

In [None]:
# Split each processed accident table into train (70%) and test (30%) CSVs.
for split_dir in ("dataset/train", "dataset/test"):
    os.makedirs(split_dir, exist_ok=True)

for i in [1, 2]:
    df = pd.read_csv(f"raw_data/A{i}.csv")
    print(df.shape)

    # A fixed random_state makes the split reproducible across runs.
    train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

    train_path = f"dataset/train/A{i}.csv"
    test_path = f"dataset/test/A{i}.csv"
    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)

    print(f"Dataset split into train and test sets and saved to '{train_path}' and '{test_path}'.")


(10399, 34)
Dataset split into train and test sets and saved to 'dataset/train/A1.csv' and 'dataset/test/A1.csv'.
(1583818, 34)
Dataset split into train and test sets and saved to 'dataset/train/A2.csv' and 'dataset/test/A2.csv'.
