# Import the dataset via the "datasets" library

In [27]:
from datasets import load_dataset
import pandas as pd

# Fetch the US accidents dataset from Hugging Face with the `datasets` package
ds = load_dataset(path="yuvidhepe/us-accidents-updated")

# Cut the train split down to its first 100 rows before any further processing
small_ds = ds["train"].select(indices=range(100))

# Convert the reduced dataset into a pandas DataFrame
TrafficAccidents_preprocessed = small_ds.to_pandas()

# Calculate the duration of each accident's impact on traffic in seconds

In [28]:
# Parse the 'Start_Time' and 'End_Time' columns into pandas datetimes
for time_col in ('Start_Time', 'End_Time'):
    TrafficAccidents_preprocessed[time_col] = pd.to_datetime(TrafficAccidents_preprocessed[time_col])

# Duration = End_Time - Start_Time, stored in seconds as a new column
TrafficAccidents_preprocessed['Duration_Seconds'] = (
    TrafficAccidents_preprocessed['End_Time']
    .sub(TrafficAccidents_preprocessed['Start_Time'])
    .dt.total_seconds()
)

# Drop the columns that the report identifies for removal

In [29]:
# List of columns to be removed
columns_to_drop = [
    'ID', 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Description',
    'City', 'County', 'State', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp',
    'Wind_Chill(F)', 'Precipitation(in)', 'Bump', 'Roundabout', 'Station', 'Turning_Loop',
    'Sunrise_Sunset', 'Nautical_Twilight', 'Astronomical_Twilight', 'Source'
]

# Drop the specified columns.
# This cell rebinds the frame to its reduced version, so running it a second
# time would look for columns that are already gone and raise a KeyError.
# errors='ignore' skips names that are absent and keeps the cell re-runnable.
# NOTE: a misspelled column name in the list is therefore skipped silently.
TrafficAccidents_preprocessed = TrafficAccidents_preprocessed.drop(
    columns=columns_to_drop, errors='ignore'
)


# Standardisation (z-score scaling) of the columns with numerical data

In [30]:
from sklearn.preprocessing import StandardScaler

# Numerical columns that are passed through the scaler
columns_to_scale = [
    'Distance(mi)',
    'Temperature(F)',
    'Humidity(%)',
    'Pressure(in)',
    'Visibility(mi)',
    'Wind_Speed(mph)',
    'Duration_Seconds',
]

# Create the StandardScaler and fit it on the selected columns
scaler = StandardScaler()
scaler.fit(TrafficAccidents_preprocessed[columns_to_scale])

# Overwrite the selected columns with their scaled values
TrafficAccidents_preprocessed[columns_to_scale] = scaler.transform(
    TrafficAccidents_preprocessed[columns_to_scale]
)

# Encoding...