# Importing the dataset and copying it to a pandas DataFrame

In [1]:
import pandas as pd
from datasets import load_dataset

# Fetch the dataset from Hugging Face and keep a handle on every split in `ds`.
ds = load_dataset("yuvidhepe/us-accidents-updated")

# Only the 'train' split is used; materialise it as a pandas DataFrame.
train_split = ds['train']
Traffic_Accidents = train_split.to_pandas()

# Calculate the impact on traffic in seconds

In [2]:
# All-zero fractional-second suffixes to strip from the raw timestamp strings.
# The longer suffix must come first: stripping ".000000" from a value ending in
# ".000000000" would leave a dangling "000" behind.
to_remove: list = [".000000000", ".000000"]

for column in ("Start_Time", "End_Time"):
    cleaned = Traffic_Accidents[column]
    for elem in to_remove:
        # regex=False matches the suffix literally. Interpreted as a regular
        # expression, the leading "." would match any character, and the
        # default for `regex` differs between pandas versions.
        cleaned = cleaned.str.replace(elem, "", regex=False)
    # Convert 'Start_Time' / 'End_Time' to datetime; format='mixed' lets the
    # format be inferred for each element individually.
    Traffic_Accidents[column] = pd.to_datetime(cleaned, format='mixed')

# Calculate the difference in seconds and add it as a new column
Traffic_Accidents['Duration_Seconds'] = (
    Traffic_Accidents['End_Time'] - Traffic_Accidents['Start_Time']
).dt.total_seconds()

# Cleaning 'Wind_Direction'

In [3]:
# Spelled-out wind directions and the abbreviated code each one is rewritten to.
wind_direction_aliases = {
    'Calm': 'CALM',
    'Variable': 'VAR',
    'North': 'N',
    'East': 'E',
    'South': 'S',
    'West': 'W',
}

# Exact-match replacement; every other value is left untouched.
Traffic_Accidents['Wind_Direction'] = Traffic_Accidents['Wind_Direction'].replace(wind_direction_aliases)

# Converting Latitude and Longitude to H3 Index

In [4]:
import h3

# H3 resolution passed to latlng_to_cell for every accident.
resolution = 7

# Map each accident's start coordinate to its H3 cell id. Iterating the two
# coordinate columns in lockstep gives the same values as a row-wise
# DataFrame.apply(axis=1) but avoids building a Series object for every row,
# which dominates the runtime on a large frame.
Traffic_Accidents['lat_lng'] = [
    h3.latlng_to_cell(lat, lng, resolution)
    for lat, lng in zip(Traffic_Accidents['Start_Lat'], Traffic_Accidents['Start_Lng'])
]

# Clustering H3 Index 

In [5]:
from sklearn.cluster import KMeans

# Aggregate per H3 cell: number of accidents and mean severity.
# NOTE(review): avg_severity is derived from the target column 'Severity' over
# the full dataset, before the train/validation/test split further down, so the
# resulting 'Cluster' feature carries target information into the held-out
# sets. Confirm whether this is intended.
h3_features = Traffic_Accidents.groupby('lat_lng').agg(
    accident_count=('Severity', 'size'),
    avg_severity=('Severity', 'mean')
).reset_index()

# Apply KMeans clustering on the aggregated features
num_clusters = 10000
# KMeans raises ValueError when n_clusters exceeds the number of samples, so
# cap the requested count at the number of distinct H3 cells.
effective_clusters = min(num_clusters, len(h3_features))
kmeans = KMeans(n_clusters=effective_clusters, random_state=42)
h3_features['Cluster'] = kmeans.fit_predict(h3_features[['accident_count', 'avg_severity']])

# Merge cluster labels back to the main DataFrame (one label per H3 cell).
Traffic_Accidents = Traffic_Accidents.merge(h3_features[['lat_lng', 'Cluster']], on='lat_lng', how='left')


# Delete all discussed columns from the data set according to the report

In [6]:
# Columns excluded from the modelling table.
columns_to_drop = [
    'ID',
    'Start_Lat',
    'Street',
    'Zipcode',
    'End_Lng',
    'Description',
    'City',
    'County',
    'State',
    'Country',
    'Timezone',
    'Airport_Code',
    'Weather_Timestamp',
    'Wind_Chill(F)',
    'Precipitation(in)',
    'Bump',
    'Roundabout',
    'Station',
    'Turning_Loop',
    'Sunrise_Sunset',
    'Nautical_Twilight',
    'Astronomical_Twilight',
    'Source',
    'Start_Time',
    'End_Time',
    'lat_lng',
    'Start_Lng',
    'End_Lat',
]

# Remove them in one call; a missing column name raises KeyError.
Traffic_Accidents = Traffic_Accidents.drop(columns_to_drop, axis='columns')

# Fill missing values of numerical data with their Mean

In [7]:
# Numerical weather features whose gaps are imputed.
features_to_fill = ['Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)']

# Per-column mean over the rows currently in the frame (NaN values are skipped).
feature_means = Traffic_Accidents[features_to_fill].mean()

# Replace each missing value with the mean of its own column.
Traffic_Accidents[features_to_fill] = Traffic_Accidents[features_to_fill].fillna(feature_means)

# Drop rows with missing values and remove duplicate rows

In [8]:
# First discard every row that still has a missing value, then discard rows
# that are exact duplicates of an earlier row.
Traffic_Accidents = (
    Traffic_Accidents
    .dropna()
    .drop_duplicates()
)

# Encoding

In [9]:
# Boolean flag columns: cast to integers.
to_bool_encode = ['Amenity', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Stop', 'Traffic_Calming', 'Traffic_Signal']

for flag_column in to_bool_encode:
    Traffic_Accidents[flag_column] = Traffic_Accidents[flag_column].astype(int)

# Two-valued column: explicit Day/Night -> 1/0 mapping.
twilight_codes = {'Day': 1, 'Night': 0}
Traffic_Accidents['Civil_Twilight'] = Traffic_Accidents['Civil_Twilight'].map(twilight_codes)

# Remaining string columns: replace each value with its integer category code.
to_encode: list = ["Wind_Direction", "Weather_Condition"]

for category_column in to_encode:
    Traffic_Accidents[category_column] = pd.Categorical(Traffic_Accidents[category_column]).codes

# Train-Validation-Test Split

In [10]:
from sklearn.model_selection import train_test_split 

# Target variable and the remaining columns as predictors.
y = Traffic_Accidents['Severity']
X = Traffic_Accidents.drop(columns='Severity')

# Stage 1: keep 70% for training and hold out 30%, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Stage 2: split the held-out 30% evenly into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Resampling

In [11]:
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

# Rebuild a single training frame so rows can be selected by class.
df_train = pd.concat([X_train, y_train], axis=1)

# Separate the classes in the training set (Severity 2 is treated as the majority).
df_majority = df_train[df_train['Severity'] == 2]
df_minority_1 = df_train[df_train['Severity'] == 1]
df_minority_3 = df_train[df_train['Severity'] == 3]
df_minority_4 = df_train[df_train['Severity'] == 4]

# Downsample the majority class to at most 500000 rows. Sampling without
# replacement raises ValueError when n_samples exceeds the rows available, so
# cap the target at the actual class size.
majority_target = min(500000, len(df_majority))
df_majority_downsampled = resample(df_majority,
                                    replace=False,
                                    n_samples=majority_target,
                                    random_state=42)

# Combine the downsampled majority class with the original minority classes
df_combined = pd.concat([df_majority_downsampled, df_minority_1, df_minority_3, df_minority_4])

# Upsample minority classes using SMOTE
X_combined = df_combined.drop('Severity', axis=1)
y_combined = df_combined['Severity']

# NOTE(review): SMOTE synthesises samples by interpolating between neighbours,
# which also interpolates the integer-coded categorical and 0/1 flag columns —
# confirm this is acceptable for the downstream model.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_combined, y_combined)

# Normalization

In [12]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the resampled training data only.
scaler = StandardScaler()
scaler.fit(X_resampled)

# Apply the same fitted transform to all three splits.
X_normal = scaler.transform(X_resampled)
X_val_normal = scaler.transform(X_val)
X_test_normal = scaler.transform(X_test)

# Anomaly Detection

In [13]:
from sklearn.ensemble import IsolationForest

# Isolation Forest with the library's automatic contamination threshold.
model = IsolationForest(contamination='auto', random_state=42)

# Fit on the normalized training data and label those same rows
# (predict returns -1 for anomalies, 1 for inliers).
anomalies_predictions = model.fit_predict(X_normal)

# Wrap the scaled matrix in a DataFrame and attach the labels as a column.
X_normal_df = pd.DataFrame(X_normal, columns=X_combined.columns).assign(
    anomaly=anomalies_predictions
)

# Keep only the rows flagged as anomalous.
anomalies = X_normal_df.loc[X_normal_df['anomaly'].eq(-1)]

# Display the detected anomalies
print("Detected anomalies:")
print(anomalies)

Detected anomalies:
         Distance(mi)  Temperature(F)  Humidity(%)  Pressure(in)  \
9            0.592043       -1.906197     1.188614      0.415315   
10           0.245288        1.138515    -0.449308      0.577926   
13          -0.306301        1.927885    -2.485643     -0.824590   
21          -0.310203       -1.624279     1.188614     -0.580674   
22          -0.294109       -0.158307    -2.042961     -0.824590   
...               ...             ...          ...           ...   
3512315      3.269981       -0.230232     1.514524     -1.047528   
3512324      0.005548        1.763443    -2.194083      0.264566   
3512337      5.534320       -1.852588     1.008666     -0.343200   
3512349      0.012144       -2.007389     1.029717     -3.105357   
3512387      8.754662       -1.092606     0.545388     -0.448419   

         Visibility(mi)  Wind_Direction  Wind_Speed(mph)  Weather_Condition  \
9             -1.778025       -1.464518        -1.621729           1.325493   
10   

# Converting Train-Val-Test Split to Dataframe

In [14]:
# Re-attach column names to each scaled feature matrix and append the matching
# target values as a 'Severity' column.
train_set_df = pd.DataFrame(X_normal, columns=X_combined.columns).assign(
    Severity=y_resampled.values
)

val_set_df = pd.DataFrame(X_val_normal, columns=X_val.columns).assign(
    Severity=y_val.values
)

test_set_df = pd.DataFrame(X_test_normal, columns=X_test.columns).assign(
    Severity=y_test.values
)

# Exporting these as csv

In [None]:
# Write each split to its own CSV file in the working directory, without the
# DataFrame index.
csv_exports = {
    'Train_Set.csv': train_set_df,
    'Validation_Set.csv': val_set_df,
    'Test_Set.csv': test_set_df,
}

for csv_name, split_frame in csv_exports.items():
    split_frame.to_csv(csv_name, index=False)