# Importing the Dataset and Copying It to a pandas DataFrame

In [1]:
import pandas as pd
from datasets import load_dataset

# Fetch the US-Accidents dataset from Hugging Face through the `datasets` package
ds = load_dataset("yuvidhepe/us-accidents-updated")

# Materialise the 'train' split as a pandas DataFrame; every later cell works on this frame
train_split = ds['train']
Traffic_Accidents = train_split.to_pandas()

# Calculate the impact on traffic in seconds

In [2]:
# Convert 'Start_Time' and 'End_Time' to datetime format and derive the
# accident duration in seconds.

# Some timestamps carry an all-zero fractional-seconds suffix. The suffixes are
# stripped in this order on purpose: the longer one must go first, otherwise
# removing ".000000" from ".000000000" would leave a stray "000" behind.
to_remove: list = [".000000000", ".000000"]

for column in ("Start_Time", "End_Time"):
    for elem in to_remove:
        # regex=False: the leading '.' must match a literal dot, not "any
        # character", regardless of the pandas version's default for `regex`.
        Traffic_Accidents[column] = Traffic_Accidents[column].str.replace(elem, "", regex=False)

    # format='mixed' lets pandas infer the layout of each value individually
    Traffic_Accidents[column] = pd.to_datetime(Traffic_Accidents[column], format='mixed')

# Calculate the difference in seconds and add it as a new column
Traffic_Accidents['Duration_Seconds'] = (
    Traffic_Accidents['End_Time'] - Traffic_Accidents['Start_Time']
).dt.total_seconds()

# Converting Latitude and Longitude to H3 Index

In [3]:
import h3

# H3 resolution used for every cell id computed below
resolution = 7

# Map each accident's start coordinate to its H3 cell id.
# The two coordinate columns are walked in lockstep instead of using
# DataFrame.apply(axis=1): apply builds a full row Series for every record,
# which is needlessly slow on a large frame, and it returns an empty DataFrame
# (not a column) when the frame has no rows. The resulting values are the same.
Traffic_Accidents['lat_lng'] = [
    h3.latlng_to_cell(lat, lng, resolution)
    for lat, lng in zip(Traffic_Accidents['Start_Lat'], Traffic_Accidents['Start_Lng'])
]

# Clustering H3 Cells by Accident Count and Average Severity

In [4]:
from sklearn.cluster import KMeans

# Aggregate one row per H3 cell: how many accidents it contains and their mean severity
h3_features = Traffic_Accidents.groupby('lat_lng').agg(
    accident_count=('Severity', 'size'),
    avg_severity=('Severity', 'mean')
).reset_index()

# Apply KMeans clustering on the aggregated features.
# KMeans raises ValueError when n_clusters exceeds the number of samples, so the
# requested 10,000 clusters is capped at the number of distinct H3 cells
# (relevant when the notebook is run on a subset of the data).
# NOTE(review): the two features are clustered unscaled, so accident_count
# presumably dominates the distance metric -- confirm this is intended.
num_clusters = min(10000, len(h3_features))
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
h3_features['cluster'] = kmeans.fit_predict(h3_features[['accident_count', 'avg_severity']])

# Merge cluster labels back to the main DataFrame.
# validate='many_to_one' asserts that each H3 cell appears once in h3_features,
# so the left join can never duplicate accident rows.
Traffic_Accidents = Traffic_Accidents.merge(
    h3_features[['lat_lng', 'cluster']],
    on='lat_lng',
    how='left',
    validate='many_to_one',
)


# Delete all discussed columns from the data set according to the report

In [5]:
# Columns excluded from modelling, grouped by theme for readability
columns_to_drop = [
    # record identifiers and free text
    'ID', 'Source', 'Description',
    # coordinates and the H3 cell id
    'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'lat_lng',
    # address / region fields
    'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone', 'Airport_Code',
    # timestamps
    'Start_Time', 'End_Time', 'Weather_Timestamp',
    # weather measurements
    'Wind_Chill(F)', 'Precipitation(in)',
    # road-feature flags
    'Bump', 'Roundabout', 'Station', 'Turning_Loop',
    # day/night indicators other than Civil_Twilight
    'Sunrise_Sunset', 'Nautical_Twilight', 'Astronomical_Twilight',
]

# Remove them in one call; raises KeyError if any listed column is absent
Traffic_Accidents = Traffic_Accidents.drop(columns_to_drop, axis=1)


# Drop Rows with Missing Values and Remove Duplicate Rows

In [6]:
# Remove every row containing a missing value, then collapse exact duplicate rows
Traffic_Accidents = (
    Traffic_Accidents
    .dropna()
    .drop_duplicates()
)

# Encoding

In [7]:
# Flag columns: cast each one to integer
to_bool_encode = ['Amenity', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Stop', 'Traffic_Calming', 'Traffic_Signal']

for flag_column in to_bool_encode:
    Traffic_Accidents[flag_column] = Traffic_Accidents[flag_column].astype(int)

# Two-valued column: Day -> 1, Night -> 0
twilight_codes = {'Day': 1, 'Night': 0}
Traffic_Accidents['Civil_Twilight'] = Traffic_Accidents['Civil_Twilight'].map(twilight_codes)

# Remaining text columns: replace each value with its integer category code
to_encode: list = ["Wind_Direction", "Weather_Condition"]

for text_column in to_encode:
    Traffic_Accidents[text_column] = pd.Categorical(Traffic_Accidents[text_column]).codes

# Train-Validation-Test Split

In [8]:
from sklearn.model_selection import train_test_split 

# Target variable
y = Traffic_Accidents['Severity']

# Feature matrix: everything except the target
X = Traffic_Accidents.drop(columns='Severity')

# First cut: 70% train, 30% held out, keeping the Severity proportions
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Second cut: halve the held-out part into validation and test, again stratified
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Resampling

In [9]:
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

# Rebalance the training set only: first shrink the majority class, then
# oversample the remaining classes with SMOTE. Validation and test data are
# not touched here.
df_train = pd.concat([X_train, y_train], axis=1)

# Separate the classes in the training set
# NOTE(review): Severity == 2 is hard-coded as the majority class -- confirm
# against the actual class counts of the training split.
df_majority = df_train[df_train['Severity'] == 2]
df_minority_1 = df_train[df_train['Severity'] == 1]
df_minority_3 = df_train[df_train['Severity'] == 3]
df_minority_4 = df_train[df_train['Severity'] == 4]

# Step 1: Downsample the majority class to at most 500,000 rows.
# resample(replace=False) raises ValueError when n_samples exceeds the number
# of available rows, so the target is capped at the size of the majority class.
df_majority_downsampled = resample(df_majority,
                                    replace=False,
                                    n_samples=min(500000, len(df_majority)),
                                    random_state=42)

# Combine the downsampled majority class with the original minority classes
df_combined = pd.concat([df_majority_downsampled, df_minority_1, df_minority_3, df_minority_4])

# Step 2: Upsample minority classes using SMOTE
X_combined = df_combined.drop('Severity', axis=1)
y_combined = df_combined['Severity']

# NOTE(review): SMOTE synthesises rows by interpolating between neighbours, so
# the integer-coded and 0/1 columns can presumably receive fractional values;
# consider whether a categorical-aware variant is needed -- confirm.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_combined, y_combined)

# Normalization

In [10]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the resampled training features only
scaler = StandardScaler().fit(X_resampled)

# Apply that same fitted scaler to all three splits
X_normal = scaler.transform(X_resampled)
X_val_normal = scaler.transform(X_val)
X_test_normal = scaler.transform(X_test)

# Anomaly Detection

In [11]:
from sklearn.ensemble import IsolationForest

# Train an Isolation Forest on the normalized, resampled training matrix
model = IsolationForest(contamination='auto', random_state=42)
model.fit(X_normal)

# Score the same training rows; rows labelled -1 are the ones treated as anomalies below
anomalies_predictions = model.predict(X_normal)

# Rebuild a labelled DataFrame from the scaled array and attach the predictions
X_normal_df = pd.DataFrame(X_normal, columns=X_combined.columns).assign(
    anomaly=anomalies_predictions
)

# Keep only the rows flagged as anomalous
is_anomaly = X_normal_df['anomaly'] == -1
anomalies = X_normal_df.loc[is_anomaly]

# Display the detected anomalies
print("Detected anomalies:")
print(anomalies)

Detected anomalies:
         Distance(mi)  Temperature(F)  Humidity(%)  Pressure(in)  \
17          -0.320130       -0.673885     1.075413     -0.089652   
19           0.805101       -1.066063    -0.694228     -0.834971   
26          -0.320130        0.110470     1.119654      0.655667   
35           1.122937       -2.186570     1.252377     -4.472129   
40           0.049926       -0.673885     1.252377      0.417165   
...               ...             ...          ...           ...   
3158535      0.929846       -2.658199     1.095526     -7.540080   
3158571      3.989056       -1.784952     1.184281     -2.208368   
3158600      0.855143        0.291408     1.173784      0.389741   
3158611      4.406141        0.321023     1.292506      0.615917   
3158616      7.629597       -2.150002    -0.487849     -2.339243   

         Visibility(mi)  Wind_Direction  Wind_Speed(mph)  Weather_Condition  \
17             0.330375       -1.576502        -1.569330           1.331650   
19   

# Converting the Train-Validation-Test Splits to DataFrames and Exporting Them as CSV

In [12]:
# Scaled training features with the resampled target appended as the last column
train_set_df = pd.DataFrame(X_normal, columns=X_combined.columns).assign(
    Severity=y_resampled.values
)

In [13]:
# Scaled validation features with their target appended as the last column
val_set_df = pd.DataFrame(X_val_normal, columns=X_val.columns).assign(
    Severity=y_val.values
)

In [14]:
# Scaled test features with their target appended as the last column
test_set_df = pd.DataFrame(X_test_normal, columns=X_test.columns).assign(
    Severity=y_test.values
)

In [15]:
# Write each split to its own CSV file, leaving out the DataFrame index
for csv_name, split_df in (
    ('Train_Set.csv', train_set_df),
    ('Validation_Set.csv', val_set_df),
    ('Test_Set.csv', test_set_df),
):
    split_df.to_csv(csv_name, index=False)