# Importing the dataset and copying it to a pandas DataFrame

In [None]:
import pandas as pd
from datasets import load_dataset

# Fetch the accidents dataset from Hugging Face through the `datasets` package
ds = load_dataset("yuvidhepe/us-accidents-updated")

# Convert the 'train' split into a pandas DataFrame for the rest of the notebook
train_split = ds['train']
Traffic_Accidents = train_split.to_pandas()

# Calculate the impact on traffic in seconds

In [None]:
# Parse 'Start_Time' and 'End_Time' into datetimes; format='mixed' lets
# pandas infer the format of each value individually.
for time_col in ('Start_Time', 'End_Time'):
    Traffic_Accidents[time_col] = pd.to_datetime(Traffic_Accidents[time_col], format='mixed')

# Elapsed time between start and end, stored in seconds as a new column
elapsed = Traffic_Accidents['End_Time'] - Traffic_Accidents['Start_Time']
Traffic_Accidents['Duration_Seconds'] = elapsed.dt.total_seconds()

# Delete all discussed columns from the data set according to the report

In [None]:
# Columns excluded from the feature set (see the report for the rationale).
# 'Start_Time' and 'End_Time' are dropped as well; their difference is already
# captured in 'Duration_Seconds'.
columns_to_drop = [
    'ID',
    'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng',
    'Description',
    'City', 'County', 'State', 'Country', 'Timezone',
    'Airport_Code', 'Weather_Timestamp',
    'Wind_Chill(F)', 'Precipitation(in)',
    'Bump', 'Roundabout', 'Station', 'Turning_Loop',
    'Sunrise_Sunset', 'Nautical_Twilight', 'Astronomical_Twilight',
    'Source',
    'Start_Time', 'End_Time',
]

# Remove them from the DataFrame in one call
Traffic_Accidents = Traffic_Accidents.drop(columns_to_drop, axis=1)


# Drop the rows with empty cells and remove all duplicate rows

In [None]:
# Discard every row that has at least one missing value, then discard rows
# that are exact duplicates of an earlier row.
Traffic_Accidents = Traffic_Accidents.dropna().drop_duplicates()

# Encoding

In [None]:
# Encode the boolean columns as 0/1 integers
to_bool_encode = ['Amenity', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Stop', 'Traffic_Calming', 'Traffic_Signal']

Traffic_Accidents[to_bool_encode] = Traffic_Accidents[to_bool_encode].astype(int)

# Encode the column with 2 unique values (Day -> 1, Night -> 0)
Traffic_Accidents['Civil_Twilight'] = Traffic_Accidents['Civil_Twilight'].map({'Day': 1, 'Night': 0})

# Encode all remaining categorical columns as integer category codes
to_encode: list = ["Street", "Wind_Direction", "Weather_Condition", "Zipcode"]

# Assign the code arrays column by column. A NumPy array is written
# positionally, whereas the DataFrame that apply() builds from array results
# carries a fresh RangeIndex and is re-aligned on index labels when assigned
# back. Once rows have been dropped (the index then has gaps), that alignment
# puts codes on the wrong rows and leaves NaN where no label matches.
for column_name in to_encode:
    Traffic_Accidents[column_name] = pd.Categorical(Traffic_Accidents[column_name]).codes

# Train-Validation-Test Split

In [None]:
from sklearn.model_selection import train_test_split 

# Features: every column except the target
X = Traffic_Accidents.drop(columns='Severity')

# Target: accident severity
y = Traffic_Accidents['Severity']

# First cut: 70% training, 30% held out, stratified on the target
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Second cut: split the held-out part evenly into validation and test,
# again stratified on the target
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Resampling

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

# Rebuild a single training frame so rows can be selected by class label
df_train = pd.concat([X_train, y_train], axis=1)

# Separate the classes in the training set (Severity 2 is treated as the majority class)
df_majority = df_train[df_train['Severity'] == 2]
df_minority_1 = df_train[df_train['Severity'] == 1]
df_minority_3 = df_train[df_train['Severity'] == 3]
df_minority_4 = df_train[df_train['Severity'] == 4]

# Step 1: Downsample the majority class to at most 2,000,000 rows.
# resample(replace=False) raises ValueError when asked for more rows than
# exist, so the target is capped at the number of available majority rows.
majority_target = min(2_000_000, len(df_majority))
df_majority_downsampled = resample(df_majority,
                                    replace=False,
                                    n_samples=majority_target,
                                    random_state=42)

# Combine the downsampled majority class with the original minority classes
df_combined = pd.concat([df_majority_downsampled, df_minority_1, df_minority_3, df_minority_4])

# Step 2: Upsample minority classes using SMOTE
X_combined = df_combined.drop('Severity', axis=1)
y_combined = df_combined['Severity']

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_combined, y_combined)

# Normalization

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the resampled training features only, then apply the
# same fitted transformation to the training, validation and test features.
scaler = StandardScaler().fit(X_resampled)

X_normal = scaler.transform(X_resampled)
X_val_normal = scaler.transform(X_val)
X_test_normal = scaler.transform(X_test)

# Anomaly Detection

In [None]:
from sklearn.ensemble import IsolationForest

# Build the Isolation Forest and fit it on the normalized, resampled training data
model = IsolationForest(contamination='auto', random_state=42)
model.fit(X_normal)

# Label each training row; rows labelled -1 are the ones selected as anomalies below
anomalies_predictions = model.predict(X_normal)

# Wrap the normalized array in a DataFrame with the feature names and
# attach the prediction as an extra 'anomaly' column
X_normal_df = pd.DataFrame(X_normal, columns=X_combined.columns)
X_normal_df['anomaly'] = anomalies_predictions

# Keep only the rows flagged as anomalies
is_anomaly = X_normal_df['anomaly'].eq(-1)
anomalies = X_normal_df.loc[is_anomaly]

# Show the flagged rows
print("Detected anomalies:", anomalies, sep="\n")