In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier

# Load the dataset# Load the dataset
file_path = "../dags/data/RTA_Dataset.csv"
df = pd.read_csv(file_path) 

In [8]:
# Drop all rows with missing values
df = df.dropna()

In [9]:
# Select relevant columns
columns = [
    'day_of_week', 'age_band_of_driver', 'type_of_vehicle',
    'area_accident_occured', 'lanes_or_medians',
    'types_of_junction', 'weather_conditions', 'accident_severity'
]
# Convert all column names to lowercase
df.columns = df.columns.str.lower()
df = df[columns].copy()

In [10]:
# 1. Clean 'day_of_week'
valid_days = ['monday', 'sunday', 'friday', 'wednesday', 'saturday', 'thursday', 'tuesday']
df['day_of_week'] = df['day_of_week'].str.lower()
df = df[df['day_of_week'].isin(valid_days)]

In [11]:
# 2. Clean 'age_band_of_driver'
df = df[df['age_band_of_driver'] != 'Under 18']
df['age_band_of_driver'] = df['age_band_of_driver'].replace({
    'Over 51': '>51',
    'Unknown': 'unknown'
})

In [12]:
# 3. Simplify 'type_of_vehicle'
car_types = ['Automobile', 'Taxi', 'Stationwagen']
lorry_types = ['Lorry (41?100Q)', 'Lorry (11?40Q)', 'Long lorry', 'Pick up upto 10Q']
bus_types = ['Public (> 45 seats)', 'Public (12 seats)', 'Public (13?45 seats)']
motorcycle_types = ['Motorcycle', 'Bajaj', 'Motorcycle (below 400cc)']
other_types =['Ridden horse','Other','Special vehicle','Turbo','Bicycle']

def simplify_vehicle_type(v):
    if v in car_types:
        return 'car'
    elif v in lorry_types:
        return 'lorry'
    elif v in bus_types:
        return 'bus'
    elif v in motorcycle_types:
        return 'motorcycle'
    elif v in other_types:
        return 'other'
    else:
        return 'other'
df['type_of_vehicle'] = df['type_of_vehicle'].apply(simplify_vehicle_type)

In [13]:
# transform categorical features
types_of_junction_to_types_of_junction = {
    "Y Shape":"y_shape",
    "No junction":"no_junction",
    "Crossing":  "crossing",
    "Other":  "other",
    "Unknown":  "unknown",
    "O Shape":"o_shape",
    "T Shape":"t_shape",
    "X Shape":"x_shape",
}
df["types_of_junction"] = df["types_of_junction"].map(types_of_junction_to_types_of_junction)

In [14]:
# transform 'area_accident_occured'
area_to_highway = {
    "Other": "road",
    "Office areas": "service",
    "Residential areas": "residential",
    "Church areas": "service",
    "Industrial areas": "service",
    "School areas": "living_street",
    "Recreational areas": "living_street",
    "Outside rural areas": "unclassified",
    "Hospital areas": "service",
    "Market areas": "living_street",
    "Rural village areas": "tertiary",
    "Unknown": "road",
    "Rural village areasOffice areas": "road",  # Inconsistent value
    # Possible trimmed version to cover any leading/trailing whitespace
    "  Recreational areas": "living_street",
    "  Market areas": "living_street"
}
df["area_accident_occured"] = df["area_accident_occured"].dropna()
df["area_accident_occured"] = df["area_accident_occured"].map(area_to_highway)

In [15]:
# transform 'lanes_or_medians'
df["lanes_or_medians"] = df["lanes_or_medians"].apply(
    lambda x: "two_way" if 'two way' in x.lower() or 'two-way' in x.lower()
    else "one_way" if 'double carriageway' in x.lower()
    else x.strip().replace(" ", "_").lower() 
)

In [16]:
# transform 'weather_conditions'
df["weather_conditions"] = df["weather_conditions"].apply(
    lambda x: "rain" if "rain" in x.lower() else "no_rain"
)

In [17]:
# transfrom 'accident_severity' # target
df['accident_severity'] = df['accident_severity'].str.lower()
df['accident_severity'] = df['accident_severity'].map(lambda x: x.split(' ')[0] if ' ' in x else x)


In [18]:
df.describe(include='all')
df.info()
for col in columns:
    print(df[col].unique())  # Check unique values before encoding

<class 'pandas.core.frame.DataFrame'>
Index: 2689 entries, 8 to 12315
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   day_of_week            2689 non-null   object
 1   age_band_of_driver     2689 non-null   object
 2   type_of_vehicle        2689 non-null   object
 3   area_accident_occured  2195 non-null   object
 4   lanes_or_medians       2689 non-null   object
 5   types_of_junction      2689 non-null   object
 6   weather_conditions     2689 non-null   object
 7   accident_severity      2689 non-null   object
dtypes: object(8)
memory usage: 189.1+ KB
['friday' 'thursday' 'tuesday' 'wednesday' 'saturday' 'sunday' 'monday']
['18-30' '31-50' '>51' 'unknown']
['lorry' 'car' 'bus' 'other' 'motorcycle']
[nan 'residential' 'service' 'living_street' 'tertiary' 'road']
['other' 'two_way' 'one_way' 'unknown']
['y_shape' 'no_junction' 'crossing' 'o_shape' 'other' 'unknown' 't_shape'
 'x_shape']
['no_r

In [None]:
# Encode features and target
y = df['accident_severity']
X = df.drop('accident_severity', axis=1)

label_encoders = {}
for col in X.columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

le_target = LabelEncoder()
y = le_target.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_resampled, y_resampled)
# Initialize Balanced Random Forest
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model
brf_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = brf_model.predict(X_test)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le_target.classes_))

              precision    recall  f1-score   support

       fatal       0.03      0.25      0.05         4
     serious       0.15      0.29      0.20        62
      slight       0.89      0.73      0.80       472

    accuracy                           0.67       538
   macro avg       0.36      0.42      0.35       538
weighted avg       0.80      0.67      0.73       538

