#### 1. Load required libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

import wandb
import params

from feature_engine.encoding import OrdinalEncoder
from sklearn.model_selection import StratifiedShuffleSplit

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter presumably also hides scikit-learn's
# warnings about ill-defined precision when a class is never predicted --
# confirm that hiding them is intended.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show every column when displaying wide DataFrames (None = no column limit).
# Uses the documented top-level `pd.set_option` rather than reaching it through
# the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

#### 2. Load dataset

In [2]:
# CSV locations, relative to the notebook's working directory.
raw_csv_path = 'data/RTA Dataset.csv'
transformed_csv_path = 'data/RTA Dataset Transformed.csv'

# Raw export plus the already-transformed version used for modelling below.
raw_rta_data = pd.read_csv(raw_csv_path)
rta_data = pd.read_csv(transformed_csv_path)

# Quick sanity check: dimensions, then a preview of the first rows.
print(rta_data.shape)
rta_data.head()

(12316, 32)


Unnamed: 0,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,Defect_of_vehicle,Area_accident_occured,Lanes_or_Medians,Road_allignment,Types_of_Junction,Road_surface_type,Road_surface_conditions,Light_conditions,Weather_conditions,Type_of_collision,Number_of_vehicles_involved,Number_of_casualties,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity,Hour
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,Slight Injury,17
1,0,1,0,1,0,1,1,0,1,0,1,1,0,0,0,0,0,0,1,2,2,0,0,0,0,0,0,0,0,1,Slight Injury,17
2,0,0,0,1,0,0,2,0,2,0,2,2,1,0,0,0,0,0,2,2,2,0,1,1,1,1,1,0,0,2,Serious Injury,17
3,1,0,0,1,0,2,1,1,2,0,1,2,2,1,1,0,1,0,1,2,2,0,2,2,2,1,1,1,0,3,Slight Injury,1
4,1,0,0,1,0,3,3,0,1,0,3,2,0,1,0,0,1,0,1,2,2,0,0,0,0,0,0,0,0,1,Slight Injury,1


In [3]:
# Start a Weights & Biases run; project and entity come from the local `params` module.
run = wandb.init(
    project=params.WANDB_PROJECT,
    entity=params.ENTITY,
    job_type="training",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkrishnatasya[0m ([33mblack-order[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Wrap both DataFrames as W&B tables so they can be attached to artifacts.
raw_table = wandb.Table(dataframe=raw_rta_data)
transformed_table = wandb.Table(dataframe=rta_data)

# One artifact per dataset stage; they are uploaded later via run.log_artifact.
raw_data_at = wandb.Artifact(params.RAW_DATA_AT, type='raw_data')
raw_data_at.add(raw_table, 'raw_data')

transformed_data_at = wandb.Artifact(params.TRANSFORMED_DATA_AT, type='transformed_data')
transformed_data_at.add(transformed_table, 'transformed_data')

ArtifactManifestEntry(path='transformed_data.table.json', digest='V7euZRuWFwhqRh1vx3MjJw==', ref=None, birth_artifact_id=None, size=1402448, extra={}, local_path='C:\\Users\\krish\\AppData\\Local\\wandb\\wandb\\artifacts\\staging\\tmpyhzroqhu')

#### 3. Encoding the target variable

In [5]:
# Replace the string severity labels with integer codes.
# NOTE(review): with encoding_method='arbitrary' the label -> integer mapping is
# chosen by feature_engine; inspect target_encoder.encoder_dict_ to confirm it.
target_encoder = OrdinalEncoder(encoding_method='arbitrary', variables='Accident_severity')
rta_data = target_encoder.fit_transform(rta_data)

# 'Unnamed: 0' is the row index that was saved into the CSV (0, 1, 2, ... in the
# preview above). It carries no information about the accident, so it must not
# be used as a feature. errors='ignore' keeps this cell working if the CSV is
# later loaded without that column.
X = rta_data.drop(columns=['Accident_severity', 'Unnamed: 0'], errors='ignore').values
y = rta_data['Accident_severity'].values

#### 4. Splitting the dataset into train, valid and test sets

In [6]:
# Stage 1: stratified 80/20 split into train and a temporary hold-out set.
# n_splits=1, so the split generator yields exactly one (train, hold-out) index pair.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, temp_index = next(sss.split(X, y))

X_train, y_train = X[train_index], y[train_index]
X_temp, y_temp = X[temp_index], y[temp_index]

# Stage 2: halve the hold-out set (again stratified) into validation and test.
sss_valid_test = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
valid_index, test_index = next(sss_valid_test.split(X_temp, y_temp))

X_valid, y_valid = X_temp[valid_index], y_temp[valid_index]
X_test, y_test = X_temp[test_index], y_temp[test_index]

# Report the resulting shapes.
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_valid: {X_valid.shape}, y_valid: {y_valid.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (9852, 31), y_train: (9852,)
X_valid: (1232, 31), y_valid: (1232,)
X_test: (1232, 31), y_test: (1232,)


#### 5. Model Building

##### 5.1 Decision Tree Classifier

In [7]:
# Depth-2 decision tree; fit() hands back the fitted estimator.
decision_tree = DecisionTreeClassifier(max_depth=2, random_state=2022).fit(X_train, y_train)

# Predict on the validation split and keep the full report for later logging.
decision_tree_predictions = decision_tree.predict(X_valid)
decision_tree_report = classification_report(
    y_valid,
    decision_tree_predictions,
    output_dict=True,
)

decision_tree_valid_accuracy = accuracy_score(y_valid, decision_tree_predictions)
decision_tree_valid_confusion = confusion_matrix(y_valid, decision_tree_predictions)
print(f"Accuracy: {decision_tree_valid_accuracy}")
print(f"Confusion Matrix: \n{decision_tree_valid_confusion}")

Accuracy: 0.8457792207792207
Confusion Matrix: 
[[1039    3    0]
 [ 171    3    0]
 [  16    0    0]]


In [8]:
# Evaluate the already-fitted decision tree on the held-out test split.
decision_tree_predictions_test = decision_tree.predict(X_test)
decision_tree_report_test = classification_report(
    y_test,
    decision_tree_predictions_test,
    output_dict=True,
)

decision_tree_test_accuracy = accuracy_score(y_test, decision_tree_predictions_test)
decision_tree_test_confusion = confusion_matrix(y_test, decision_tree_predictions_test)
print(f"Accuracy: {decision_tree_test_accuracy}")
print(f"Confusion Matrix: \n{decision_tree_test_confusion}")

Accuracy: 0.849025974025974
Confusion Matrix: 
[[1042    0    0]
 [ 171    4    0]
 [  15    0    0]]


##### 5.2 Random Forest Classifier

In [9]:
# Random forest of depth-2 trees; fit() hands back the fitted estimator.
random_forest = RandomForestClassifier(max_depth=2, random_state=2022).fit(X_train, y_train)

# Predict on the validation split and keep the full report for later logging.
random_forest_predictions = random_forest.predict(X_valid)
random_forest_report = classification_report(
    y_valid,
    random_forest_predictions,
    output_dict=True,
)

random_forest_valid_accuracy = accuracy_score(y_valid, random_forest_predictions)
random_forest_valid_confusion = confusion_matrix(y_valid, random_forest_predictions)
print(f"Accuracy: {random_forest_valid_accuracy}")
print(f"Confusion Matrix: \n{random_forest_valid_confusion}")

Accuracy: 0.8457792207792207
Confusion Matrix: 
[[1042    0    0]
 [ 174    0    0]
 [  16    0    0]]


In [10]:
# Evaluate the already-fitted random forest on the held-out test split.
random_forest_predictions_test = random_forest.predict(X_test)
random_forest_report_test = classification_report(
    y_test,
    random_forest_predictions_test,
    output_dict=True,
)

random_forest_test_accuracy = accuracy_score(y_test, random_forest_predictions_test)
random_forest_test_confusion = confusion_matrix(y_test, random_forest_predictions_test)
print(f"Accuracy: {random_forest_test_accuracy}")
print(f"Confusion Matrix: \n{random_forest_test_confusion}")

Accuracy: 0.8457792207792207
Confusion Matrix: 
[[1042    0    0]
 [ 175    0    0]
 [  15    0    0]]


##### 5.3 XGBoost Classifier

In [11]:
# Gradient-boosted trees of depth 2; fit() hands back the fitted estimator.
xgboost = xgb.XGBClassifier(max_depth=2, random_state=2022).fit(X_train, y_train)

# Predict on the validation split and keep the full report for later logging.
xgboost_predictions = xgboost.predict(X_valid)
xgboost_report = classification_report(
    y_valid,
    xgboost_predictions,
    output_dict=True,
)

xgboost_valid_accuracy = accuracy_score(y_valid, xgboost_predictions)
xgboost_valid_confusion = confusion_matrix(y_valid, xgboost_predictions)
print(f"Accuracy: {xgboost_valid_accuracy}")
print(f"Confusion Matrix: \n{xgboost_valid_confusion}")

Accuracy: 0.8474025974025974
Confusion Matrix: 
[[1037    5    0]
 [ 167    7    0]
 [  16    0    0]]


In [12]:
# Evaluate the already-fitted XGBoost model on the held-out test split.
xgboost_predictions_test = xgboost.predict(X_test)
xgboost_report_test = classification_report(
    y_test,
    xgboost_predictions_test,
    output_dict=True,
)

xgboost_test_accuracy = accuracy_score(y_test, xgboost_predictions_test)
xgboost_test_confusion = confusion_matrix(y_test, xgboost_predictions_test)
print(f"Accuracy: {xgboost_test_accuracy}")
print(f"Confusion Matrix: \n{xgboost_test_confusion}")

Accuracy: 0.851461038961039
Confusion Matrix: 
[[1042    0    0]
 [ 168    7    0]
 [  15    0    0]]


In [13]:
# Upload both dataset artifacts to the run.
for dataset_artifact in (raw_data_at, transformed_data_at):
    run.log_artifact(dataset_artifact)

# Logged metric suffix -> key inside the report's 'macro avg' entry.
macro_avg_keys = {"precision": "precision", "recall": "recall", "f1": "f1-score"}

# Classification reports per model, for the validation and the test split.
validation_reports = {
    "decision_tree": decision_tree_report,
    "random_forest": random_forest_report,
    "xgboost": xgboost_report,
}
test_reports = {
    "decision_tree": decision_tree_report_test,
    "random_forest": random_forest_report_test,
    "xgboost": xgboost_report_test,
}

# Validation metrics: overall accuracy plus macro-averaged precision/recall/F1.
validation_metrics = {}
for model_name, report in validation_reports.items():
    validation_metrics[f"{model_name}_accuracy"] = report['accuracy']
    for suffix, report_key in macro_avg_keys.items():
        validation_metrics[f"{model_name}_{suffix}"] = report['macro avg'][report_key]
run.log(validation_metrics)

# Test metrics: same layout, keys carry a "_test" suffix; logged in a second call.
test_metrics = {}
for model_name, report in test_reports.items():
    test_metrics[f"{model_name}_accuracy_test"] = report['accuracy']
    for suffix, report_key in macro_avg_keys.items():
        test_metrics[f"{model_name}_{suffix}_test"] = report['macro avg'][report_key]
run.log(test_metrics)

run.finish()

0,1
decision_tree_accuracy,▁
decision_tree_accuracy_test,▁
decision_tree_f1,▁
decision_tree_f1_test,▁
decision_tree_precision,▁
decision_tree_precision_test,▁
decision_tree_recall,▁
decision_tree_recall_test,▁
random_forest_accuracy,▁
random_forest_accuracy_test,▁

0,1
decision_tree_accuracy,0.84578
decision_tree_accuracy_test,0.84903
decision_tree_f1,0.31652
decision_tree_f1_test,0.32092
decision_tree_precision,0.44916
decision_tree_precision_test,0.61618
decision_tree_recall,0.33812
decision_tree_recall_test,0.34095
random_forest_accuracy,0.84578
random_forest_accuracy_test,0.84578
