## Imports

In [1]:
# DO NOT CHANGE THESE LINES.
import os
import pandas as pd
import json
import warnings
from joblib import load
warnings.filterwarnings('ignore')

## Paths

In [2]:
# DO NOT CHANGE THESE LINES.
# Every location below is derived from the parent of the current working directory.
ROOT_DIR = os.path.dirname(os.getcwd())
MODEL_INPUTS_OUTPUTS = os.path.join(ROOT_DIR, 'model_inputs_outputs/')
# Inputs: the schema directory and the data directory (training / testing).
INPUT_DIR = os.path.join(MODEL_INPUTS_OUTPUTS, "inputs")
OUTPUT_DIR = os.path.join(MODEL_INPUTS_OUTPUTS, "outputs")
INPUT_SCHEMA_DIR = os.path.join(INPUT_DIR, "schema")
DATA_DIR = os.path.join(INPUT_DIR, "data")
TRAIN_DIR = os.path.join(DATA_DIR, "training")
TEST_DIR = os.path.join(DATA_DIR, "testing")
# Model artifacts: joblib files for the one-hot encoder, label encoder,
# predictor and imputation values.
MODEL_PATH = os.path.join(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = os.path.join(MODEL_PATH, "artifacts")
OHE_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'ohe.joblib')
LABEL_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'label_encoder.joblib')
PREDICTOR_DIR_PATH = os.path.join(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = os.path.join(PREDICTOR_DIR_PATH, "predictor.joblib")
IMPUTATION_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'imputation.joblib')
# Outputs: the predictions CSV is written under outputs/predictions.
PREDICTIONS_DIR = os.path.join(OUTPUT_DIR, 'predictions')
PREDICTIONS_FILE = os.path.join(PREDICTIONS_DIR, 'predictions.csv')
# Create the predictions directory if it does not exist yet.
if not os.path.exists(PREDICTIONS_DIR):
    os.makedirs(PREDICTIONS_DIR)

### Reading the schema

In [3]:
# Load the schema JSON: the first entry listed in the schema directory is used.
schema_files = os.listdir(INPUT_SCHEMA_DIR)
file_name = schema_files[0]
schema_path = os.path.join(INPUT_SCHEMA_DIR, file_name)
with open(schema_path, "r", encoding="utf-8") as file:
    # NOTE: the variable name `schmea` (sic) is kept unchanged in case other
    # cells reference it.
    schmea = json.load(file)
features = schmea['features']

# Split the feature names by declared data type; anything that is not
# 'CATEGORICAL' is treated as numeric.
categorical_features = [
    feature['name'] for feature in features if feature['dataType'] == 'CATEGORICAL'
]
numeric_features = [
    feature['name'] for feature in features if feature['dataType'] != 'CATEGORICAL'
]

# Names of the id and target columns as declared by the schema.
id_feature = schmea['id']['name']
target_feature = schmea['target']['name']

### Reading test data.

In [4]:
# Read the test set: the first entry listed in the testing directory is used.
test_files = os.listdir(TEST_DIR)
file_name = test_files[0]
file_path = os.path.join(TEST_DIR, file_name)
df = pd.read_csv(file_path)
# Preview the first five rows.
df.head(5)

Unnamed: 0,u_id,fatals,a_ct,a_ped_f,a_pedal_f,a_roll,a_hr,a_polpur,month,day,...,a_body,owner,deaths,numoccs,impact1,deformed,ve_forms,ve_total,weather,lgt_cond
0,41633,1,Single-Vehicle Crash,Other Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,10,7,...,"Motorcycles, Mopeds, All-Terrain Vehicles; All...",Driver (in this crash) Was Registered Owner,1,1.0,,Minor damage,1,3,Clear,Dark - not lighted
1,38966,1,Single-Vehicle Crash,Other Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,6,9,...,Automobiles,Driver (in this crash) Was Registered Owner,1,1.0,,Disabling damage,1,1,Clear,Dark - lighted
2,24020,1,Single-Vehicle Crash,Pedestrian Fatality Involved Crash,Other Crash,Other Crash,Yes - Hit and Run,Other Crash,10,15,...,Van-Based Light Trucks,Driver (in this crash) Not Registered Owner (o...,0,1.0,Clockpoint 1,Minor damage,1,1,Clear,Dark - not lighted
3,52280,1,Single-Vehicle Crash,Other Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,2,14,...,"Motorcycles, Mopeds, All-Terrain Vehicles; All...",Driver (in this crash) Not Registered Owner (o...,1,1.0,,Disabling damage,1,1,Clear,Dark - not lighted
4,47480,1,Single-Vehicle Crash,Other Crash,Pedalcyclist Fatality Involved Crash,Other Crash,No - Hit and Run,Other Crash,5,2,...,Utility Vehicles,Driver (in this crash) Was Registered Owner,0,1.0,Clockpoint 12,,1,1,Clear,Dark - lighted


## Data preprocessing
Note that when we work with testing data, we have to impute using the same values learned during training. This is to avoid data leakage.

In [5]:
# Fill missing values in the test data with the imputation values that were
# saved during training (no statistics are computed from the test data).
imputaion_values = load(IMPUTATION_FILE)
columns_with_missing_values = df.columns[df.isna().any()]
for column in columns_with_missing_values:
    # Assign the filled column back to the frame. Calling
    # `df[column].fillna(..., inplace=True)` operates on an intermediate
    # object: it emits a chained-assignment warning in recent pandas
    # versions and does not update `df` when Copy-on-Write is enabled.
    df[column] = df[column].fillna(imputaion_values[column])


### Encoding
We encode the data using the same encoder that we saved during training.

In [6]:
# Keep the id column aside; it is attached to the predictions later.
ids = df[id_feature]

# Remove the id from the feature frame before encoding.
df = df.drop(columns=[id_feature])

# Encode the remaining features with the encoder saved during training.
# The encoder must NOT be re-fitted here: fitting on the test data would
# replace the categories learned during training, which leaks test
# information and can yield encoded columns that differ from the ones the
# model was trained on.
encoder = load(OHE_ENCODER_FILE)
df = encoder.transform(df)


### Making predictions
Using the model saved during training. Notice that the model outputs a 2D array with one row per test sample and 3 columns. <br>
Each row in the array holds the prediction for one sample in the test data. Each of the 3 numbers in a row is the predicted probability of one of the 3 classes in the original problem.

In [7]:
# Load the predictor saved during training.
model = load(PREDICTOR_FILE_PATH)
# One row of class probabilities per row of the encoded test data.
predictions = model.predict_proba(df)

# Display the resulting probability array.
predictions

array([[0.25836751, 0.58912321, 0.15250928],
       [0.46768429, 0.38508228, 0.14723343],
       [0.01852543, 0.96843632, 0.01303825],
       ...,
       [0.05026168, 0.90245021, 0.04728812],
       [0.10367372, 0.81905081, 0.07727547],
       [0.39319372, 0.39236345, 0.21444283]])

### Getting the original labels.
To get the original labels back, we use the same encoder from the training phase. <br><br>
Instead of calling the transform() function, this time we use inverse_transform().<br><br>

This will convert the encoded labels [ 0, 1, 2 ] to the original labels [ drunk_driver_involved, other, speeding_driver_involved ].

In [8]:
# Load the label encoder saved during training.
encoder = load(LABEL_ENCODER_FILE)

# Map the encoded class indices back to the original label names. This
# assumes three classes encoded as 0, 1 and 2, matching the three
# probability columns produced by the model.
class_names = encoder.inverse_transform([0, 1, 2])

predictions = pd.DataFrame(predictions, columns=class_names)
# Name the id column after the schema's id field instead of hard-coding it.
predictions.insert(0, id_feature, ids)
# index=False keeps the row index out of the file; otherwise it is written
# as an extra unnamed first column.
predictions.to_csv(PREDICTIONS_FILE, index=False)
predictions


Unnamed: 0,u_id,drunk_driver_involved,other,speeding_driver_involved
0,41633,0.258368,0.589123,0.152509
1,38966,0.467684,0.385082,0.147233
2,24020,0.018525,0.968436,0.013038
3,52280,0.394018,0.427437,0.178545
4,47480,0.205881,0.598951,0.195169
...,...,...,...,...
11317,45653,0.048493,0.912324,0.039183
11318,46252,0.448486,0.250678,0.300836
11319,38231,0.050262,0.902450,0.047288
11320,6491,0.103674,0.819051,0.077275
