## Imports

In [140]:
# DO NOT CHANGE THESE LINES.
import os
import json
import pandas as pd
import warnings
from sklearn.linear_model import LogisticRegression
from feature_engine.encoding import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from joblib import dump
warnings.filterwarnings('ignore')

## Paths

In [141]:
# DO NOT CHANGE THESE LINES.
# All locations are derived from ROOT_DIR, the parent of the current working
# directory, so the notebook must be launched from a sub-directory of the project.
ROOT_DIR = os.path.dirname(os.getcwd())
MODEL_INPUTS_OUTPUTS = os.path.join(ROOT_DIR, 'model_inputs_outputs/')
# Inputs: <MODEL_INPUTS_OUTPUTS>/inputs/{schema, data/{training, testing}}
INPUT_DIR = os.path.join(MODEL_INPUTS_OUTPUTS, "inputs")
INPUT_SCHEMA_DIR = os.path.join(INPUT_DIR, "schema")
DATA_DIR = os.path.join(INPUT_DIR, "data")
TRAIN_DIR = os.path.join(DATA_DIR, "training")
TEST_DIR = os.path.join(DATA_DIR, "testing")
# Outputs: every fitted object is written under <MODEL_INPUTS_OUTPUTS>/model/artifacts
MODEL_PATH = os.path.join(MODEL_INPUTS_OUTPUTS, "model")
MODEL_ARTIFACTS_PATH = os.path.join(MODEL_PATH, "artifacts")
# One joblib file per fitted preprocessing step.
OHE_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'ohe.joblib')
LABEL_ENCODER_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'label_encoder.joblib')
# The trained classifier is stored in its own "predictor" sub-directory.
PREDICTOR_DIR_PATH = os.path.join(MODEL_ARTIFACTS_PATH, "predictor")
PREDICTOR_FILE_PATH = os.path.join(PREDICTOR_DIR_PATH, "predictor.joblib")
IMPUTATION_FILE = os.path.join(MODEL_ARTIFACTS_PATH, 'imputation.joblib')


### Reading the schema


In [142]:
# Locate the schema file. os.listdir() returns entries in arbitrary order and
# also lists hidden files (e.g. .gitkeep, .DS_Store), so skip dot-files and sort
# to make the choice deterministic instead of blindly taking element [0].
schema_files = sorted(
    name for name in os.listdir(INPUT_SCHEMA_DIR) if not name.startswith(".")
)
if not schema_files:
    raise FileNotFoundError(f"No schema file found in {INPUT_SCHEMA_DIR}")
file_name = schema_files[0]
schema_path = os.path.join(INPUT_SCHEMA_DIR, file_name)
with open(schema_path, "r", encoding="utf-8") as file:
    schema = json.load(file)
# Misspelled legacy name, kept so any cell that still refers to it keeps working.
schmea = schema
features = schema['features']

# Split the feature names by their declared data type: 'CATEGORICAL' entries are
# categorical, every other data type is treated as numeric.
categorical_features = [f['name'] for f in features if f['dataType'] == 'CATEGORICAL']
numeric_features = [f['name'] for f in features if f['dataType'] != 'CATEGORICAL']

### Reading training data

In [143]:
# Pick the training file deterministically: os.listdir() gives no ordering
# guarantee and includes hidden files, so ignore dot-files and sort before
# taking the first entry.
training_files = sorted(
    name for name in os.listdir(TRAIN_DIR) if not name.startswith(".")
)
if not training_files:
    raise FileNotFoundError(f"No training data file found in {TRAIN_DIR}")
file_name = training_files[0]
file_path = os.path.join(TRAIN_DIR, file_name)
df = pd.read_csv(file_path)
# Preview the first rows of the raw training data.
df.head()

Unnamed: 0,u_id,fatals,a_ct,a_ped_f,a_pedal_f,a_roll,a_hr,a_polpur,month,day,...,owner,deaths,numoccs,impact1,deformed,ve_forms,ve_total,weather,lgt_cond,driver_factor
0,32083,1,Single-Vehicle Crash,Other Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,10,2,...,Driver (in this crash) Was Registered Owner,1,1.0,Clockpoint 12,Disabling damage,1,1,Clear,Dark - not lighted,other
1,55073,1,Single-Vehicle Crash,Other Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,6,21,...,Driver (in this crash) Not Registered Owner (o...,1,1.0,Clockpoint 1,Disabling damage,1,1,Clear,Daylight,speeding_driver_involved
2,7458,1,Single-Vehicle Crash,Other Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,7,14,...,Driver (in this crash) Was Registered Owner,0,1.0,Clockpoint 12,,1,1,Clear,Daylight,other
3,5685,1,Single-Vehicle Crash,Other Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,9,15,...,Driver (in this crash) Not Registered Owner (o...,0,1.0,Clockpoint 12,Functional damage,1,1,Clear,Dark - not lighted,other
4,9245,1,Single-Vehicle Crash,Other Crash,Other Crash,Other Crash,No - Hit and Run,Other Crash,9,28,...,Driver (in this crash) Was Registered Owner,1,1.0,Clockpoint 9,Disabling damage,1,1,Clear,Dark - not lighted,drunk_driver_involved


## Data Preprocessing

##### Imputing missing data

In [144]:
# Impute every column that contains missing values with its mode (most frequent
# value), recording the fill value used for each column.
columns_with_missing_values = df.columns[df.isna().any()]
imputation_values = {}
for column in columns_with_missing_values:
    mode = df[column].mode()[0]
    # Assign the filled column back explicitly. Calling fillna(inplace=True) on
    # the df[column] selection is chained assignment: it is deprecated in
    # pandas 2.x and leaves df untouched when Copy-on-Write is enabled.
    df[column] = df[column].fillna(mode)
    imputation_values[column] = mode
# Misspelled legacy name, kept so any cell that still refers to it keeps working.
imputaion_values = imputation_values

# Persist the fill values (presumably so the same imputation can be replayed at
# prediction time). Create the artifacts directory first so dump() cannot fail
# on a fresh project layout.
os.makedirs(os.path.dirname(IMPUTATION_FILE), exist_ok=True)
path = dump(imputation_values, IMPUTATION_FILE)

##### Encoding Categorical features
> Note that we do not want to encode the target feature or the id column.

In [145]:
# Keep the identifier and the label in their own variables, then remove both
# from the feature frame so that neither of them gets one-hot encoded.
id_column, target_column = 'u_id', 'driver_factor'
ids, target = df[id_column], df[target_column]
df.drop(columns=[id_column, target_column], inplace=True)

# Cast every categorical feature to string before encoding.
df = df.astype({name: str for name in categorical_features})

# Fit the one-hot encoder (top_categories=6) on the features and apply it.
encoder = OneHotEncoder(top_categories=6)
df = encoder.fit(df).transform(df)

# Persist the fitted encoder so the testing dataset can be encoded identically.
path = dump(encoder, OHE_ENCODER_FILE)


#### Encoding the target feature

In [146]:
# Peek at the raw string class labels before they are integer-encoded.
target.values

array(['other', 'speeding_driver_involved', 'other', ..., 'other',
       'other', 'drunk_driver_involved'], dtype=object)

In [147]:
# Map the string class labels to integer codes. LabelEncoder expects a 1-D
# array of labels; reshaping to a (-1, 1) column vector only works because
# scikit-learn flattens it again while emitting a DataConversionWarning (hidden
# here by the global warnings filter), so pass the 1-D values directly.
encoder = LabelEncoder()
y = encoder.fit_transform(target.values)
# Save the fitted label encoder alongside the other artifacts.
dump(encoder, LABEL_ENCODER_FILE)
# Display the encoded labels.
y

array([1, 2, 1, ..., 1, 1, 0])

In [148]:
# Inspect the encoded feature frame that is passed to the classifier below.
df

Unnamed: 0,fatals,month,day,hour,minute,age,permvit,pernotmvit,mod_year,deaths,...,weather_Rain,"weather_Fog, smog, smoke",weather_Snow,weather_Reported as unknown,lgt_cond_Daylight,lgt_cond_Dark - not lighted,lgt_cond_Dark - lighted,lgt_cond_Dawn,lgt_cond_Dark - unknown lighting,lgt_cond_Dusk
0,1,10,2,3.0,10.0,62,1,0,2003.0,1,...,0,0,0,0,0,1,0,0,0,0
1,1,6,21,8.0,45.0,40,1,0,2002.0,1,...,0,0,0,0,1,0,0,0,0,0
2,1,7,14,21.0,45.0,26,1,1,2003.0,0,...,0,0,0,0,1,0,0,0,0,0
3,1,9,15,20.0,46.0,64,1,1,1999.0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,9,28,20.0,24.0,45,1,0,1996.0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45281,1,3,12,20.0,42.0,24,2,1,2015.0,0,...,0,0,0,0,0,0,1,0,0,0
45282,1,5,15,4.0,53.0,18,1,0,1998.0,1,...,0,0,0,0,0,1,0,0,0,0
45283,1,12,5,6.0,15.0,43,1,0,2015.0,1,...,1,0,0,0,0,1,0,0,0,0
45284,1,5,12,23.0,51.0,21,1,0,2010.0,1,...,0,0,0,0,0,0,1,0,0,0


### Training the Classifier

In [149]:
# Fit a logistic regression classifier (library-default hyperparameters) on the
# encoded features `df` and the integer-encoded labels `y`.
model = LogisticRegression()
model.fit(df, y)

# Save the trained model so it can be used for predictions. The predictor lives
# in its own sub-directory that no earlier cell creates, so make sure it exists
# before dumping; otherwise dump() raises FileNotFoundError on a fresh layout.
os.makedirs(os.path.dirname(PREDICTOR_FILE_PATH), exist_ok=True)
path = dump(model, PREDICTOR_FILE_PATH)
