In [1]:
# data manipulation
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.ioff() # Turn off interactive mode

# data separation/transformation
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE # Recursive Feature Elimination¶

# modeling
from sklearn.cluster import KMeans
import sklearn.preprocessing

# statistics testing
import scipy.stats as stats

# system manipulation
from itertools import combinations
import os
import sys
sys.path.append("./util_")
import prepare_
import explore_
import classification_

# other
import math
import env
import warnings
warnings.filterwarnings("ignore")

# set the random seed
np.random.seed(95)

**Get data**

In [2]:
# This data is already been split and save
# This is only training data
train = pd.read_csv("./00_project_data/1-1_training_data.csv", index_col=0)
train = train.reset_index(drop=True)
train.head()

Unnamed: 0,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,contributing_factor_vehicle_2,...,vehicle_type_code_5_trailer,vehicle_type_code_5_truck,vehicle_type_code_5_unknown,vehicle_type_code_5_van,borough_good_bronx,borough_good_brooklyn,borough_good_manhattan,borough_good_queens,borough_good_staten island,borough_good_unknown
0,0,0,0,0,0,0,0,0,vehicle failure,uncertain unspecified,...,0,0,1,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,unsafe driving,uncertain unspecified,...,0,0,1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,driver,uncertain unspecified,...,0,0,1,0,0,0,0,0,0,1
3,1,0,0,0,1,0,0,0,visibility,driver experience,...,0,0,1,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,unsafe driving,uncertain unspecified,...,0,0,1,0,0,1,0,0,0,0


## Feature Selection

**Set features for selection**

In [3]:
# Feature columns
train_cols = train.iloc[:,22:].sample(1000)

# remove dommies which contain unknown or uncertain
pattern = "|".join(["unknown", "uncertain", "other"])
good_cols = train_cols.columns[~train_cols.columns.str.contains(pattern,case=False)]
train_cols = train_cols[good_cols]

# Separate quality column and wine_clr column from the x_features
feature_cols = train_cols.columns[~train_cols.columns.isin(["fatality", "crash_time", "crash_date"])]

feature_cols[:5]

Index(['contributing_factor_vehicle_1_driver experience',
       'contributing_factor_vehicle_1_impairment',
       'contributing_factor_vehicle_1_reactions',
       'contributing_factor_vehicle_1_unsafe driving',
       'contributing_factor_vehicle_1_vehicle failure'],
      dtype='object')

In [4]:
# separate features from target
x_features = train[feature_cols]
y_target = train["fatality"]
x_features.head(2)

Unnamed: 0,contributing_factor_vehicle_1_driver experience,contributing_factor_vehicle_1_impairment,contributing_factor_vehicle_1_reactions,contributing_factor_vehicle_1_unsafe driving,contributing_factor_vehicle_1_vehicle failure,contributing_factor_vehicle_1_visibility,contributing_factor_vehicle_2_distractions,contributing_factor_vehicle_2_driver,contributing_factor_vehicle_2_driver experience,contributing_factor_vehicle_2_impairment,...,vehicle_type_code_5_motorcycle,vehicle_type_code_5_scooter,vehicle_type_code_5_trailer,vehicle_type_code_5_truck,vehicle_type_code_5_van,borough_good_bronx,borough_good_brooklyn,borough_good_manhattan,borough_good_queens,borough_good_staten island
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### RFE

Use recursive feature elimination to select the top 30 features for predicting wine quality.

In [5]:
# make a model object to use in RFE process.
linear_model = LinearRegression()

# MAKE the RFE object
rfe = RFE(linear_model, n_features_to_select=1)

# FIT the RFE object to the training data
rfe.fit(x_features, y_target)

# odered from most important to least important
rfe.ranking_

array([69, 40, 48, 58, 72, 56, 53, 62, 68, 47, 74, 64, 61, 70, 19, 51, 46,
       39, 50, 44, 63, 43, 18, 60, 26, 41, 24, 28, 27, 29, 16, 13, 14,  9,
        4, 17, 11, 12, 49, 73, 20, 21, 55, 54, 75, 33, 32, 71, 76, 31, 35,
       34, 42, 52,  7,  3, 25, 23, 45,  8, 59, 38,  2, 22, 37, 30, 36, 10,
        1,  5, 78,  6, 15, 65, 67, 57, 66, 77])

In [6]:
# get a dataframe of the top 5 columns orderd by importance
x_feature_selected = x_features.iloc[:, rfe.ranking_[0:20] - 1] # -1 beacuse rfe rank starts at 1
x_feature_selected.head(3)

# More important to least important

Unnamed: 0,vehicle_type_code_5_motorcycle,vehicle_type_code_1_car,vehicle_type_code_2_motorcycle,vehicle_type_code_3_truck,vehicle_type_code_5_truck,vehicle_type_code_3_scooter,vehicle_type_code_3_bus,vehicle_type_code_4_motorcycle,vehicle_type_code_5_car,vehicle_type_code_2_car,borough_good_bronx,vehicle_type_code_4_trailer,vehicle_type_code_4_car,vehicle_type_code_5_scooter,contributing_factor_vehicle_3_reactions,vehicle_type_code_2_truck,vehicle_type_code_2_bus,vehicle_type_code_1_bus,vehicle_type_code_2_trailer,vehicle_type_code_1_truck
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


## Modeling

#### KNN

In [7]:
teat = x_feature_selected.sample(1000)
tar = train.fatality.sample(1000)

In [14]:
plt.ioff() # Turn off interactive mode
classification_.get_test_best_model_(best_model=None, xtrain=teat, xtest=teat, ytrain=tar,ytest=tar, k_depth_tree_c_=3)


'You need to pass a (k, depth, tree, or c) for your best performing model'