In [1]:
# data manipulation
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# data separation/transformation
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE # Recursive Feature Elimination¶

# modeling
from sklearn.cluster import KMeans
import sklearn.preprocessing

# statistics testing
import scipy.stats as stats

# system manipulation
from itertools import combinations
import os
import sys
sys.path.append("./util_")
import prepare_
import explore_

# other
import math
import env
import warnings
warnings.filterwarnings("ignore")

# set the random seed
np.random.seed(95)

**Get data**

In [2]:
# This data is already been split and save
# This is only training data
train = pd.read_csv("./00_project_data/1-1_training_data.csv", index_col=0)
train = train.reset_index(drop=True)
train.head()

Unnamed: 0,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,contributing_factor_vehicle_2,...,vehicle_type_code_5_scooter,vehicle_type_code_5_trailer,vehicle_type_code_5_truck,vehicle_type_code_5_van,borough_good_bronx,borough_good_brooklyn,borough_good_manhattan,borough_good_queens,borough_good_staten island,borough_good_unknown
0,0,0,0,0,0,0,0,0,impairment,uncertain unspecified,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,driver,uncertain unspecified,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,uncertain unspecified,uncertain unspecified,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,driver,driver,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,uncertain unspecified,uncertain unspecified,...,0,0,0,0,0,1,0,0,0,0


## Feature Selection

**Set features for selection**

In [5]:
# Feature columns
train_cols = train.iloc[:,22:].sample(1000)

# remove dommies which contain unknown or uncertain
pattern = "|".join(["unknown", "uncertain"])
good_cols = train_cols.columns[~train_cols.columns.str.contains(pattern,case=False)]
train_cols = train_cols[good_cols]

# Separate quality column and wine_clr column from the x_features
feature_cols = train_cols.columns[~train_cols.columns.isin(["fatality", "crash_time", "crash_date"])]

feature_cols[:5]

Index(['contributing_factor_vehicle_1_driver experience',
       'contributing_factor_vehicle_1_impairment',
       'contributing_factor_vehicle_1_other',
       'contributing_factor_vehicle_1_reactions',
       'contributing_factor_vehicle_1_unsafe driving'],
      dtype='object')

In [6]:
# separate features from target
x_features = train[feature_cols]
y_target = train["fatality"]
x_features.head(2)

Unnamed: 0,contributing_factor_vehicle_1_driver experience,contributing_factor_vehicle_1_impairment,contributing_factor_vehicle_1_other,contributing_factor_vehicle_1_reactions,contributing_factor_vehicle_1_unsafe driving,contributing_factor_vehicle_1_vehicle failure,contributing_factor_vehicle_1_visibility,contributing_factor_vehicle_2_distractions,contributing_factor_vehicle_2_driver,contributing_factor_vehicle_2_driver experience,...,vehicle_type_code_5_other.1,vehicle_type_code_5_scooter,vehicle_type_code_5_trailer,vehicle_type_code_5_truck,vehicle_type_code_5_van,borough_good_bronx,borough_good_brooklyn,borough_good_manhattan,borough_good_queens,borough_good_staten island
0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


### RFE

Use recursive feature elimination to select the top 30 features for predicting wine quality.

In [7]:
# make a model object to use in RFE process.
linear_model = LinearRegression()

# MAKE the RFE object
rfe = RFE(linear_model, n_features_to_select=1)

# FIT the RFE object to the training data
rfe.fit(x_features, y_target)

# odered from most important to least important
rfe.ranking_

array([71, 66, 65, 72, 83, 91, 76, 81, 85, 88, 73, 68, 90, 87, 84, 78, 55,
       82, 62, 64, 89, 67, 60, 80, 61, 54, 70, 69, 56, 59, 63, 57, 58, 52,
       50, 51, 48, 46, 53, 47, 49, 41, 42, 38, 45, 43, 44, 37, 40, 39, 33,
       32, 29, 35, 34, 36, 28, 31, 30, 25, 23, 20, 26, 21, 27, 19, 24, 22,
       12, 11, 14, 10, 15, 17, 18, 16, 13,  3,  4,  7,  9,  5,  2,  1,  8,
        6, 75, 77, 74, 79, 86])

In [8]:
# get a dataframe of the top 5 columns orderd by importance
x_feature_selected = x_features.iloc[:, rfe.ranking_[0:20] - 1] # -1 beacuse rfe rank starts at 1
x_feature_selected.head(3)

# More important to least important

Unnamed: 0,vehicle_type_code_4_car,vehicle_type_code_3_trailer,vehicle_type_code_3_scooter,vehicle_type_code_4_motorcycle,vehicle_type_code_5_scooter,borough_good_staten island,vehicle_type_code_4_truck,vehicle_type_code_5_motorcycle,vehicle_type_code_5_truck,borough_good_brooklyn,vehicle_type_code_4_other.1,vehicle_type_code_3_van,borough_good_queens,borough_good_bronx,vehicle_type_code_5_trailer,vehicle_type_code_5_other,vehicle_type_code_2_other.1,vehicle_type_code_5_other.1,vehicle_type_code_3_car,vehicle_type_code_3_other.1
0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1
1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1
2,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,1
