In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv("accident.csv")

In [5]:
df.sample(10)

Unnamed: 0,Age,Gender,Speed_of_Impact,Helmet_Used,Seatbelt_Used,Survived
109,41,Female,72.0,Yes,Yes,1
173,49,Male,58.0,Yes,No,1
138,18,Female,77.0,Yes,Yes,0
174,24,Female,119.0,Yes,Yes,0
48,21,Male,43.0,No,No,1
99,32,Female,105.0,No,No,0
123,26,Female,49.0,Yes,Yes,0
54,19,Male,46.0,No,Yes,0
6,38,Male,116.0,Yes,Yes,1
171,53,Male,67.0,Yes,No,0


In [6]:
df.shape

(200, 6)

In [7]:
df.isnull().sum()

Age                0
Gender             0
Speed_of_Impact    3
Helmet_Used        0
Seatbelt_Used      0
Survived           0
dtype: int64

In [14]:
X = df.drop("Survived", axis=1)

In [15]:
X

Unnamed: 0,Age,Gender,Speed_of_Impact,Helmet_Used,Seatbelt_Used
0,56,Female,27.0,No,No
1,69,Female,46.0,No,Yes
2,46,Male,46.0,Yes,Yes
3,32,Male,117.0,No,Yes
4,60,Female,40.0,Yes,Yes
...,...,...,...,...,...
195,69,Female,111.0,No,Yes
196,30,Female,51.0,No,Yes
197,58,Male,110.0,No,Yes
198,20,Male,103.0,No,Yes


In [20]:
y = df["Survived"]

In [21]:
y

0      1
1      1
2      0
3      0
4      0
      ..
195    1
196    1
197    1
198    1
199    1
Name: Survived, Length: 200, dtype: int64

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
X_train.head()

Unnamed: 0,Age,Gender,Speed_of_Impact,Helmet_Used,Seatbelt_Used
79,53,Male,35.0,Yes,No
197,58,Male,110.0,No,Yes
38,61,Female,106.0,Yes,Yes
24,38,Female,25.0,Yes,Yes
122,24,Male,32.0,No,No


In [35]:
X_train.shape

(160, 5)

In [36]:
y_train

79     0
197    1
38     0
24     0
122    1
      ..
106    1
14     0
92     1
179    0
102    1
Name: Survived, Length: 160, dtype: int64

In [23]:
# imputation transformer
trf1 = ColumnTransformer([
    ('Speed_of_Impact',SimpleImputer(strategy='most_frequent'),[2])
],remainder='passthrough')

In [25]:
trf1

In [37]:
# one hot encoding
trf2 = ColumnTransformer([
    ('Helmet_Used',OneHotEncoder(handle_unknown='ignore'),[4]),
    ('Seatbelt_Used',OneHotEncoder(handle_unknown='ignore'),[5]),
    ('Gender',OneHotEncoder(handle_unknown='ignore'),[1])
],remainder='passthrough')

In [38]:
trf2

In [49]:
# Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,11))
])

In [50]:
trf3

In [53]:
# Feature selection
trf4 = SelectKBest(score_func=chi2,k=10)

In [54]:
trf4

In [60]:
# train the model
trf5 = DecisionTreeClassifier()

In [61]:
trf5

In [62]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

## Pipeline Vs make_pipeline
### Pipeline requires naming of steps, make_pipeline does not.

In [65]:
# Alternate Syntax
pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [66]:
# train
pipe.fit(X_train,y_train)

ValueError: all features must be in [0, 4] or [-5, 0]