In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.pipeline import Pipeline,make_pipeline



In [2]:
df = pd.read_csv('titanic.csv')
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
df.drop(['PassengerId','Name','Ticket','Cabin'],axis=1,inplace=True)

In [4]:
X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,1:],
                                                 df.iloc[:,0:1]
                                                ,test_size=0.2,random_state=42)

In [18]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
RFR = RandomForestRegressor()
dict_ = {'n_estimators':[10,20,30,50,80],
          'criterion':['squared_error','absolute_error']}
grid = GridSearchCV(estimator=RFR,param_grid = dict_,cv=10,scoring='accuracy',n_jobs = -1)
grid.fit(X_train,y_train)

100 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\dell\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\dell\Anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 328, in fit
    X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
  File "C:\Users\dell\Anaconda3\lib\site-packages\sklearn\base.py", line 581, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\dell\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 976, in check_X_y
    estimator

ValueError: could not convert string to float: 'male'

In [7]:
#Imputation Transformer
trf1 = ColumnTransformer(transformers = [
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

In [8]:
#one hot encoding
trf2 = ColumnTransformer(transformers = [
    ('ohe_sex_embarked',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [9]:
#Scaling
trf3 = ColumnTransformer(transformers = [
    ('scale',MinMaxScaler(),slice(0,10))
])

In [10]:
#Feature Selection
trf4 = SelectKBest(score_func=chi2,k=5)

In [11]:
# train the model
trf5 = DecisionTreeClassifier()

## Create Pipeline 

In [12]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5),
])

#List of Tuple, Transfomer Name & Object

######  Above code can also be written as pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)


##### If model is also training through pipeline then call fit function. If model is not 
calling through pipeline then call fit_transform function.

In [13]:
from sklearn import set_config
set_config(display='diagram')

In [14]:
pipe.fit(X_train,y_train)

In [15]:
y_pred = pipe.predict(X_test)

In [16]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6256983240223464

### Explore Code

In [22]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=5, score_func=<function chi2 at 0x00000268B4BBEB70>),
 'trf5': DecisionTreeClassifier()}

In [23]:
pipe.named_steps['trf1'].transformers_[0][1].statistics_

array([29.49884615])

In [24]:
pipe.named_steps['trf1'].transformers_[1][1].statistics_

array(['S'], dtype=object)

###  Cross Validation using Pipeline

In [25]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,X_train,y_train,cv=5,scoring='accuracy').mean()

0.6391214419383433

In [26]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,X_train,y_train,cv=10,scoring='accuracy').mean()

0.6391431924882629

### GridSearch using Pipeline

In [27]:
# Gridsearch CV
params = {
    'trf5__max_depth':[1,2,3,4,5,None]
}

In [28]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe,params,cv=5,scoring='accuracy')
grid.fit(X_train,y_train)

In [32]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


In [29]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe,params,cv=10,scoring='accuracy')
grid.fit(X_train,y_train)

In [30]:
grid.best_score_

0.6391431924882629

In [50]:
grid.best_params_

{'trf5__max_depth': 1}

In [31]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
RFR = RandomForestRegressor()
dict_ = {'n_estimators':[10,20,30,50,80],
          'criterion':['squared_error','absolute_error']}
grid = GridSearchCV(estimator=RFR,param_grid = dict_,cv=10,scoring='accuracy',n_jobs = -1)
grid.fit(X_train,y_train)

100 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\dell\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\dell\Anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 328, in fit
    X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
  File "C:\Users\dell\Anaconda3\lib\site-packages\sklearn\base.py", line 581, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\dell\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 976, in check_X_y
    estimator

ValueError: could not convert string to float: 'male'

### Export the Pipeline

In [78]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))

#### Now lets pick model and predict values. It is so easy to pick model if created through pipeline and predict values

In [79]:
import pickle
import numpy as np

In [80]:
pipe = pickle.load(open('pipe.pkl','rb'))

In [81]:
test_input = np.array([2,'male',31,0,0,10.5,'S'],dtype=object).reshape(1,7)

In [82]:
pipe.predict(test_input)

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


array([0], dtype=int64)