In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [24]:
df=pd.read_csv('train.csv')

In [25]:
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
305,306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
301,302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q
416,417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34.0,1,1,28220,32.5,,S
849,850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C
708,709,1,1,"Cleaver, Miss. Alice",female,22.0,0,0,113781,151.55,,S


In [26]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [27]:
df.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
27,0,1,male,19.0,3,2,263.0,S
244,0,3,male,30.0,0,0,7.225,C
764,0,3,male,16.0,0,0,7.775,S
831,1,2,male,0.83,1,1,18.75,S
811,0,3,male,39.0,0,0,24.15,S


In [28]:
df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [29]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [30]:
X=df.drop(columns='Survived')
y=df['Survived']

In [31]:
X.shape,y.shape

((891, 7), (891,))

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [33]:
X_train.sample(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
770,3,male,24.0,0,0,9.5,S
516,2,female,34.0,0,0,10.5,S
383,1,female,35.0,1,0,52.0,S


In [34]:
#missing values
tf1=ColumnTransformer(
    transformers=[
        ('impute_age',SimpleImputer(),[2]),
        ('imputer_embarked',SimpleImputer(strategy='most_frequent'),[6])
    ],
    remainder='passthrough'
)

In [35]:
#encoding
tf2=ColumnTransformer(
    transformers=[
        ('ohe_sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6])
    ],
    remainder='passthrough'
)


In [36]:
#scaling
tf3=ColumnTransformer(
    transformers=[
        ('scale',MinMaxScaler(),slice(0,10))
    ],
)

In [37]:
#feature selection
tf4=SelectKBest(score_func=chi2,k=8)

In [38]:
#model training
tf5=DecisionTreeClassifier()

# Create Pipeline

In [39]:
pipe=Pipeline(
    [
        ('tf1',tf1),
        ('tf2',tf2),
        ('tf3',tf3),
        ('tf4',tf4),
        ('tf5',tf5) 
    ]
)

In [40]:
# may use make_pipeline
#pipe=make_pipeline(tf1,tf2,tf3,tf4,tf5)
# same with make_columntansformer

In [41]:
#train
pipe.fit(X_train,y_train)

In [43]:
pipe.named_steps

{'tf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('imputer_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'tf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'tf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'tf4': SelectKBest(k=8, score_func=<function chi2 at 0x000001F67E251BC0>),
 'tf5': DecisionTreeClassifier()}

In [44]:
y_pred=pipe.predict(X_test)

In [45]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6033898305084746

# exporting pipeline

In [46]:
import pickle
pickle.dump(pipe,open('model.pkl','wb'))