In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the passenger records from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
titanic_csv = 'E:/May 30 DecisionTree/titanic_data.csv'
ship = pd.read_csv(titanic_csv)

In [3]:
# Preview the first five rows of the loaded frame.
ship.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# (rows, columns) of the loaded frame.
ship.shape

(891, 12)

In [5]:
# Count the missing values in each column.
ship.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Split the loaded frame into the 'Survived' column and everything else.
# NOTE(review): neither name is referenced again in the cells shown here.
features_raw = ship.drop(columns='Survived')
outcomes = ship['Survived']

In [7]:
# Remove the 'Name' column; rebinding (rather than inplace=True) keeps the
# statement a plain expression of input -> output.
ship = ship.drop(columns='Name')

In [8]:
# Frequency of each distinct 'Age' value.
ship['Age'].value_counts()

24.00    30
22.00    27
18.00    26
19.00    25
30.00    25
         ..
55.50     1
70.50     1
66.00     1
23.50     1
0.42      1
Name: Age, Length: 88, dtype: int64

In [9]:
# Replace missing ages with the mean of the 'Age' column.
mean_age = ship['Age'].mean()
ship['Age'] = ship['Age'].fillna(mean_age)

In [10]:
# Confirm that no missing ages remain after the fill.
ship['Age'].isna().sum()

0

In [11]:
# Frequency of each distinct 'Cabin' value, inspected before the column is dropped.
ship['Cabin'].value_counts()

C23 C25 C27    4
G6             4
B96 B98        4
C22 C26        3
F33            3
              ..
C101           1
A23            1
B101           1
D56            1
B3             1
Name: Cabin, Length: 147, dtype: int64

In [12]:
# Remove the 'Cabin' column.
ship = ship.drop(columns='Cabin')

In [13]:
# Frequency of each 'Embarked' value; the most frequent one is used to fill the gaps next.
ship['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [14]:
# Fill the missing 'Embarked' entries with 'S'.
ship['Embarked'] = ship['Embarked'].fillna(value='S')

In [15]:
# Remove the 'Ticket' column.
ship = ship.drop(columns='Ticket')

In [16]:
# Remove the 'PassengerId' column.
ship = ship.drop(columns='PassengerId')

In [17]:
# Remove the 'SibSp' column.
ship = ship.drop(columns='SibSp')

In [18]:
# Preview the first two rows after the column clean-up.
ship.head(n=2)

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Embarked
0,0,3,male,22.0,0,7.25,S
1,1,1,female,38.0,0,71.2833,C


In [19]:
# Remove the 'Sex' column.
# NOTE(review): 'Sex' is discarded rather than encoded numerically the way
# 'Embarked' is later on; confirm that leaving it out of the features is intended.
ship = ship.drop(columns='Sex')

In [20]:
# Feature matrix: every remaining column except the 'Survived' target.
# Selecting by name (instead of by position) does not depend on column order,
# and drop() returns an independent frame, so assigning into X later does not
# raise SettingWithCopyWarning.
X = ship.drop(columns='Survived')

In [21]:
# Display the feature frame for a visual check.
X

Unnamed: 0,Pclass,Age,Parch,Fare,Embarked
0,3,22.000000,0,7.2500,S
1,1,38.000000,0,71.2833,C
2,3,26.000000,0,7.9250,S
3,1,35.000000,0,53.1000,S
4,3,35.000000,0,8.0500,S
...,...,...,...,...,...
886,2,27.000000,0,13.0000,S
887,1,19.000000,0,30.0000,S
888,3,29.699118,2,23.4500,S
889,1,26.000000,0,30.0000,C


In [22]:
# Integer code for each letter accepted by convert_to_int.
_EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}


def convert_to_int(word):
    """Translate an 'Embarked' letter into its integer code.

    Args:
        word: One of 'S', 'C' or 'Q'.

    Returns:
        0 for 'S', 1 for 'C', 2 for 'Q'.

    Raises:
        KeyError: If ``word`` is not one of the three known letters.
    """
    return _EMBARKED_CODES[word]

In [23]:
# Encode the 'Embarked' letters as integers.
# assign() builds a new frame instead of writing into a slice of `ship`, which
# avoids SettingWithCopyWarning, and passing the function directly avoids a
# lambda whose parameter shadowed the DataFrame name `X`.
X = X.assign(Embarked=X['Embarked'].map(convert_to_int))

In [24]:
# Target vector: the 'Survived' column, selected by name rather than by position.
Y = ship['Survived']

In [25]:
# Display the target series for a visual check.
Y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [26]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# (train_test_split is imported with the other dependencies in the first cell.)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [27]:
# Standardise the features, then fit a logistic regression.
pipeline_lr = Pipeline(steps=[
    ('Scaler1', StandardScaler()),
    ('lr_classifier', LogisticRegression(random_state=0)),
])

In [28]:
# Standardise the features, then fit a support vector classifier.
pipeline_sv = Pipeline(steps=[
    ('Scaler2', StandardScaler()),
    ('sv_classifier', SVC()),
])

In [29]:
# Standardise the features, then fit a decision tree.
# The tree is seeded (matching the seeded LogisticRegression and train/test
# split) so that repeated runs produce the same tree and the same accuracy.
pipeline_dt = Pipeline([('Scaler3', StandardScaler()),
                       ('dt_classifier', DecisionTreeClassifier(random_state=0))])

In [30]:
# Candidate pipelines, in the same order as the keys of their display names.
pipelines = [
    pipeline_lr,
    pipeline_sv,
    pipeline_dt,
]

In [31]:
# "Best so far" trackers, updated by the model-comparison loop.
best_accuracy, best_classifier, best_pipeline = 0.0, 0, " "

In [32]:
# Display name for each pipeline, keyed by its position in `pipelines`.
pipeline_dictionary = dict(enumerate([
    'LOGISTIC REGRESSION',
    'SUPPORT VECTOR MACHINE',
    'DECISION TREE CLASSIFIER',
]))

In [33]:
# Train every candidate pipeline on the training split.
for pipe in pipelines:
    pipe.fit(X=X_train, y=Y_train)



In [35]:
# Report each fitted pipeline's accuracy on the held-out split.
for i, model in enumerate(pipelines):
    label = pipeline_dictionary[i]
    accuracy = model.score(X_test, Y_test)
    print(f"{label} Test Accuracy is {accuracy}")

LOGISTIC REGRESSION Test Accuracy is 0.7430167597765364
SUPPORT VECTOR MACHINE Test Accuracy is 0.7150837988826816
DECISION TREE CLASSIFIER Test Accuracy is 0.6703910614525139


In [36]:
# Keep the pipeline with the highest held-out accuracy. Each pipeline is scored
# exactly once per iteration instead of once for the comparison and again for
# the assignment.
# NOTE(review): the winner is picked on the same test split that is used to
# report accuracy; consider a separate validation split or cross-validation.
for i, model in enumerate(pipelines):
    accuracy = model.score(X_test, Y_test)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pipeline = model
        best_classifier = i

print(f"Classifier with best accuracy is {pipeline_dictionary[best_classifier]}")

Classifier with best accuracy is LOGISTIC REGRESSION
