In [75]:
# Importing the dependence to read

import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [40]:
data = pd.read_csv('titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [71]:
data.shape

(418, 4)

In [41]:
# Removing unwanted Columns which won't yield meaningful analysis

data.drop(['PassengerId','Name','SibSp','Parch','Ticket','Fare','Cabin', 'Embarked'], axis = 'columns', inplace=True)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age
0,0,3,male,34.5
1,1,3,female,47.0
2,0,2,male,62.0
3,0,3,male,27.0
4,1,3,female,22.0


In [42]:
# Checking Null values

data.isnull().sum()

Survived     0
Pclass       0
Sex          0
Age         86
dtype: int64

In [43]:
#Filling missing null values

data['Age'] = data['Age'].fillna(data['Age'].median())
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age
0,0,3,male,34.5
1,1,3,female,47.0
2,0,2,male,62.0
3,0,3,male,27.0
4,1,3,female,22.0


In [44]:
#Making Sure for null values

data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
dtype: int64

In [45]:
#Converting column age datatype from float 64 to int 64

data['Age'] = data['Age'].astype("int64")
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age
0,0,3,male,34
1,1,3,female,47
2,0,2,male,62
3,0,3,male,27
4,1,3,female,22


In [46]:
# Implementing Apriori

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [50]:
titanic_encoded_df = pd.get_dummies(data, columns=["Age","Pclass", "Sex"])
titanic_encoded_df

Unnamed: 0,Survived,Age_0,Age_1,Age_2,Age_3,Age_5,Age_6,Age_7,Age_8,Age_9,...,Age_62,Age_63,Age_64,Age_67,Age_76,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
414,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
415,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
416,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [53]:
frequent_itemsets = apriori(titanic_encoded_df, min_support=0.05, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.363636,(Survived)
1,0.23445,(Age_27)
2,0.255981,(Pclass_1)
3,0.222488,(Pclass_2)
4,0.521531,(Pclass_3)
5,0.363636,(Sex_female)
6,0.636364,(Sex_male)
7,0.069378,"(Survived, Age_27)"
8,0.119617,"(Survived, Pclass_1)"
9,0.07177,"(Survived, Pclass_2)"


In [54]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Survived),(Pclass_1),0.363636,0.255981,0.119617,0.328947,1.285047,0.026533,1.108734,0.348571
1,(Pclass_1),(Survived),0.255981,0.363636,0.119617,0.46729,1.285047,0.026533,1.194577,0.298135
2,(Survived),(Sex_female),0.363636,0.363636,0.363636,1.0,2.75,0.231405,inf,1.0
3,(Sex_female),(Survived),0.363636,0.363636,0.363636,1.0,2.75,0.231405,inf,1.0
4,(Age_27),(Pclass_3),0.23445,0.521531,0.188995,0.806122,1.545684,0.066722,2.467892,0.461155
5,(Pclass_3),(Age_27),0.521531,0.23445,0.188995,0.362385,1.545684,0.066722,1.200647,0.737848
6,(Sex_male),(Age_27),0.636364,0.23445,0.165072,0.259398,1.106414,0.015876,1.033687,0.264493
7,(Age_27),(Sex_male),0.23445,0.636364,0.165072,0.704082,1.106414,0.015876,1.22884,0.125634
8,(Pclass_1),(Sex_female),0.255981,0.363636,0.119617,0.46729,1.285047,0.026533,1.194577,0.298135
9,(Sex_female),(Pclass_1),0.363636,0.255981,0.119617,0.328947,1.285047,0.026533,1.108734,0.348571


In [55]:
#Making two datasets

X = data.drop(['Survived'], axis='columns')
Y = data.Survived

In [56]:
# Encoding gender

encode_gender = pd.get_dummies(X.Sex)
encode_gender.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,0,1
3,0,1
4,1,0


In [57]:
# Concatenating it in main Data File

X = pd.concat([X,encode_gender], axis='columns')

In [58]:
X.head()

Unnamed: 0,Pclass,Sex,Age,female,male
0,3,male,34,0,1
1,3,female,47,1,0
2,2,male,62,0,1
3,3,male,27,0,1
4,3,female,22,1,0


In [59]:
#Dropping sex columns

X.drop(['Sex'], axis='columns', inplace=True)
X

Unnamed: 0,Pclass,Age,female,male
0,3,34,0,1
1,3,47,1,0
2,2,62,0,1
3,3,27,0,1
4,3,22,1,0
...,...,...,...,...
413,3,27,0,1
414,1,39,1,0
415,3,38,0,1
416,3,27,0,1


In [60]:
# Splitting the data

from sklearn.model_selection import train_test_split

In [61]:
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.2)

In [62]:
# Using Naive Bayes Classification

from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [65]:
model.fit(X_Train,Y_Train)

In [66]:
model.score(X_Test,Y_Test)

1.0

In [80]:
NB = model.predict(X_Test)
NB

array([1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0], dtype=int64)

In [81]:
print('Accuracy Score : ', accuracy_score(NB, Y_Test))
print('F1 Score       : ', f1_score(NB, Y_Test))

Accuracy Score :  1.0
F1 Score       :  1.0


In [82]:
print("Classification report matrix : \n", classification_report(NB, Y_Test))

Classification report matrix : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        49
           1       1.00      1.00      1.00        35

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [68]:
model.predict(X_Test[:10])

array([1, 0, 1, 0, 1, 0, 0, 0, 0, 1], dtype=int64)

In [72]:
#Decesion Tree

from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion = "gini", random_state=0, max_depth=3)

In [73]:
dt.fit(X_Train,Y_Train)

In [84]:
DT = dt.predict(X_Test)
DT

array([1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0], dtype=int64)

In [85]:
print('Accuracy Score : ', accuracy_score(DT, Y_Test))
print('F1 Score       : ', f1_score(DT, Y_Test))

Accuracy Score :  1.0
F1 Score       :  1.0


In [86]:
print("Classification report matrix : \n", classification_report(DT, Y_Test))

Classification report matrix : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        49
           1       1.00      1.00      1.00        35

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [83]:
# K means Clustering
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [89]:
clf = KNeighborsClassifier(n_neighbors=3)

clf.fit(X_Train,y= Y_Train)
KM = clf.predict(X_Test)
print("Accuracy is: ",accuracy_score(KM,Y_Test))

Accuracy is:  0.9642857142857143


In [90]:
print("Classification report matrix : \n", classification_report(KM, Y_Test))

Classification report matrix : 
               precision    recall  f1-score   support

           0       0.96      0.98      0.97        48
           1       0.97      0.94      0.96        36

    accuracy                           0.96        84
   macro avg       0.97      0.96      0.96        84
weighted avg       0.96      0.96      0.96        84

