In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the Titanic passenger table from the Excel workbook into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks Colab-specific — adjust it when
# running the notebook in another environment.
titanic_df = pd.read_excel(io='/content/titanic3.xls')

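PassengerId: A unique identifier assigned to each passenger. (Note: the `titanic3.xls` file loaded above has no such column — rows are identified by the DataFrame index — and its column names are lowercase, e.g. `pclass`, `survived`, `sibsp`.)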

Survived: Indicates whether the passenger survived or not.

0 = Did not survive

1 = Survived

Pclass (Ticket Class): The class of the ticket the passenger purchased.

1 = First Class

2 = Second Class

3 = Third Class

Name: The name of the passenger.

Sex: The gender of the passenger (male or female).

Age: The age of the passenger. Note that this column may contain missing values.

SibSp (Siblings/Spouses Aboard): The number of siblings or spouses the passenger had aboard the Titanic.

Parch (Parents/Children Aboard): The number of parents or children the passenger had aboard the Titanic.

Ticket: The ticket number.

Fare: The amount of money paid for the ticket.

Cabin: The cabin number where the passenger stayed.

Embarked: The port where the passenger boarded the Titanic.

C = Cherbourg

Q = Queenstown

S = Southampton

In [5]:
# List the column labels of the loaded frame to see which fields are actually available.
titanic_df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [6]:
# Count how often each distinct `cabin` value occurs.
titanic_df['cabin'].value_counts()

C23 C25 C27        6
G6                 5
B57 B59 B63 B66    5
F4                 4
F33                4
                  ..
C132               1
E60                1
B52 B54 B56        1
C49                1
F38                1
Name: cabin, Length: 186, dtype: int64

In [7]:
# Count how many passengers are recorded for each `embarked` code.
titanic_df['embarked'].value_counts()

S    914
C    270
Q    123
Name: embarked, dtype: int64

In [8]:
# Count how often each distinct `boat` value occurs.
titanic_df['boat'].value_counts()

13         39
C          38
15         37
14         33
4          31
10         29
5          27
3          26
9          25
11         25
7          23
16         23
8          23
6          20
D          20
12         19
2          12
A          11
B           9
1           5
C D         2
13 15       2
5 7         2
8 10        1
13 15 B     1
5 9         1
15 16       1
2           1
Name: boat, dtype: int64

In [9]:
# Count how often each distinct `body` value occurs.
titanic_df['body'].value_counts()

135.0    1
101.0    1
37.0     1
285.0    1
156.0    1
        ..
97.0     1
174.0    1
169.0    1
245.0    1
304.0    1
Name: body, Length: 121, dtype: int64

In [10]:
# Keep only the target (`survived`) and the columns used as model features.
# `.copy()` makes the subset an independent frame, so assigning to one of its columns
# later in the notebook modifies it directly and does not raise SettingWithCopyWarning.
titanic_df = titanic_df[['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch']].copy()

In [11]:
# Preview the first rows of the reduced frame.
titanic_df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch
0,1,1,female,29.0,0,0
1,1,1,male,0.9167,1,2
2,1,0,female,2.0,1,2
3,1,0,male,30.0,1,2
4,1,0,female,25.0,1,2


In [12]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
titanic_df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch
count,1309.0,1309.0,1046.0,1309.0,1309.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027
std,0.837836,0.486055,14.4135,1.041658,0.86556
min,1.0,0.0,0.1667,0.0,0.0
25%,2.0,0.0,21.0,0.0,0.0
50%,3.0,0.0,28.0,0.0,0.0
75%,3.0,1.0,39.0,1.0,0.0
max,3.0,1.0,80.0,8.0,9.0


In [13]:
# Summary statistics for every column, including the non-numeric `sex` column
# (which reports unique/top/freq instead of mean/std).
titanic_df.describe(include='all')

Unnamed: 0,pclass,survived,sex,age,sibsp,parch
count,1309.0,1309.0,1309,1046.0,1309.0,1309.0
unique,,,2,,,
top,,,male,,,
freq,,,843,,,
mean,2.294882,0.381971,,29.881135,0.498854,0.385027
std,0.837836,0.486055,,14.4135,1.041658,0.86556
min,1.0,0.0,,0.1667,0.0,0.0
25%,2.0,0.0,,21.0,0.0,0.0
50%,3.0,0.0,,28.0,0.0,0.0
75%,3.0,1.0,,39.0,1.0,0.0


In [14]:
# Print dtypes and non-null counts per column to locate missing values before imputation.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   sex       1309 non-null   object 
 3   age       1046 non-null   float64
 4   sibsp     1309 non-null   int64  
 5   parch     1309 non-null   int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 61.5+ KB


In [None]:
pip install fancyimpute

In [16]:
from fancyimpute import IterativeImputer

In [17]:
# Configure the iterative imputer: up to 1000 iterations, with a fixed random_state
# so repeated runs use the same seed.
imputer = IterativeImputer(
    max_iter=1000,
    random_state=42,
)

In [19]:
# Encode `sex` numerically (female -> 0, male -> 1) so it can be fed to the imputer.
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: without them a
# second execution would find no matching keys and turn every value into NaN.
sex_codes = {'female': 0, 'male': 1, 0: 0, 1: 1}
titanic_df['sex'] = titanic_df['sex'].map(sex_codes)

In [20]:
# Fill the missing values by fitting the iterative imputer on the whole frame.
# The transformed result is wrapped back into a DataFrame; both the column labels and
# the original row index are re-attached so rows stay aligned with their source records.
# NOTE(review): the frame still contains `survived` and every row is used for fitting,
# so the imputed values draw on the target and on rows that later land in the test
# split — consider imputing the feature columns only, after the train/test split.
titanic_df = pd.DataFrame(
    imputer.fit_transform(titanic_df),
    columns=titanic_df.columns,
    index=titanic_df.index,
)

In [21]:
# Display the frame after imputation (pandas truncates the middle rows in the output).
titanic_df

Unnamed: 0,pclass,survived,sex,age,sibsp,parch
0,1.0,1.0,0.0,29.00000,0.0,0.0
1,1.0,1.0,1.0,0.91670,1.0,2.0
2,1.0,0.0,0.0,2.00000,1.0,2.0
3,1.0,0.0,1.0,30.00000,1.0,2.0
4,1.0,0.0,0.0,25.00000,1.0,2.0
...,...,...,...,...,...,...
1304,3.0,0.0,0.0,14.50000,1.0,0.0
1305,3.0,0.0,0.0,24.83858,1.0,0.0
1306,3.0,0.0,1.0,26.50000,0.0,0.0
1307,3.0,0.0,1.0,27.00000,0.0,0.0


In [22]:
# Re-check dtypes and non-null counts to confirm that no missing values remain.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   float64
 1   survived  1309 non-null   float64
 2   sex       1309 non-null   float64
 3   age       1309 non-null   float64
 4   sibsp     1309 non-null   float64
 5   parch     1309 non-null   float64
dtypes: float64(6)
memory usage: 61.5 KB


In [23]:
# Separate the predictor columns (x) from the target column (y).
x = titanic_df.loc[:, ['pclass', 'sex', 'age', 'sibsp', 'parch']]
y = titanic_df.loc[:, 'survived']

In [24]:
# Display the predictor frame.
x

Unnamed: 0,pclass,sex,age,sibsp,parch
0,1.0,0.0,29.00000,0.0,0.0
1,1.0,1.0,0.91670,1.0,2.0
2,1.0,0.0,2.00000,1.0,2.0
3,1.0,1.0,30.00000,1.0,2.0
4,1.0,0.0,25.00000,1.0,2.0
...,...,...,...,...,...
1304,3.0,0.0,14.50000,1.0,0.0
1305,3.0,0.0,24.83858,1.0,0.0
1306,3.0,1.0,26.50000,0.0,0.0
1307,3.0,1.0,27.00000,0.0,0.0


In [25]:
# Display the target series.
y

0       1.0
1       1.0
2       0.0
3       0.0
4       0.0
       ... 
1304    0.0
1305    0.0
1306    0.0
1307    0.0
1308    0.0
Name: survived, Length: 1309, dtype: float64

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
# Create the scaler used to standardize the predictor columns.
scaler = StandardScaler()

In [28]:
# Learn the scaling parameters from the predictors, then apply them to the same data.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so the
# test rows influence the scaling statistics — consider fitting on the training part only.
x_scaled = scaler.fit(x).transform(x)

In [29]:
# Display the standardized predictor array.
x_scaled

array([[-1.54609786, -1.34499549, -0.02894925, -0.47908676, -0.4449995 ],
       [-1.54609786,  0.74349692, -2.12451835,  0.48128777,  1.86652569],
       [-1.54609786, -1.34499549, -2.04368276,  0.48128777,  1.86652569],
       ...,
       [ 0.84191642,  0.74349692, -0.21549865, -0.47908676, -0.4449995 ],
       [ 0.84191642,  0.74349692, -0.17818877, -0.47908676, -0.4449995 ],
       [ 0.84191642,  0.74349692, -0.02894925, -0.47908676, -0.4449995 ]])

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
# Create the logistic-regression model with its default settings.
# NOTE(review): despite the variable name, this model is used to predict class labels
# (survived / not survived), not a continuous value.
regressor = LogisticRegression()

In [35]:
# Fit the model on the training portion of the data.
regressor.fit(x_train, y_train)

In [36]:
# Predict the survival label for each held-out test row.
y_pred = regressor.predict(x_test)

In [37]:
# Display the predicted labels for the test set.
y_pred

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 1.,
       0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 1., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.,
       0., 0., 1., 1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 1.,
       1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 1.,
       0., 1., 0., 0., 0.

In [38]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [39]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7900763358778626

In [40]:
# Confusion matrix for the test set: rows are the true classes, columns the predicted ones.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[129  15]
 [ 40  78]]


In [41]:
# Per-class precision, recall, F1 and support for the test-set predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.76      0.90      0.82       144
         1.0       0.84      0.66      0.74       118

    accuracy                           0.79       262
   macro avg       0.80      0.78      0.78       262
weighted avg       0.80      0.79      0.79       262

