# Creating a logistic regression to predict Survival

## Import the relevant libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [2]:
data_preprocessed = pd.read_csv('Titanic_preprocessed.csv')
data_preprocessed.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cherbourg,Queenstown,Southampton,A,B,C,D,E,F,G,T,U,Survived
0,3,1,22,1,0,7.25,0,0,1,0,0,0,0,0,0,0,0,1,0
1,1,0,38,1,0,71.2833,1,0,0,0,0,1,0,0,0,0,0,0,1
2,3,0,26,0,0,7.925,0,0,1,0,0,0,0,0,0,0,0,1,1
3,1,0,35,1,0,53.1,0,0,1,0,0,1,0,0,0,0,0,0,1
4,3,1,35,0,0,8.05,0,0,1,0,0,0,0,0,0,0,0,1,0
5,3,1,29,0,0,8.4583,0,1,0,0,0,0,0,0,0,0,0,1,0
6,1,1,54,0,0,51.8625,0,0,1,0,0,0,0,1,0,0,0,0,0
7,3,1,2,3,1,21.075,0,0,1,0,0,0,0,0,0,0,0,1,0
8,3,0,27,0,2,11.1333,0,0,1,0,0,0,0,0,0,0,0,1,1
9,2,0,14,1,0,30.0708,1,0,0,0,0,0,0,0,0,0,0,1,1


## cheking the balance of the targets(the dataset has to be balanced to avoid biasing the model)

In [3]:
number_of_ones = data_preprocessed['Survived'].sum()
total_number_of_targets = data_preprocessed['Survived'].shape[0]

print('the number of 1s is :', number_of_ones)
print('the number of 0s is :' ,total_number_of_targets - number_of_ones)

the number of 1s is : 342
the number of 0s is : 549


In [4]:
num_zeros_to_keep = 342
current_zeros_count = total_number_of_targets - number_of_ones
zeros_to_remove = current_zeros_count - num_zeros_to_keep

zeros_removed = 0

for index, row in data_preprocessed.iterrows():
    if zeros_removed < zeros_to_remove and row['Survived'] == 0:
        data_preprocessed = data_preprocessed.drop(index)
        zeros_removed += 1

    if zeros_removed == zeros_to_remove:
        break

In [5]:
number_of_ones = data_preprocessed['Survived'].sum()
total_number_of_targets = data_preprocessed['Survived'].shape[0]

print('the number of 1s is :', number_of_ones)
print('the number of 0s is :' ,total_number_of_targets - number_of_ones)

the number of 1s is : 342
the number of 0s is : 342


the targets now are distributed 50% for 1's and 50% fro 0's, the dataset is sucessfully balanced

# Select the inputs and the targets for the regression

In [6]:
data_preprocessed.shape

(684, 19)

In [16]:
data_preprocessed

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cherbourg,Queenstown,Southampton,A,B,C,D,E,F,G,T,U,Survived
1,1,0,38,1,0,71.2833,1,0,0,0,0,1,0,0,0,0,0,0,1
2,3,0,26,0,0,7.9250,0,0,1,0,0,0,0,0,0,0,0,1,1
3,1,0,35,1,0,53.1000,0,0,1,0,0,1,0,0,0,0,0,0,1
8,3,0,27,0,2,11.1333,0,0,1,0,0,0,0,0,0,0,0,1,1
9,2,0,14,1,0,30.0708,1,0,0,0,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,1,27,0,0,13.0000,0,0,1,0,0,0,0,0,0,0,0,1,0
887,1,0,19,0,0,30.0000,0,0,1,0,1,0,0,0,0,0,0,0,1
888,3,0,29,1,2,23.4500,0,0,1,0,0,0,0,0,0,0,0,1,0
889,1,1,26,0,0,30.0000,1,0,0,0,0,1,0,0,0,0,0,0,1


In [7]:
targets = data_preprocessed['Survived'] 
targets

1      1
2      1
3      1
8      1
9      1
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 684, dtype: int64

In [8]:
unscaled_inputs = data_preprocessed.iloc[:,:-1]
unscaled_inputs

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cherbourg,Queenstown,Southampton,A,B,C,D,E,F,G,T,U
1,1,0,38,1,0,71.2833,1,0,0,0,0,1,0,0,0,0,0,0
2,3,0,26,0,0,7.9250,0,0,1,0,0,0,0,0,0,0,0,1
3,1,0,35,1,0,53.1000,0,0,1,0,0,1,0,0,0,0,0,0
8,3,0,27,0,2,11.1333,0,0,1,0,0,0,0,0,0,0,0,1
9,2,0,14,1,0,30.0708,1,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,1,27,0,0,13.0000,0,0,1,0,0,0,0,0,0,0,0,1
887,1,0,19,0,0,30.0000,0,0,1,0,1,0,0,0,0,0,0,0
888,3,0,29,1,2,23.4500,0,0,1,0,0,0,0,0,0,0,0,1
889,1,1,26,0,0,30.0000,1,0,0,0,0,1,0,0,0,0,0,0


## Standardize the data

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
exclude_columns = ['Sex', 'Cherbourg', 'Queenstown', 'Southampton', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'U']

columns_to_standardize = [col for col in unscaled_inputs.columns if col not in exclude_columns]

In [11]:
scaler = StandardScaler()
unscaled_inputs[columns_to_standardize] = scaler.fit_transform(unscaled_inputs[columns_to_standardize])

In [12]:
scaled_inputs = unscaled_inputs.copy()
scaled_inputs

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cherbourg,Queenstown,Southampton,A,B,C,D,E,F,G,T,U
1,-1.477413,0,0.669585,0.559727,-0.476631,0.684166,1,0,0,0,0,1,0,0,0,0,0,0
2,0.875441,0,-0.261617,-0.492068,-0.476631,-0.498929,0,0,1,0,0,0,0,0,0,0,0,1
3,-1.477413,0,0.436784,0.559727,-0.476631,0.344628,0,0,1,0,0,1,0,0,0,0,0,0
8,0.875441,0,-0.184017,-0.492068,1.965435,-0.439020,0,0,1,0,0,0,0,0,0,0,0,1
9,-0.300986,0,-1.192818,0.559727,-0.476631,-0.085398,1,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,-0.300986,1,-0.184017,-0.492068,-0.476631,-0.404163,0,0,1,0,0,0,0,0,0,0,0,1
887,-1.477413,0,-0.804818,-0.492068,-0.476631,-0.086720,0,0,1,0,1,0,0,0,0,0,0,0
888,0.875441,0,-0.028816,0.559727,1.965435,-0.209029,0,0,1,0,0,0,0,0,0,0,0,1
889,-1.477413,1,-0.261617,-0.492068,-0.476631,-0.086720,1,0,0,0,0,1,0,0,0,0,0,0


In [13]:
scaled_inputs.shape

(684, 18)

## Split the data into train & test and shuffle

### Import the relevant module

In [14]:
from sklearn.model_selection import train_test_split

### Split

In [17]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, #train_size = 0.8, 
                                                                            test_size = 0.2, random_state = 20)

In [18]:
print (x_train.shape, y_train.shape)

(547, 18) (547,)


In [19]:
print (x_test.shape, y_test.shape)

(137, 18) (137,)


# Logistic regression with sklearn

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

In [28]:
model = LogisticRegression()

model.fit(x_train, y_train)

y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7372262773722628


the accuracy of the model is 74%

## Save the model

In [31]:
import pickle

# Save the trained model
model_file_path = 'C:/Users/User/Titanic Project/logistic_regression_model.pkl'

with open(model_file_path, 'wb') as file:
    pickle.dump(model, file)

print(f'Model saved to {model_file_path}')

Model saved to C:/Users/User/Titanic Project/logistic_regression_model.pkl
