In [1]:
import pandas as pd
# Load the preprocessed Titanic dataset from the working directory
# and preview its first rows.
data = pd.read_csv('titanic_processed.csv')
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,29.0,0,0,7.775,0,0,1
1,0,3,1,45.5,0,0,7.225,1,0,0
2,1,1,0,24.0,0,0,69.3,1,0,0
3,1,1,1,0.92,1,2,151.55,0,0,1
4,1,1,1,36.0,0,0,26.2875,0,0,1


In [2]:
# (number of rows, number of columns) of the loaded dataset
data.shape

(712, 10)

### Splitting our data into train and test sets

In [3]:
from sklearn.model_selection import train_test_split
# Features: every column except the target. 'Survived' is dropped because it is the value we predict.
x = data.drop('Survived', axis=1)
# Target: the actual survival labels the model is trained on and scored against.
y = data['Survived']

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state pins the shuffle, so the split (and every metric computed from it)
# is identical on each Restart & Run All instead of changing from run to run.
# NOTE: outputs recorded from an earlier, unseeded run will differ; re-run the notebook to refresh them.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [4]:
# shapes of the training features and training labels (row counts should match)
x_train.shape,y_train.shape

((569, 9), (569,))

In [5]:
# shapes of the test features and test labels (row counts should match)
x_test.shape,y_test.shape

((143, 9), (143,))

### Train the LogisticRegression model

In [6]:
from sklearn.linear_model import LogisticRegression
# penalty='l2': L2 regularization, which penalizes the sum of the squared model coefficients
#               (an L1 penalty is the alternative).
# C=1.0: inverse of the regularization strength; smaller values mean stronger regularization.
# solver='liblinear': a solver that works well on small datasets.
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')
# fit() returns the fitted estimator itself; assigning it keeps the cell from echoing the model repr.
logistic_model = logistic_model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# predict a survival label for every row of the held-out test features
y_pred=logistic_model.predict(x_test)

### Compare the actual and predicted values for our ML model

In [8]:
# Put the actual labels and the model's predictions side by side, one row per test passenger.
pred_results = pd.DataFrame({
    'y_test': y_test,
    'y_pred': y_pred,
})

In [9]:
# preview the first rows of actual vs. predicted labels
pred_results.head()

Unnamed: 0,y_test,y_pred
366,0,0
531,1,0
608,0,0
74,0,0
122,1,1


### Set up a confusion matrix using the pandas crosstab function

In [10]:
# we need to measure the performance of our logistic regression model
data_crosstab=pd.crosstab(pred_results.y_pred,pred_results.y_test)
data_crosstab
# Rows are the predicted labels (y_pred); columns are the actual labels (y_test).
# The diagonal cells count the correct predictions: true negatives at (0, 0)
# and true positives at (1, 1). The off-diagonal cells count the errors.
# A diagonal that clearly dominates the off-diagonal cells indicates a reasonably accurate model.

y_test,0,1
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,72,18
1,12,41


In [11]:
# use the metric functions from sklearn to measure the model's performance
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [12]:
# Score the test-set predictions with sklearn's metric functions.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

metric_report = (
    # accuracy: share of all predictions the model got right
    ('accuracy_score :', acc),
    # precision: of the passengers the model predicted as survived, how many actually survived
    ('precision_score :', prec),
    # recall: of the passengers who actually survived, how many the model predicted correctly
    ('recall_score :', recall),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

accuracy_score : 0.7902097902097902
precision_score : 0.7735849056603774
recall_score : 0.6949152542372882


In [13]:
# This step is just to verify the sklearn scores by hand.
# data_crosstab has the predicted labels (y_pred) as rows and the actual labels
# (y_test) as columns, so each cell is read explicitly as .loc[predicted, actual].
# The counts depend on the train/test split, so no specific values are quoted here.
TP = data_crosstab.loc[1, 1]  # True Positives: predicted survived, and the passenger actually survived
TN = data_crosstab.loc[0, 0]  # True Negatives: predicted not survived, and the passenger actually did not survive
FP = data_crosstab.loc[1, 0]  # False Positives: predicted survived, but the passenger actually did not
FN = data_crosstab.loc[0, 1]  # False Negatives: predicted not survived, but the passenger actually did

In [14]:
# Recompute accuracy by hand from the confusion-matrix counts:
# correct predictions (TP + TN) divided by all predictions.
total_predictions = TP + FP + TN + FN
accuracy_score_verified = (TP + TN) / total_predictions
accuracy_score_verified
# This value should be exactly the same as the accuracy_score printed by sklearn.

0.7902097902097902

In [15]:
# Precision by hand: of everyone the model predicted as survived (TP + FP),
# the fraction that actually survived.
predicted_survived = TP + FP
precision_score_survived = TP / predicted_survived
precision_score_survived

0.7735849056603774

In [16]:
# Recall by hand: of everyone who actually survived (TP + FN),
# the fraction the model predicted as survived.
actually_survived = TP + FN
recall_score_survived = TP / actually_survived
recall_score_survived

0.6949152542372882