In [76]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
import pandas as pd
import numpy as np
import random
import math

eta = 0.01   # learning rate
epoch = 1500 # iteration

In [2]:
def sigmoid(x):
    return 1.0/(1+ np.exp(-x))

def sigmoid_derivative(x):
    return x * (1.0 - x)

# Logistic Regression Model
class LogisticRegression:
    
    def __init__(self, x, w, y):
        self.inputs  = x
        self.weights = w               
        self.target  = y
        self.output  = np.zeros(self.target.shape)

    def forward_proc(self):
       # forward processing of inputs and weights using sigmoid activation function 
        self.output = sigmoid(np.dot(self.weights, self.inputs.T))

    def backprop(self):
        # backward processing of appling the chain rule to find derivative of the mean square error function with respect to weights
        dw = (self.output - self.target) * self.inputs # same formular for both linear and logistic regression

        # update the weights with the derivative of the loss function
        self.weights -= eta * dw

    def predict(self, x):
        # predict the output for a given input x
        return (sigmoid(np.dot(self.weights, x.T)))
        
    def calculate_error(self):
        # calculate error
        error = -self.target * math.log(self.output) - (1-self.target) * math.log(1-self.output)
        return abs(error)


In [77]:
if __name__ == "__main__":   
    # load dataset
    df = pd.read_csv("titanic_data.csv")
    
    # preprocess dataset by changing the string to integer, and filling in the missing values
    df['Sex'] = df['Sex'].map({'female':1,'male':0})
    df['Age'].fillna(value=df['Age'].mean(), inplace=True)

    # initially experiment with 100 samples. For final run, use full dataset
    #df = df.iloc[:100, :]
    
    # select proper features for prediction
    passengers = df[["Sex", "Age", "Pclass","Survived" ]]

    weights = np.random.rand(1, 3)
    print("Initial Weights:", weights)
    
    # split train and test set
    train, test = train_test_split(passengers, test_size=0.2)
    
    # select proper features for prediction
    train_features = train[["Sex", "Age", "Pclass"]].values
    test_features = test[["Sex", "Age", "Pclass"]].values
    train_labels = train[["Survived"]].values
    test_labels = test[["Survived"]].values
    
    # normalize data
    #scaler = StandardScaler()
    scaler = MinMaxScaler()
    train_features = scaler.fit_transform(train_features)
    test_features = scaler.fit_transform(test_features)
    
    
    # Performance measure
    # use the test features
    X = test_features
    y = test_labels
    w = weights # use the weights resulting from training
    y_predic = []
    for j in range(len(X)):
        model = LogisticRegression(X[j], w, y[j])
        if model.predict(X[j]) >= 0.5:
            y_predic.append(1)
        elif model.predict(X[j]) < 0.5:
            y_predic.append(0)
            
    for i in range(epoch):
   
        if i == 0: w = weights

        concat_data=np.concatenate((train_features, train_labels), axis = 1)
        np.random.shuffle(concat_data) # shuffle the training dataset
        # divide shuffled dataset to features X and labels y
        X = concat_data[:, 0:3]
        y = concat_data[:, 3:4]
        # eta *= 0.95  # decreasing learning rate is found to be not good for this case

        for j in range(len(X)): 

            model = LogisticRegression(X[j], w, y[j])
            model.forward_proc()   # forward processing
            model.backprop()       # backward processing
            w = model.weights 

        if (i % 100) == 0:
             print("Loss: ", model.calculate_error())

    #print("Output:", model.output)
    print("Adjusted Weights:", model.weights)

Initial Weights: [[0.50541821 0.35593132 0.72694152]]
Loss:  [0.42687431]
Loss:  [0.89277286]
Loss:  [0.20034853]
Loss:  [0.0926067]
Loss:  [0.2411087]
Loss:  [0.90807465]
Loss:  [0.09138899]
Loss:  [0.1060396]
Loss:  [0.87500321]
Loss:  [0.09314394]
Loss:  [0.09899494]
Loss:  [0.09045552]
Loss:  [0.09694965]
Loss:  [0.46865429]
Loss:  [0.24601882]
Adjusted Weights: [[ 2.63434658 -1.00998857 -1.98151406]]


In [73]:
# verify the output with the adjusted weights
x1 = np.array([[0.4, 1.0, 0.6]])
print ("Output for the input data [0.4, 1.0, 0.6]:", model.predict(x1))
x2 = np.array([[1.0, 0.3, 0.5]])
print ("Output for the input data [1.0, 0.3, 0.5]:", model.predict(x2))

# predicting and testing the output for a given input data
x_prediction = np.array([[0.7, 0.8, 0.4]])
predicted_output = model.predict(x_prediction)
print("Predicted data based on trained weights: ")
print("Input (scaled): ", x_prediction)
print("Output probability is : ", predicted_output)
if predicted_output >= 0.5:
    print("Predicted output is PASS.")
elif predicted_output < 0.5:
    print("Predicted output is Fail.")

Output for the input data [0.4, 1.0, 0.6]: [[0.1619917]]
Output for the input data [1.0, 0.3, 0.5]: [[0.7347245]]
Predicted data based on trained weights: 
Input (scaled):  [[0.7 0.8 0.4]]
Output probability is :  [[0.43869233]]
Predicted output is Fail.


In [74]:
# Confusion Matrix
confusion = confusion_matrix(test_labels, y_predic)
print('Confusion Matrix\n')
print(confusion)

Confusion Matrix

[[  0 109]
 [  0  70]]


In [82]:
# Precision Recall F1-score
results = confusion_matrix(test_labels, y_predic) 
print ('Confusion Matrix :') 
print(results) 
print ('Accuracy Score is', accuracy_score(test_labels, y_predic)) 
print('F1-score: {:.2f}\n'.format(f1_score(test_labels, y_predic)))
print ('Classification Report : ') 
print (classification_report(test_labels, y_predic))


Confusion Matrix :
[[  0 108]
 [  0  71]]
Accuracy Score is 0.39664804469273746
F1-score: 0.57

Classification Report : 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       108
           1       0.40      1.00      0.57        71

    accuracy                           0.40       179
   macro avg       0.20      0.50      0.28       179
weighted avg       0.16      0.40      0.23       179

