In [1]:
import csv
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import math
from math import exp

##### Binomial logistic regression first takes the odds of the event happening for different levels of each independent variable, then takes the ratio of those odds (which is continuous but cannot be negative) and then takes the logarithm of that ratio (this is referred to as logit or log-odds) to create a continuous criterion as a transformed version of the dependent variable.

In [3]:
titanic_df = pd.read_csv('../Data/titanic.csv')

In [4]:
titanic_df.set_index('PassengerId', inplace=True)
titanic_df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
target = titanic_df['Survived']
type(target)

pandas.core.series.Series

In [6]:
predictor = titanic_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']]

In [7]:
predictor.isnull().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Embarked      2
dtype: int64

##### Filling the null values in 'Age' with its mean value.

In [9]:
predictor['Age'].fillna(np.mean(predictor['Age']), inplace= True)

In [10]:
predictor = pd.get_dummies(predictor)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(predictor, target, test_size = 0.3)

In [12]:
x_train.shape, x_test.shape

((623, 9), (268, 9))

##### predicting the majority class which is 0

In [13]:
(titanic_df['Survived'] ==0).sum()/ len(titanic_df)

0.61616161616161613

In [16]:
class NotFittedError(Exception):
    pass

class LogisticClassifier1():
    '''This class implements LogisticRegression by finding the coefficients through mini-batch stochastic gradient 
    descent. Descent is calculated by differentiating the loss function wrt coefficients.
    '''
    def __init__(self, l_rate = 0.3, threshold = 0.5, fit_called=False):
        self.fit_called = fit_called
        self.coef = 0
        self.bias = 0
        self.threshold = threshold
        self.l_rate = l_rate
    
    def fit(self, xtrain, ytrain, n_epoch=50, l_rate =0.1):
        self.fit_called = True
        #standardize data
        xtrain = (xtrain - np.mean(xtrain, axis=0))/np.std(xtrain, axis=0)
        self.coef = np.array([0.0 for i in range(xtrain.shape[1])])
        error = self.cost_func(xtrain, ytrain)
        #print(error)
        epoch_error = []
        i = 0
        #mini-batch sgd
        for epoch in range(n_epoch):
            i += 1
            old_error = error
            x, _, y, _ = train_test_split(xtrain, ytrain, train_size = 0.4)
            yhat = self.predict_probability(x)
            delta = (yhat - y)
            self.bias -= l_rate*(delta.T.dot(np.ones(x.shape[0])))/x.shape[0]
            self.coef -= l_rate*(delta.T.dot(x))/x.shape[0]
            #print(self.bias, self.coef)
            error = self.cost_func(x, y)
            epoch_error.append(error)
            #print(abs(old_error - error))
            #print(epoch_error)
            if abs(old_error - error) <0.001:
                return (self.bias, self.coef)
        return self.bias, self.coef
    
    def predict_probability(self, row):
        if not self.fit_called:
            raise NotFittedError()
        #center the data 
        row = (row - np.mean(row, axis=0)) / np.std(row, axis=0) 
        return 1.0 / (1.0 + np.exp(-row.dot(self.coef) - self.bias))
    

    def cost_func(self, xtrain, ytrain):
        log_func_v = self.predict_probability(xtrain)
        y = ytrain
        step1 = y * np.log(log_func_v)
        step2 = (1-y) * np.log(1 - log_func_v)
        final = -step1 - step2
        return np.mean(final)
        
    def predict(self, X):
        if not self.fit_called:
            raise NotFittedError()
        
        X = (X - np.mean(X, axis=0)) / np.std(X, axis=0) 
        predictions = self.predict_probability(X)
        #predictions = np.array(1.0 / (1.0 + np.exp(-X.dot(self.coef) - self.bias)))
        #print(predictions)
        pred_value = np.where(predictions >= .5, 1, 0)
        return pred_value
        

    def score(self, x_test, y_test):
        yact_ypred = list(zip(y_test, self.predict(x_test)))
        TP = yact_ypred.count((1,1))/len(x_test)
        TN = yact_ypred.count((0,0))/len(x_test)
        FP = yact_ypred.count((0,1))/len(x_test)
        FN = yact_ypred.count((1,0))/len(x_test)
        return float(TP + TN)/float(TP + TN + FP + FN)

In [17]:
lr = LogisticClassifier1()
lr.fit(x_train, y_train)



(-0.091247490585579391,
 array([-0.12339692, -0.02446699, -0.01988788,  0.03041888,  0.19248428,
        -0.19248428,  0.05803686, -0.01244515, -0.04561483]))

In [18]:
lr.predict(x_train)

array([0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 0,

In [19]:
lr.predict_probability(x_train)

PassengerId
569    0.430862
869    0.369900
772    0.361956
294    0.570400
440    0.403898
284    0.374577
546    0.424298
652    0.617399
698    0.582004
47     0.379331
670    0.633338
71     0.403448
483    0.361092
36     0.429984
44     0.685887
609    0.678178
535    0.567648
116    0.373701
90     0.372388
618    0.565084
61     0.434398
35     0.499647
593    0.362388
677    0.372169
673    0.386462
480    0.589675
582    0.694546
347    0.598540
367    0.677912
457    0.423841
         ...   
308    0.695220
833    0.430862
85     0.608834
290    0.585503
526    0.378793
430    0.368897
761    0.369900
684    0.373570
9      0.587547
125    0.438202
398    0.397160
43     0.430862
282    0.370641
795    0.371951
444    0.603922
407    0.360661
837    0.373701
58     0.431413
726    0.374139
805    0.371077
816    0.440044
604    0.363686
648    0.491033
802    0.607372
172    0.386962
665    0.369951
689    0.375015
303    0.374577
844    0.428661
789    0.396297
dtype: float

In [20]:
lr.score(x_train, y_train)

0.781701444622793

In [21]:
lr.score(x_test, y_test)

0.7835820895522387

## Sklearn LogisticRegression comparison

In [22]:
from sklearn.linear_model import LogisticRegression
sklr = LogisticRegression()
sklr.fit(x_train, y_train)
sklr.score(x_train, y_train)

0.797752808988764

In [24]:
sklr.score(x_test, y_test)

0.79850746268656714

In [25]:
sklr.coef_

array([[-0.98325928, -0.02904457, -0.29412128, -0.04231536,  2.24239116,
        -0.42593118,  0.92206888,  0.29488607,  0.25913677]])