In [73]:
import numpy as np
import pandas as pd
import random
from collections import Counter

In [2]:
class LoadPreprocessData:
    def __init__(self, path, header = None):
        self.dataset = pd.read_csv(path, header = header)
        self.X = self.dataset.iloc[:, :4]
        self.y = self.dataset.iloc[:, 4]
        
    def normalize(self, axis = 0):
        return ((self.X - self.X.mean(axis = axis)) / self.X.std(axis = axis))
    
    def train_test_split(self, percent = 80):
        train_rows = random.sample(range(0, self.y.size), percent * self.y.size // 100)
        train_rows.sort()
        test_rows=[rows for rows in self.X.index.values if rows not in train_rows]
        self.train_X = self.X.iloc[train_rows].reset_index(drop = True) 
        self.train_y = self.y.iloc[train_rows].reset_index(drop = True) 
        
        self.test_X = self.X.iloc[test_rows].reset_index(drop = True) 
        self.test_y = self.y.iloc[test_rows].reset_index(drop = True) 

In [3]:
dataClass = LoadPreprocessData("data_banknote_authentication.txt", None)
dataClass.normalize()
dataClass.train_test_split()
dataClass.train_X

Unnamed: 0,0,1,2,3
0,3.62160,8.66610,-2.8073,-0.44699
1,4.54590,8.16740,-2.4586,-1.46210
2,3.86600,-2.63830,1.9242,0.10645
3,0.32924,-4.45520,4.5718,-0.98880
4,4.36840,9.67180,-3.9606,-3.16250
...,...,...,...,...
1092,0.40614,1.34920,-1.4501,-0.55949
1093,-1.38870,-4.87730,6.4774,0.34179
1094,-3.75030,-13.45860,17.5932,-2.77710
1095,-3.56370,-8.38270,12.3930,-1.28230


In [4]:
class LogisticRegression:
    def __init__(self, lr=0.01, reg_factor = 0.01, num_iter=100000, fit_intercept=True, verbose=False):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose
        self.reg_factor = reg_factor
    def initialize_weights(self, X, weights = "0"):
        if weights == "0":
            self.theta = np.zeros(X.shape[1])
        elif weights == "gaussian":
            self.theta = np.random.randn(X.shape[1])
        elif weights == "uniform":
            self.theta = np.random.uniform(size = X.shape[1])
        elif weights == "xavier":
            self.theta = np.random.randn(X.shape[1]) * np.sqrt(1 / X.shape[1])
    def __add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)
    
    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    def __loss(self, h, y, type = "no regularisation"):
        if(type == "no regularisation"):
            return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
        
        elif type == "L1":
            return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean() - 0.5 * self.reg_factor * np.sum(np.abs(self.theta)) / y.size
        
        else:
            return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean() - 0.5 * self.reg_factor * np.sum(self.theta ** 2) / y.size
    
    def fit(self, X, y , type = "L1", weights = "0"):
        if self.fit_intercept:
            X = self.__add_intercept(X)
        
        self.initialize_weights(X, weights)
        
        for i in range(self.num_iter):
            z = np.dot(X, self.theta)
            h = self.__sigmoid(z)
            if type == "no regularisation":
                gradient = np.dot(X.T, (h - y))
                
            elif type == "L1":
                gradient = np.dot(X.T, (h - y)) + 0.5 * self.reg_factor * np.sign(self.theta)
                
            else:
                gradient = np.dot(X.T, (h - y)) + 0.5 * self.reg_factor * self.theta 
                
            self.theta -= self.lr * gradient / y.size
            
            if(self.verbose == True and i % 5000 == 0):
                z = np.dot(X, self.theta)
                h = self.__sigmoid(z)
                print(f'loss: {self.__loss(h, y, type)} Iteration: {i}\t')
    
    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)
    
        return self.__sigmoid(np.dot(X, self.theta))
    
    def predict(self, X, threshold):
        return self.predict_prob(X) >= threshold

In [74]:
model = LogisticRegression(lr=0.1, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "uniform")

loss: 1.5348812181729263 Iteration: 0	
loss: 0.025434507474573684 Iteration: 5000	
loss: 0.02290929366763632 Iteration: 10000	
loss: 0.021857439151567452 Iteration: 15000	
Wall time: 5.11 s


In [75]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9927272727272727

In [76]:
weights = model.theta
weights

array([ 4.73859538, -4.91054263, -2.62109194, -3.27117166, -0.22696422])

In [77]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [78]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [79]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9927272727272727

In [80]:
accuracy.F_score()
accuracy.fScore

0.992

In [88]:
model = LogisticRegression(lr=0.1, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "xavier")

loss: 0.4352976799653367 Iteration: 0	
loss: 0.02542685763807213 Iteration: 5000	
loss: 0.022906757279694656 Iteration: 10000	
loss: 0.02185613518398918 Iteration: 15000	
Wall time: 5.04 s


In [89]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9927272727272727

In [90]:
weights = model.theta
weights

array([ 4.73904195, -4.91107422, -2.62135814, -3.27151962, -0.22701152])

In [91]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [92]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [93]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9927272727272727

In [94]:
accuracy.F_score()
accuracy.fScore

0.992

In [95]:
model = LogisticRegression(lr=0.1, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "0")

loss: 0.510235539857985 Iteration: 0	
loss: 0.02544406161274025 Iteration: 5000	
loss: 0.02291245371002464 Iteration: 10000	
loss: 0.021859062954461057 Iteration: 15000	
Wall time: 5.04 s


In [96]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9927272727272727

In [97]:
weights = model.theta
weights

array([ 4.73803964, -4.90988108, -2.62076067, -3.27073864, -0.22690537])

In [98]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [99]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [100]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9927272727272727

In [101]:
accuracy.F_score()
accuracy.fScore

0.992

In [102]:
model = LogisticRegression(lr=0.1, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "gaussian")

loss: 1.7712402770005047 Iteration: 0	
loss: 0.02534036625794609 Iteration: 5000	
loss: 0.022877847049182942 Iteration: 10000	
loss: 0.0218412326525956 Iteration: 15000	
Wall time: 5.12 s


In [103]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9927272727272727

In [104]:
weights = model.theta
weights

array([ 4.74416421, -4.91717153, -2.62441156, -3.27551085, -0.2275545 ])

In [105]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [106]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [107]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9927272727272727

In [108]:
accuracy.F_score()
accuracy.fScore

0.992

In [109]:
model = LogisticRegression(lr=0.01, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "0")

loss: 0.6628869831975659 Iteration: 0	
loss: 0.049656686307088964 Iteration: 5000	
loss: 0.03798090977608561 Iteration: 10000	
loss: 0.03343014898820411 Iteration: 15000	
Wall time: 5.04 s


In [110]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9927272727272727

In [111]:
weights = model.theta
weights

array([ 2.71518456, -2.60527573, -1.46601294, -1.75917724, -0.13022544])

In [112]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [113]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [114]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9927272727272727

In [115]:
accuracy.F_score()
accuracy.fScore

0.992

In [116]:
model = LogisticRegression(lr=0.01, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "uniform")

loss: 1.7552288476969102 Iteration: 0	
loss: 0.04635276380457458 Iteration: 5000	
loss: 0.03695113212824926 Iteration: 10000	
loss: 0.032929502963661574 Iteration: 15000	
Wall time: 5.03 s


In [117]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9927272727272727

In [118]:
weights = model.theta
weights

array([ 2.75483066, -2.62972526, -1.47661618, -1.77478445, -0.12361187])

In [119]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [120]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [121]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9927272727272727

In [122]:
accuracy.F_score()
accuracy.fScore

0.992

In [123]:
model = LogisticRegression(lr=0.01, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "xavier")

loss: 3.6480945320815197 Iteration: 0	
loss: 0.049036396587628206 Iteration: 5000	
loss: 0.037762760625133646 Iteration: 10000	
loss: 0.03331294473274763 Iteration: 15000	
Wall time: 5.16 s


In [124]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9927272727272727

In [125]:
weights = model.theta
weights

array([ 2.72054448, -2.61344883, -1.47080603, -1.76502575, -0.13127609])

In [126]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [127]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [128]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9927272727272727

In [129]:
accuracy.F_score()
accuracy.fScore

0.992

In [130]:
model = LogisticRegression(lr=0.01, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "gaussian")

loss: 2.5602605679597636 Iteration: 0	
loss: 0.050845274598387974 Iteration: 5000	
loss: 0.038148466089729 Iteration: 10000	
loss: 0.03345815068262117 Iteration: 15000	
Wall time: 5.1 s


In [131]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9927272727272727

In [132]:
weights = model.theta
weights

array([ 2.70349294, -2.61159051, -1.47221469, -1.76494879, -0.1375662 ])

In [133]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [134]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [135]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9927272727272727

In [136]:
accuracy.F_score()
accuracy.fScore

0.992

In [137]:
model = LogisticRegression(lr=0.0001, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "0")

loss: 0.6928298003995645 Iteration: 0	
loss: 0.34669132798085256 Iteration: 5000	
loss: 0.26466226321792974 Iteration: 10000	
loss: 0.22424002162157078 Iteration: 15000	
Wall time: 5.09 s


In [138]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9563636363636364

In [139]:
weights = model.theta
weights

array([ 0.103434  , -0.66397523, -0.33266704, -0.29078877, -0.15172144])

In [140]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [141]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [142]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9563636363636364

In [143]:
accuracy.F_score()
accuracy.fScore

0.9500000000000001

In [151]:
model = LogisticRegression(lr=0.0001, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "uniform")

loss: 1.3879237745945316 Iteration: 0	
loss: 0.515464917384462 Iteration: 5000	
loss: 0.2949066850047931 Iteration: 10000	
loss: 0.22188437330343003 Iteration: 15000	
Wall time: 5.16 s


In [152]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9672727272727273

In [153]:
weights = model.theta
weights

array([ 0.61178288, -0.73275312, -0.26832165, -0.298004  ,  0.13764107])

In [154]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [155]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [156]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9672727272727273

In [157]:
accuracy.F_score()
accuracy.fScore

0.9632653061224489

In [158]:
model = LogisticRegression(lr=0.0001, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "xavier")

loss: 2.966397451973528 Iteration: 0	
loss: 0.4792287621876961 Iteration: 5000	
loss: 0.3565550566408354 Iteration: 10000	
loss: 0.30037147485099425 Iteration: 15000	
Wall time: 5.08 s


In [159]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9163636363636364

In [160]:
weights = model.theta
weights

array([-0.47376856, -0.64098585, -0.27199619, -0.19417951, -0.18902487])

In [161]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [162]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [163]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9163636363636364

In [164]:
accuracy.F_score()
accuracy.fScore

0.8986784140969163

In [165]:
model = LogisticRegression(lr=0.0001, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "gaussian")

loss: 2.971856764442229 Iteration: 0	
loss: 0.21595832206408289 Iteration: 5000	
loss: 0.16481537931465784 Iteration: 10000	
loss: 0.140503855230283 Iteration: 15000	
Wall time: 5.1 s


In [166]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9781818181818182

In [167]:
weights = model.theta
weights

array([ 1.25079146, -0.8988991 , -0.3838311 , -0.46296851,  0.15844079])

In [168]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [169]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [170]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9781818181818182

In [171]:
accuracy.F_score()
accuracy.fScore

0.976