In [84]:
import numpy as np
import pandas as pd
import random
from collections import Counter

In [45]:
class LoadPreprocessData:
    def __init__(self, path, header = None):
        self.dataset = pd.read_csv(path, header = header)
        self.X = self.dataset.iloc[:, :4]
        self.y = self.dataset.iloc[:, 4]
        
    def normalize(self, axis = 0):
        return ((self.X - self.X.mean(axis = axis)) / self.X.std(axis = axis))
    
    def train_test_split(self, percent = 80):
        train_rows = random.sample(range(0, self.y.size), percent * self.y.size // 100)
        train_rows.sort()
        test_rows=[rows for rows in self.X.index.values if rows not in train_rows]
        self.train_X = self.X.iloc[train_rows].reset_index(drop = True) 
        self.train_y = self.y.iloc[train_rows].reset_index(drop = True) 
        
        self.test_X = self.X.iloc[test_rows].reset_index(drop = True) 
        self.test_y = self.y.iloc[test_rows].reset_index(drop = True) 

In [46]:
dataClass = LoadPreprocessData("data_banknote_authentication.txt", None)
dataClass.normalize()
dataClass.train_test_split()
dataClass.train_X

Unnamed: 0,0,1,2,3
0,3.62160,8.66610,-2.8073,-0.44699
1,4.54590,8.16740,-2.4586,-1.46210
2,3.86600,-2.63830,1.9242,0.10645
3,3.45660,9.52280,-4.0112,-3.59440
4,0.32924,-4.45520,4.5718,-0.98880
...,...,...,...,...
1092,0.40614,1.34920,-1.4501,-0.55949
1093,-1.38870,-4.87730,6.4774,0.34179
1094,-3.75030,-13.45860,17.5932,-2.77710
1095,-3.56370,-8.38270,12.3930,-1.28230


In [47]:
class LogisticRegression:
    def __init__(self, lr=0.01, reg_factor = 0.01, num_iter=100000, fit_intercept=True, verbose=False):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose
        self.reg_factor = reg_factor
    def initialize_weights(self, X, weights = "0"):
        if weights == "0":
            self.theta = np.zeros(X.shape[1])
        elif weights == "gaussian":
            self.theta = np.random.randn(X.shape[1])
        elif weights == "uniform":
            self.theta = np.random.uniform(size = X.shape[1])
        elif weights == "xavier":
            self.theta = np.random.randn(X.shape[1]) * np.sqrt(1 / X.shape[1])
    def __add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)
    
    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    def __loss(self, h, y, type = "no regularisation"):
        if(type == "no regularisation"):
            return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
        
        elif type == "L1":
            return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean() - 0.5 * self.reg_factor * np.sum(np.abs(self.theta)) / y.size
        
        else:
            return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean() - 0.5 * self.reg_factor * np.sum(self.theta ** 2) / y.size
    
    def fit(self, X, y , type = "L2", weights = "0"):
        if self.fit_intercept:
            X = self.__add_intercept(X)
        
        self.initialize_weights(X, weights)
        
        for i in range(self.num_iter):
            z = np.dot(X, self.theta)
            h = self.__sigmoid(z)
            if type == "no regularisation":
                gradient = np.dot(X.T, (h - y))
                
            elif type == "L1":
                gradient = np.dot(X.T, (h - y)) + 0.5 * self.reg_factor * np.sign(self.theta)
                
            else:
                gradient = np.dot(X.T, (h - y)) + 0.5 * self.reg_factor * self.theta 
                
            self.theta -= self.lr * gradient / y.size
            
            if(self.verbose == True and i % 5000 == 0):
                z = np.dot(X, self.theta)
                h = self.__sigmoid(z)
                print(f'loss: {self.__loss(h, y, type)} Iteration: {i}\t')
    
    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)
    
        return self.__sigmoid(np.dot(X, self.theta))
    
    def predict(self, X, threshold):
        return self.predict_prob(X) >= threshold

In [85]:
model = LogisticRegression(lr=0.1, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "uniform")

loss: 1.675200920653825 Iteration: 0	
loss: 0.02424214398614986 Iteration: 5000	
loss: 0.021438715285342105 Iteration: 10000	
loss: 0.020172786091272415 Iteration: 15000	
Wall time: 13.8 s


In [86]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9854545454545455

In [87]:
weights = model.theta
weights

array([ 4.73449056, -5.11265963, -2.74830395, -3.39997075, -0.2581917 ])

In [88]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [89]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [90]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9854545454545455

In [91]:
accuracy.F_score()
accuracy.fScore

0.983050847457627

In [92]:
model = LogisticRegression(lr=0.1, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "xavier")

loss: 0.27736067065855097 Iteration: 0	
loss: 0.02423264072309868 Iteration: 5000	
loss: 0.0214352735045491 Iteration: 10000	
loss: 0.020170891126263035 Iteration: 15000	
Wall time: 14.6 s


In [93]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9854545454545455

In [94]:
weights = model.theta
weights

array([ 4.73503851, -5.11335792, -2.74865104, -3.40042049, -0.25825942])

In [95]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [96]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [97]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9854545454545455

In [98]:
accuracy.F_score()
accuracy.fScore

0.983050847457627

In [99]:
model = LogisticRegression(lr=0.1, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "0")

loss: 0.509406850758869 Iteration: 0	
loss: 0.024298635586253437 Iteration: 5000	
loss: 0.021459006815230403 Iteration: 10000	
loss: 0.020183939577149163 Iteration: 15000	
Wall time: 14.6 s


In [100]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9854545454545455

In [101]:
weights = model.theta
weights

array([ 4.7312718 , -5.10855791, -2.74626515, -3.39732902, -0.25779409])

In [102]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [103]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [104]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9854545454545455

In [105]:
accuracy.F_score()
accuracy.fScore

0.983050847457627

In [109]:
model = LogisticRegression(lr=0.1, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "gaussian")

loss: 3.549175298723947 Iteration: 0	
loss: 0.02433049451771495 Iteration: 5000	
loss: 0.021470342949200284 Iteration: 10000	
loss: 0.02019015695694993 Iteration: 15000	
Wall time: 15.1 s


In [110]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9854545454545455

In [111]:
weights = model.theta
weights

array([ 4.72948235, -5.10627758, -2.74513171, -3.39586037, -0.25757312])

In [112]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [113]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [114]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9854545454545455

In [115]:
accuracy.F_score()
accuracy.fScore

0.983050847457627

In [116]:
model = LogisticRegression(lr=0.01, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "0")

loss: 0.6628124020031013 Iteration: 0	
loss: 0.04924654705850929 Iteration: 5000	
loss: 0.03732766948559046 Iteration: 10000	
loss: 0.03266024046983874 Iteration: 15000	
Wall time: 15.3 s


In [117]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9854545454545455

In [118]:
weights = model.theta
weights

array([ 2.69616006, -2.64669282, -1.50026401, -1.78476714, -0.10017378])

In [119]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [120]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [121]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9854545454545455

In [122]:
accuracy.F_score()
accuracy.fScore

0.983050847457627

In [123]:
model = LogisticRegression(lr=0.01, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "uniform")

loss: 4.085047315419675 Iteration: 0	
loss: 0.04968760838235737 Iteration: 5000	
loss: 0.037458622104509724 Iteration: 10000	
loss: 0.03272421722859906 Iteration: 15000	
Wall time: 14.5 s


In [124]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9854545454545455

In [125]:
weights = model.theta
weights

array([ 2.69207466, -2.64326713, -1.498501  , -1.78243257, -0.10047021])

In [126]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [127]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [128]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9854545454545455

In [129]:
accuracy.F_score()
accuracy.fScore

0.983050847457627

In [130]:
model = LogisticRegression(lr=0.01, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "xavier")

loss: 0.2679312664296707 Iteration: 0	
loss: 0.04528540802130191 Iteration: 5000	
loss: 0.03604989721798137 Iteration: 10000	
loss: 0.03201897161086677 Iteration: 15000	
Wall time: 15 s


In [131]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9854545454545455

In [132]:
weights = model.theta
weights

array([ 2.74045543, -2.68149747, -1.51764326, -1.80816848, -0.09585506])

In [133]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [134]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [135]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9854545454545455

In [136]:
accuracy.F_score()
accuracy.fScore

0.983050847457627

In [137]:
model = LogisticRegression(lr=0.01, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "gaussian")

loss: 0.21331740317697462 Iteration: 0	
loss: 0.04116705429617884 Iteration: 5000	
loss: 0.03446788588938644 Iteration: 10000	
loss: 0.031156000953232784 Iteration: 15000	
Wall time: 14.9 s


In [138]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9854545454545455

In [139]:
weights = model.theta
weights

array([ 2.80102662, -2.73559621, -1.54568004, -1.8449781 , -0.09262387])

In [140]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [141]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [142]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9854545454545455

In [143]:
accuracy.F_score()
accuracy.fScore

0.983050847457627

In [144]:
model = LogisticRegression(lr=0.0001, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "0")

loss: 0.6928290524614211 Iteration: 0	
loss: 0.3458834252299731 Iteration: 5000	
loss: 0.2640747576374346 Iteration: 10000	
loss: 0.223854443663995 Iteration: 15000	
Wall time: 14.4 s


In [145]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9418181818181818

In [146]:
weights = model.theta
weights

array([ 0.10156493, -0.67224361, -0.33153082, -0.27966781, -0.13711002])

In [147]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [148]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [149]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9418181818181818

In [150]:
accuracy.F_score()
accuracy.fScore

0.9285714285714286

In [151]:
model = LogisticRegression(lr=0.0001, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "uniform")

loss: 2.4677734903104547 Iteration: 0	
loss: 0.9567065628156952 Iteration: 5000	
loss: 0.40553399961455855 Iteration: 10000	
loss: 0.26310648210042603 Iteration: 15000	
Wall time: 15.1 s


In [152]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9272727272727272

In [153]:
weights = model.theta
weights

array([ 0.52407638, -0.66689915, -0.23574021, -0.23762148,  0.14702761])

In [154]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [155]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [156]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9272727272727272

In [157]:
accuracy.F_score()
accuracy.fScore

0.9090909090909091

In [158]:
model = LogisticRegression(lr=0.0001, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "xavier")

loss: 1.690305098055024 Iteration: 0	
loss: 0.7194031125123025 Iteration: 5000	
loss: 0.36247927277108816 Iteration: 10000	
loss: 0.26822901181092973 Iteration: 15000	
Wall time: 14.6 s


In [159]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9490909090909091

In [160]:
weights = model.theta
weights

array([ 0.00182642, -0.56190925, -0.3220027 , -0.24925233, -0.17605471])

In [161]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [162]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [163]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9490909090909091

In [164]:
accuracy.F_score()
accuracy.fScore

0.9380530973451328

In [165]:
model = LogisticRegression(lr=0.0001, num_iter=20000, verbose = True)
%time model.fit(dataClass.train_X, dataClass.train_y, weights = "gaussian")

loss: 2.0833848660155394 Iteration: 0	
loss: 0.714689082948707 Iteration: 5000	
loss: 0.3195781201611715 Iteration: 10000	
loss: 0.22787683794116478 Iteration: 15000	
Wall time: 14.3 s


In [166]:
preds = model.predict(dataClass.test_X, threshold = 0.5)
# accuracy
(preds == dataClass.test_y).mean()

0.9818181818181818

In [167]:
weights = model.theta
weights

array([ 0.53274848, -0.56993014, -0.3754901 , -0.32948634, -0.15240939])

In [168]:
predicted = model.predict(dataClass.test_X, threshold = 0.5)
actual = dataClass.test_y

In [169]:
class Accuracy:
    def __init__(self, predicted, actual):
        self.predicted = predicted
        self.actual = actual
    def accuracy(self):
        return (self.predicted == self.actual).mean()
    def F_score(self):
        counter = Counter(zip(self.predicted, self.actual))
        self.truePositive = counter[True, 1]
        self.falsePositive = counter[True, 0]
        self.trueNegative = counter[False, 0]
        self.falseNegative = counter[False, 1]
        
        self.precision = self.truePositive / (self.truePositive + self.falsePositive)
        self.recall = self.truePositive / (self.truePositive + self.falseNegative)
        self.fScore = (2 * self.precision * self.recall) / (self.precision + self.recall)

In [170]:
accuracy = Accuracy(predicted, actual)
accuracy.accuracy()

0.9818181818181818

In [171]:
accuracy.F_score()
accuracy.fScore

0.9789029535864979