In [3]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.metrics import accuracy_score
import sklearn
import sklearn.datasets
import sklearn.linear_model as l

In [4]:
def read_data(file_name):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_name : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.

    Raises
    ------
    SystemExit
        With exit status 1 when the file cannot be read or parsed. The reason
        is written to stderr first so the failure is not silent.
    """
    try:
        return pd.read_csv(file_name)
    except Exception as err:
        # Keep the original exit status (1), but report what went wrong
        # instead of terminating without any explanation.
        print("Could not read '{}': {}".format(file_name, err), file=sys.stderr)
        sys.exit(1)

In [5]:
def trainValTestSplit(data):
    """Shuffle ``data`` and cut it into train / validation / test partitions.

    The rows are shuffled with a fixed ``random_state=0`` so the split is
    repeatable. The cut points are ``int(0.7 * n)`` and ``int(0.8 * n)``,
    where ``n`` is the number of rows.

    Returns
    -------
    tuple
        ``(train, val, test)`` slices of the shuffled frame, in that order.
    """
    permuted = data.sample(frac=1, random_state=0)
    n_rows = len(permuted)
    train_stop = int(n_rows * 0.7)
    val_stop = int(n_rows * 0.8)
    return permuted[:train_stop], permuted[train_stop:val_stop], permuted[val_stop:]

In [6]:
def normalize(X, min, max):
    """Min-max scale ``X`` using the supplied bounds.

    Computes ``(X - min) / (max - min)``. The operands may be scalars, NumPy
    arrays or pandas objects; the arithmetic is delegated to them.

    Parameters
    ----------
    X : scalar, numpy.ndarray, pandas.Series or pandas.DataFrame
        Values to scale.
    min, max : same kinds as ``X``
        Lower and upper reference bounds. The names shadow the Python
        built-ins inside this function; they are kept because they are part
        of the existing call signature.

    Returns
    -------
    Same kind as ``X - min``
        The scaled values. Where ``max == min`` the result is ``X - min``
        (0 for the reference data) instead of NaN or inf.
    """
    span = max - min
    # A constant feature has a zero span, and 0/0 would yield NaN. Adding the
    # boolean mask turns every zero span into 1 and leaves all others intact.
    span = span + (span == 0)
    return (X - min) / span

In [7]:
def get_data(file_location):
    """Read the dataset and return scaled feature arrays plus target arrays.

    The file is loaded with ``read_data`` and split with
    ``trainValTestSplit``. Every column except the last is treated as a
    feature and min-max scaled with the minimum and maximum of the training
    partition. The last column is returned as the ``y`` arrays, kept
    two-dimensional with shape ``(n, 1)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    frame = read_data(file_location)
    partitions = trainValTestSplit(frame)

    # The scaling bounds come from the training partition only and are then
    # reused for the validation and test partitions.
    train_features = partitions[0].iloc[:, :-1]
    lower = train_features.min()
    upper = train_features.max()

    features = [np.array(normalize(part.iloc[:, :-1], lower, upper)) for part in partitions]
    targets = [np.array(part.iloc[:, -1:]) for part in partitions]

    return (*features, *targets)

In [8]:
# Hard-coded absolute Windows path to the dataset CSV. Change it (or enable the
# input() line below) before running this notebook on another machine.
absolutePath = r'C:\Users\gulce\Desktop\EEE 8TH SEMESTER\CS 464\Homeworks\HW2\dataset.csv'
# absolutePath = input('Enter the file location of the dataset: ')
# Load the file and unpack the six arrays returned by get_data; the later
# cells read these names directly.
X_train, X_val, X_test, y_train, y_val, y_test = get_data(absolutePath)

In [9]:
# Report the shape of every split, one line per array, in a single print call.
print(
    "X_train shape: {}".format(X_train.shape),
    "X_val shape: {}".format(X_val.shape),
    "X_test shape: {}".format(X_test.shape),
    "y_train shape: {}".format(y_train.shape),
    "y_val shape: {}".format(y_val.shape),
    "y_test shape: {}".format(y_test.shape),
    sep="\n",
)

X_train shape: (42000, 12)
X_val shape: (6000, 12)
X_test shape: (12000, 12)
y_train shape: (42000, 1)
y_val shape: (6000, 1)
y_test shape: (12000, 1)


In [None]:
# Baseline: scikit-learn's logistic regression with default settings.
clf = l.LogisticRegression()
# The target arrays are column vectors of shape (n, 1), while scikit-learn
# expects 1-D targets; flattening avoids the DataConversionWarning.
clf.fit(X_train, y_train.ravel())

# Mean accuracy of the baseline on the held-out test split.
score = clf.score(X_test, y_test.ravel())
print(score)

In [None]:
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of ``z``.

    ``z`` may be a scalar or a NumPy array; for arrays ``np.exp`` applies the
    function element-wise and the result has the same shape.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [97]:
def gaussianInitialization(dimension):
    """Draw a ``(dimension, 1)`` weight vector from a standard normal.

    NumPy's global random generator is reseeded with 9 on every call, so
    repeated calls with the same ``dimension`` return the same values.
    """
    np.random.seed(9)
    shape = (dimension, 1)
    return np.random.normal(0, 1, shape)

In [100]:
def zeroInitialization(dimension):
    """Return a ``(dimension, 1)`` column vector of zeros.

    No random numbers are involved, so NumPy's global random state is left
    untouched.
    """
    return np.zeros((dimension, 1))


In [101]:
class LogisticRegression:
    """Binary logistic regression trained with mini-batch gradient descent.

    Parameters
    ----------
    epochs : int
        Number of passes over the training data.
    learningRate : float
        Step size applied to every gradient update.
    batchSize : int
        Number of consecutive rows used for one gradient update.
    """

    def __init__(self, epochs=100, learningRate=0.001, batchSize=64):
        self.epochs = epochs
        self.learningRate = learningRate
        self.batchSize = batchSize

    def fit(self, X_train, y_train, validation=None):
        """Learn the weights and bias, starting from zeros.

        Parameters
        ----------
        X_train : array-like of shape (m, n_features)
            Training features.
        y_train : array-like of shape (m,) or (m, 1)
            Training targets.
        validation : tuple (X, y), optional
            Data used to print the accuracy after every epoch. When omitted,
            the notebook-level ``X_val`` / ``y_val`` are used if they exist;
            otherwise no accuracy is printed.

        Returns
        -------
        tuple
            ``(w, b)`` with ``w`` of shape (n_features, 1) and ``b`` of
            shape (1, 1).
        """
        X_train = np.asarray(X_train)
        # Keep the targets as a column so that `prob - y` stays (batch, 1)
        # and does not broadcast into a (batch, batch) matrix.
        y_train = np.asarray(y_train).reshape(-1, 1)

        if validation is None:
            try:
                # Fall back to the notebook globals so existing calls of the
                # form fit(X_train, y_train) keep reporting accuracy.
                validation = (X_val, y_val)
            except NameError:
                validation = None

        m = X_train.shape[0]
        w = zeroInitialization(X_train.shape[1])
        b = zeroInitialization(1)
        for epoch in range(self.epochs):
            # Stepping by batchSize yields only non-empty batches; the last
            # one may be shorter than batchSize.
            for startIdx in range(0, m, self.batchSize):
                endIdx = startIdx + self.batchSize
                X_batch = X_train[startIdx:endIdx]
                y_batch = y_train[startIdx:endIdx]
                error = sigmoid(np.dot(X_batch, w) + b) - y_batch
                # Average over the rows actually in this batch.
                batchLen = X_batch.shape[0]
                dw = np.dot(X_batch.T, error) / batchLen
                db = np.sum(error) / batchLen
                w -= self.learningRate * dw
                b -= self.learningRate * db
            if validation is not None:
                y_pred = self.predict(w, b, validation[0], 0.5)
                print('Acc for epoch', epoch, 'is: ', accuracy_score(np.ravel(validation[1]), y_pred))
        return w, b

    def predict(self, w, b, X, threshold):
        """Classify the rows of ``X`` with the given parameters.

        Parameters
        ----------
        w : array-like with n_features elements
            Weights; reshaped to a column vector before use.
        b : scalar or array of shape (1, 1)
            Bias term.
        X : array-like of shape (n, n_features)
            Rows to classify.
        threshold : float
            A row is labelled 1 when its probability is strictly greater
            than this value, otherwise 0.

        Returns
        -------
        numpy.ndarray of shape (n,)
            Float array holding 0.0 / 1.0 labels.
        """
        w = np.reshape(w, (-1, 1))
        prob = sigmoid(np.dot(X, w) + b)
        return (prob[:, 0] > threshold).astype(float)

In [102]:
# Train the from-scratch model: 100 epochs, learning rate 0.001, batches of 10000 rows.
model = LogisticRegression(epochs=100, learningRate=0.001, batchSize=10000)
w, b = model.fit(X_train, y_train)

Acc for epoch 0 is:  0.6366666666666667
Acc for epoch 1 is:  0.6366666666666667
Acc for epoch 2 is:  0.6366666666666667
Acc for epoch 3 is:  0.6366666666666667
Acc for epoch 4 is:  0.6366666666666667
Acc for epoch 5 is:  0.6366666666666667
Acc for epoch 6 is:  0.6366666666666667
Acc for epoch 7 is:  0.6366666666666667
Acc for epoch 8 is:  0.6366666666666667
Acc for epoch 9 is:  0.6366666666666667
Acc for epoch 10 is:  0.6366666666666667
Acc for epoch 11 is:  0.6366666666666667
Acc for epoch 12 is:  0.6366666666666667
Acc for epoch 13 is:  0.6366666666666667
Acc for epoch 14 is:  0.6366666666666667
Acc for epoch 15 is:  0.6366666666666667
Acc for epoch 16 is:  0.6366666666666667
Acc for epoch 17 is:  0.6366666666666667
Acc for epoch 18 is:  0.6366666666666667
Acc for epoch 19 is:  0.6366666666666667
Acc for epoch 20 is:  0.6366666666666667
Acc for epoch 21 is:  0.6366666666666667
Acc for epoch 22 is:  0.6366666666666667
Acc for epoch 23 is:  0.6366666666666667
Acc for epoch 24 is:  0.63