In [1]:
import pandas as pd
import numpy as np
import logging
from sklearn.preprocessing import StandardScaler 


class Datapoint():
    """A single observation holding a ``km`` value and a ``price`` value."""

    def __init__(self, km, price):
        self.km, self.price = km, price

    def __str__(self):
        """Return the ``km: ..., price ...`` text used for both str() and repr()."""
        text = f"km: {self.km}, price {self.price}"
        return text

    # repr() shows exactly the same text as str().
    __repr__ = __str__

class Dataset():
    """Numeric regression dataset read from a CSV file.

    The last CSV column is the target ``y``; every other column is a feature.
    After construction ``x`` has a leading column of ones (for the intercept)
    followed by the feature columns, optionally standardized.

    Iterating yields ``(x_row, y_value)`` pairs. The object is its own iterator
    (cursor kept in ``self.i``), so nested loops over the same instance share
    one cursor.
    """

    def __init__(self, path, standardize = True):
        """Load ``path`` and optionally standardize features and target.

        Args:
            path: passed straight to ``pandas.read_csv``.
            standardize: when true, scale ``x`` and ``y`` with a
                ``StandardScaler`` each; when false, keep the raw values.

        Raises:
            ValueError: if the file cannot be read as purely numeric data
                with at least two columns.
        """
        self.data = None
        self.i = -1
        try:
            self.read_csv(path)
        except Exception as e:
            print(f"Please give a valid input, only numeric data is accepted\n{e}")
            raise ValueError(f"could not load a numeric dataset from {path!r}") from e
        self.standardized = False
        # Test the *argument*: ``self.standardize`` is the bound method and is
        # always truthy, which made ``standardize=False`` impossible to honour.
        if standardize:
            self.standardize()
        self.add_ones_to_x()


    def destandardize(self):
        """Map ``x`` (all but the ones column) and ``y`` back to original units.

        Does nothing when the data is not currently standardized, so calling
        it twice (or on a dataset built with ``standardize=False``) is safe.
        """
        if not self.standardized:
            return
        self.x[:,1:] = self.x_scaler.inverse_transform(self.x[:,1:])
        restored_y = self.y_scaler.inverse_transform(self.y[:, np.newaxis])
        self.y = np.reshape(restored_y, restored_y.shape[0])
        self.standardized = False


    def standardize(self):
        """Fit one scaler on ``x`` and one on ``y`` and replace both with scaled values.

        Called by ``__init__`` before the ones column is added, so the scalers
        are fitted on the feature columns only.
        """
        self.x_scaler = StandardScaler()
        self.y_scaler = StandardScaler()
        # The scaler works on 2-D input, so treat y as a single column.
        y_column = self.y[:, np.newaxis]
        self.x_scaler.fit(self.x)
        self.y_scaler.fit(y_column)
        self.x = self.x_scaler.transform(self.x)
        scaled_y = self.y_scaler.transform(y_column)
        self.y = np.reshape(scaled_y, scaled_y.shape[0])
        self.standardized = True


    def read_csv(self, path):
        """Read the CSV as float64 and split it into ``x`` (features) and ``y`` (last column).

        Sets ``data``, ``m`` (row count), ``p`` (feature count), ``x`` and ``y``.

        Raises:
            ValueError: if the table does not have at least two columns
                (one feature plus the target).
        """
        self.data = pd.read_csv(path, dtype = np.float64).to_numpy()
        if self.data.ndim != 2 or self.data.shape[1] < 2:
            logging.error(f"Input needs to have at least two dimensions. Input dimension was {self.data.shape}")
            raise ValueError(f"expected at least two columns, got shape {self.data.shape}")
        self.m, n_columns = self.data.shape
        self.p = n_columns - 1

        # Copy so that x never aliases the raw table kept in self.data.
        self.x = self.data[:, :self.p].copy()
        self.y = self.data[:, self.p]


    def add_ones_to_x(self):
        """Prepend a column of ones to ``x`` (the intercept term)."""
        ones = np.ones([self.m, 1], dtype = self.x.dtype)
        self.x = np.concatenate((ones, self.x), axis = 1)


    def __getitem__(self, i):
        """Return the pair ``(x[i], y[i])``."""
        return (self.x[i], self.y[i])


    def __len__(self):
        """Number of rows read from the CSV."""
        return (self.data.shape[0])


    def __iter__(self):
        """Reset the cursor and return ``self`` as the iterator."""
        self.i = -1
        return (self)


    def __next__(self):
        """Advance the cursor and return the next ``(x_row, y_value)`` pair."""
        self.i += 1
        if (self.i < len(self)):
            return self[self.i]
        else:
            self.i = -1
            raise StopIteration

In [2]:
# Show every float that NumPy prints in scientific notation with two decimals.
float_formatter = "{:.2E}".format
np.set_printoptions(formatter=dict(float_kind=float_formatter))
from datetime import datetime

def normalize(a):
    """Divide every column of ``a`` by that column's sum.

    Args:
        a: array with at least two dimensions (the column sums are indexed
            with ``[np.newaxis, :]``).

    Returns:
        A new array of the same shape whose columns each sum to one.
    """
    per_column_total = a.sum(axis=0)
    return a / per_column_total[np.newaxis, :]

class Shaman():
    def __init__(self, dataset):
        self.p = dataset.p
        self.dataset = dataset
        self.thetas = np.zeros([self.p + 1], dtype = float)
        self.old_thetas = self.thetas
        self.lr = 1.0
        self.lr_decay = 1.0 / 2
        self.mininal_improvement = 1.0 / 1000
        self.oldcost = 0.0
        self.c = 1.0 / 2
        self.lr_increase = 1.5
        self.start_time = None
        self.time_limit = 2

    def time_stop(self):
        return ((datetime.now() - self.start_time).seconds >= 2)

    def predict(self, data, thetas = None):
        if thetas is None:
            thetas = self.thetas
        return (np.dot(data, thetas))


    def error(self, thetas = None):
        if thetas is None:
            thetas = self.thetas
        predictions = self.predict(self.dataset.x, thetas)
        error = predictions - self.dataset.y
        return (error)


    def mean_squared_error(self, thetas = None):
        if thetas is None:
            thetas = self.thetas   
        squared_error = np.square(self.error(thetas))
        return (np.mean(squared_error) / 2)


    def compute_gradients(self):
        error = self.error()
        gradients = np.dot(error, self.dataset.x)
        gradients = gradients / len(self.dataset.y)
        return (gradients)


    def ajimo_goldstein_condition(self, l2_grad_squared, gradients, lr):
        thetas = self.thetas - lr * gradients
        cost = self.mean_squared_error(thetas)
        objective = self.newcost - (self.c * lr * l2_grad_squared)
        return (cost <= objective)


    def ajimo(self, gradients):
        l2_grad_squared = np.square(gradients).sum()
        lr = self.lr * self.lr_increase
        while (not self.ajimo_goldstein_condition(l2_grad_squared, gradients, lr)):
            lr = lr * self.lr_decay
        self.lr = lr

    def update_thetas(self):
        self.old_thetas = self.thetas
        gradients = self.compute_gradients()
        self.ajimo(gradients)
        self.thetas = self.thetas - (self.lr * gradients)

    
    def update_costs(self):
        tmpold = self.oldcost
        self.oldcost = self.newcost
        self.newcost = self.mean_squared_error()
        return tmpold


    def undo_update_costs(self, tmpold):
        self.newcost =self.oldcost
        self.oldcost = tmpold


    def training_loop(self):
        self.start_time = datetime.now()
        self.newcost = self.mean_squared_error()
        while (not self.time_stop()):
            tmpold = self.oldcost
            self.update_thetas()
            tmpold = self.update_costs()
            if (self.newcost > self.oldcost):
                print("lol")
                self.lr = self.lr * self.lr_decay
                self.thetas = self.old_thetas
                self.undo_update_costs(tmpold)


    def middle_error(self):
        middle_thetas = (self.thetas + self.old_thetas) / 2
        return self.mean_squared_error(middle_thetas)

    
    def should_i_stop(self):
        if abs(self.oldcost - self.newcost) > self.mininal_improvement:
            return False
        if abs(self.middle_error() - self.newcost) > self.mininal_improvement:
            return False
        return True


    def unstandardize_thetas(self):
        self.thetas[1:] = self.thetas[1:] / self.dataset.x_scaler.scale_
        self.thetas[0] = self.thetas[0] - np.dot(self.thetas[1:], self.dataset.x_scaler.mean_)
        self.thetas = self.thetas * self.dataset.y_scaler.scale_
        self.thetas[0] += self.dataset.y_scaler.mean_


    def write_thetas_to_file(self, filename="thetas.csv"):
        self.unstandardize_thetas()
        with open(filename, "w+") as file:
            file.write(",".join([str(x) for x in self.thetas]))


    def __str__(self):
        return (f"Cost: {self.newcost}, Thetas: {self.thetas}, LR {self.lr:4.2E}")


In [31]:
from datetime import datetime
# Build the dataset and the trainer for Fish.csv.
d = Dataset("Fish.csv")
shaman = Shaman(d)

# Wall-clock stamp right before training starts.
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

shaman.training_loop()

# Wall-clock stamp once training_loop() has returned.
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

# Persist the fitted parameters (default file: thetas.csv).
shaman.write_thetas_to_file()

Current Time = 13:28:14
Current Time = 13:28:16


In [5]:
# Load data/data.csv into a Dataset (default arguments).
# NOTE(review): another cell binds `d` to Dataset("Fish.csv"); which dataset
# `d` refers to depends on the order in which the cells were run.
d = Dataset("data/data.csv")

In [8]:
# Map d.x (all but the leading ones column) and d.y back through the fitted
# scalers, then print both arrays. destandardize() modifies `d` in place.
d.destandardize()
print(d.x, d.y)

[[1.00E+00 2.40E+05]
 [1.00E+00 1.40E+05]
 [1.00E+00 1.50E+05]
 [1.00E+00 1.86E+05]
 [1.00E+00 1.76E+05]
 [1.00E+00 1.15E+05]
 [1.00E+00 1.67E+05]
 [1.00E+00 8.90E+04]
 [1.00E+00 1.44E+05]
 [1.00E+00 8.40E+04]
 [1.00E+00 8.20E+04]
 [1.00E+00 6.31E+04]
 [1.00E+00 7.40E+04]
 [1.00E+00 9.75E+04]
 [1.00E+00 6.70E+04]
 [1.00E+00 7.60E+04]
 [1.00E+00 4.82E+04]
 [1.00E+00 9.30E+04]
 [1.00E+00 6.09E+04]
 [1.00E+00 6.57E+04]
 [1.00E+00 5.40E+04]
 [1.00E+00 6.85E+04]
 [1.00E+00 2.29E+04]
 [1.00E+00 6.18E+04]] [3.65E+03 3.80E+03 4.40E+03 4.45E+03 5.25E+03 5.35E+03 5.80E+03 5.99E+03
 6.00E+03 6.20E+03 6.39E+03 6.39E+03 6.60E+03 6.80E+03 6.80E+03 6.90E+03
 6.90E+03 6.99E+03 7.49E+03 7.56E+03 7.99E+03 7.99E+03 7.99E+03 8.29E+03]


In [29]:
# Report the fitted model, then print one (predicted, actual) pair per row,
# both mapped back through the target scaler and truncated to integers.
# NOTE(review): assumes shaman.thetas are still in standardized space, i.e.
# this runs before unstandardize_thetas() / write_thetas_to_file() - confirm.
print(shaman)
float_formatter = "{:.0f}".format
np.set_printoptions(formatter={'float_kind':float_formatter})

# Use the scaler of the dataset the model was trained on; the global `d` is
# rebound to a different CSV in another cell.
y_scaler = shaman.dataset.y_scaler
for x, y in shaman.dataset:
    # inverse_transform works on (n_samples, n_features): wrap each scalar as 1x1.
    predicted = y_scaler.inverse_transform([[shaman.predict(x)]])[0][0]
    actual = y_scaler.inverse_transform([[y]])[0][0]
    print(f"{int(predicted), int(actual)}")

Cost: 0.028371475606962415, Thetas: [7.22E-16 2.78E-01 3.36E-01 -4.21E-01 6.03E-01 1.97E-01], LR 1.95E-09
(225, 242)
(307, 290)
(327, 340)
(369, 363)
(382, 430)
(447, 450)
(508, 500)
(385, 390)
(478, 450)
(502, 500)
(517, 475)
(512, 500)
(458, 500)
(505, 340)
(578, 600)
(632, 600)
(589, 700)
(585, 700)
(641, 610)
(600, 650)
(630, 575)
(682, 685)
(647, 620)
(686, 680)
(709, 700)
(733, 725)
(745, 720)
(751, 714)
(794, 850)
(958, 1000)
(902, 920)
(906, 955)
(985, 925)
(1011, 975)
(922, 950)


In [20]:
# Predict the target for every row of the training matrix with the current thetas.
# NOTE(review): thetas and shaman.dataset.x must be on the same scale for the
# result to be meaningful; the commented-out lines below change one or the other.
# shaman.dataset.x[:,1:] = shaman.dataset.x_scaler.inverse_transform(shaman.dataset.x[:,1:])
shaman.predict(shaman.dataset.x)
# shaman.unstandardize_thetas()

array([2.26E+02, 3.07E+02, 3.27E+02, 3.69E+02, 3.82E+02, 4.47E+02,
       5.09E+02, 3.86E+02, 4.78E+02, 5.03E+02, 5.17E+02, 5.13E+02,
       4.59E+02, 5.05E+02, 5.78E+02, 6.33E+02, 5.90E+02, 5.85E+02,
       6.41E+02, 6.00E+02, 6.30E+02, 6.82E+02, 6.47E+02, 6.87E+02,
       7.09E+02, 7.33E+02, 7.45E+02, 7.52E+02, 7.95E+02, 9.59E+02,
       9.02E+02, 9.07E+02, 9.86E+02, 1.01E+03, 9.22E+02])

In [32]:
# Display the current parameter vector; thetas[0] multiplies the leading ones
# column of x, i.e. it is the intercept.
shaman.thetas

array([-9.39E+02, 1.62E+01, 1.80E+01, -2.12E+01, 6.42E+01, 5.71E+01])

In [34]:
# Read back the comma-separated parameters from thetas.csv, the default file
# name used by Shaman.write_thetas_to_file().
np.genfromtxt('thetas.csv', delimiter=',')

array([8.50E+03, -2.14E-02])

In [95]:
# Map the thetas back to the original feature/target units. The method runs
# the same four statements that used to be spelled out in this cell.
shaman.unstandardize_thetas()

In [97]:
# Predict for one hand-written sample; the leading 1 lines up with the ones
# column that Dataset.add_ones_to_x() prepends to x.
# NOTE(review): the values look like raw (unstandardized) features, so the
# thetas presumably must already be unstandardized here - confirm.
shaman.predict([1,23.2,25.4,30,11.52,4.02])

225.8156460509479