In [229]:
import logging

import numpy as np
import pandas as pd


class Datapoint():
    """Simple record pairing a ``km`` value with a ``price`` value."""

    def __init__(self, km, price):
        self.km, self.price = km, price

    def __repr__(self):
        """Return the one-line text form shared by ``repr()`` and ``str()``."""
        return "km: {}, price {}".format(self.km, self.price)

    # str() and repr() intentionally render the same text.
    __str__ = __repr__

class Dataset():
    """Regression dataset loaded from a CSV file.

    The last CSV column is used as the target; every preceding column is a
    feature. A leading column of ones is prepended to the features so that
    the first coefficient of a linear model acts as the intercept.

    Attributes:
        data: Raw CSV content as a 2-D numpy array.
        m: Number of rows (samples).
        p: Number of feature columns (excluding the prepended ones column).
        x: ``(m, p + 1)`` array: ones column followed by the features.
        y: Length-``m`` array holding the last CSV column.
        i: Cursor used only by the legacy ``next(dataset)`` protocol.
    """

    def __init__(self, path):
        """Load the CSV at ``path``; see ``read_csv`` for the failure mode."""
        self.data = None
        self.i = -1
        self.read_csv(path)


    def read_csv(self, path):
        """Read ``path`` and populate ``data``, ``m``, ``p``, ``x`` and ``y``.

        Raises:
            ValueError: If the file has fewer than two columns, i.e. there is
                no feature column left once the target is split off.
        """
        self.data = pd.read_csv(path).to_numpy()
        # DataFrame.to_numpy() is always 2-D, so the shape lookups themselves
        # cannot fail; the column count is what has to be checked explicitly.
        if self.data.shape[1] < 2:
            logging.error(f"Input needs to have at least two dimensions. Input dimension was {self.data.shape}")
            raise ValueError(
                f"expected at least two columns (features + target), got shape {self.data.shape}"
            )

        self.m, n_columns = self.data.shape
        self.p = n_columns - 1

        features = self.data[:, :self.p]
        bias = np.ones([self.m, 1], dtype = features.dtype)
        self.x = np.concatenate((bias, features), axis = 1)
        self.y = self.data[:, self.p]


    def __getitem__(self, i):
        """Return the ``(x_row, y_value)`` pair stored at index ``i``."""
        return (self.x[i], self.y[i])


    def __len__(self):
        """Return the number of rows."""
        return self.data.shape[0]


    def __iter__(self):
        """Yield every ``(x_row, y_value)`` pair in row order.

        Each call creates an independent generator, so nested or overlapping
        loops over the same dataset do not disturb one another.
        """
        for index in range(len(self)):
            yield self[index]


    def __next__(self):
        """Legacy cursor-based iteration kept for callers using ``next(dataset)``."""
        self.i += 1
        if self.i < len(self):
            return self[self.i]
        # Rewind so that a later round of next() calls starts from the first row.
        self.i = -1
        raise StopIteration

In [248]:
# Print every float inside numpy arrays in two-decimal scientific notation.
float_formatter = "{:.2E}".format
np.set_printoptions(formatter=dict(float_kind=float_formatter))

def normalize(a):
    """Divide each column of the 2-D array ``a`` by that column's sum.

    Returns a new array of the same shape; ``a`` itself is not modified.
    """
    column_totals = a.sum(axis=0)
    # Index with a leading None so the totals broadcast as a single row.
    return a / column_totals[None, :]

class Shaman():
    """Linear model fitted by gradient descent with a shrinking learning rate.

    Predictions are ``np.dot(x, thetas)``; ``thetas`` has ``p + 1`` entries, so
    ``x`` must carry ``p + 1`` columns. Training repeatedly takes a step,
    measures the mean squared error and, when the error got worse, rolls the
    step back and multiplies the learning rate by ``lr_decay``.

    Attributes:
        p: Number of features, copied from ``dataset.p``.
        thetas: Current coefficient vector.
        old_thetas: Coefficient vector before the most recent update.
        lr: Current learning rate (starts at 1.0, only ever shrinks).
        lr_decay: Factor applied to ``lr`` after a rejected step.
        mininal_improvement: Cost difference below which training may stop.
            The spelling is kept as is because it is a public attribute.
        newcost: Mean squared error of the current ``thetas``.
        oldcost: Mean squared error recorded before ``newcost``.
    """

    def __init__(self, dataset):
        """Create a zero-initialised model sized for ``dataset.p`` features."""
        self.p = dataset.p
        self.thetas = np.zeros([self.p + 1], dtype = float)
        # Independent copy so in-place edits of one vector cannot leak into the other.
        self.old_thetas = self.thetas.copy()
        self.lr = 1.0
        self.lr_decay = 1.0 / 2
        self.mininal_improvement = 0.1
        self.newcost = 0.0
        self.oldcost = 0.0


    def predict(self, data, thetas = None):
        """Return ``np.dot(data, thetas)``, defaulting to the model's own thetas."""
        if thetas is None:
            thetas = self.thetas
        return np.dot(data, thetas)


    def error(self, dataset, thetas = None):
        """Return the per-sample residuals ``prediction - dataset.y``."""
        if thetas is None:
            thetas = self.thetas
        return self.predict(dataset.x, thetas) - dataset.y


    def mean_squared_error(self, dataset, thetas = None):
        """Return the mean of the squared residuals over ``dataset``."""
        if thetas is None:
            thetas = self.thetas
        return np.mean(np.square(self.error(dataset, thetas)))


    def compute_gradients(self, dataset):
        """Return the update direction used by ``update_thetas``.

        The residuals are dotted with the column-sum-normalised features and
        divided by the sample count, i.e. component ``j`` equals
        ``sum(error * x[:, j]) / sum(x[:, j]) / m``.
        """
        # NOTE(review): this is the MSE gradient rescaled per column by
        # 1 / column-sum, not the plain gradient. It is a descent direction
        # only while every column sum is positive and is undefined for a
        # zero column sum - confirm this scaling is intended.
        residuals = self.error(dataset)
        direction = np.dot(residuals, normalize(dataset.x))
        return direction / len(dataset.y)


    def update_thetas(self, dataset):
        """Remember the current thetas, then take one step of size ``lr``."""
        self.old_thetas = self.thetas
        self.thetas = self.thetas - (self.lr * self.compute_gradients(dataset))


    def update_costs(self, dataset):
        """Shift ``newcost`` into ``oldcost`` and recompute ``newcost``.

        Returns:
            The previous ``oldcost`` so that ``undo_update_costs`` can restore it.
        """
        displaced = self.oldcost
        self.oldcost = self.newcost
        self.newcost = self.mean_squared_error(dataset)
        return displaced


    def undo_update_costs(self, tmpold):
        """Reverse the effect of the matching ``update_costs`` call."""
        self.newcost = self.oldcost
        self.oldcost = tmpold


    def training_loop(self, dataset, max_iterations = 10000, verbose = True):
        """Run gradient descent until ``should_i_stop`` is satisfied.

        Args:
            dataset: Object exposing ``x`` and ``y`` arrays.
            max_iterations: Upper bound on update attempts. Pass ``None`` to
                run without a bound (the behaviour before this cap existed).
            verbose: When true, print the model state after every attempt.

        A step that raises the cost - or produces a non-finite cost - is
        rolled back and the learning rate is multiplied by ``lr_decay``.
        A warning is logged when the bound is reached before convergence.
        """
        self.newcost = self.mean_squared_error(dataset)
        attempts = 0
        while max_iterations is None or attempts < max_iterations:
            attempts += 1
            self.update_thetas(dataset)
            displaced = self.update_costs(dataset)
            # NaN compares false against everything, so without the explicit
            # finiteness test a NaN cost would be accepted as an improvement.
            step_failed = (not np.isfinite(self.newcost)) or self.newcost > self.oldcost
            if step_failed:
                self.lr = self.lr * self.lr_decay
                self.thetas = self.old_thetas
                self.undo_update_costs(displaced)
            if verbose:
                print(self)
            if self.should_i_stop(dataset):
                return
        logging.warning(f"training_loop stopped after {max_iterations} iterations without converging")


    def middle_error(self, dataset):
        """Return the MSE at the midpoint between current and previous thetas."""
        middle_thetas = (self.thetas + self.old_thetas) / 2
        return self.mean_squared_error(dataset, middle_thetas)


    def should_i_stop(self, dataset):
        """Return True once neither cost comparison exceeds ``mininal_improvement``.

        Two differences are checked against the threshold: ``oldcost`` versus
        ``newcost``, and the midpoint cost versus ``newcost``.
        """
        if abs(self.oldcost - self.newcost) > self.mininal_improvement:
            return False
        if abs(self.middle_error(dataset) - self.newcost) > self.mininal_improvement:
            return False
        return True


    def __str__(self):
        """Return a one-line summary of cost, thetas and learning rate."""
        return (f"Cost: {self.newcost:.2e}, Thetas: {self.thetas}, LR {self.lr:4.2E}")


In [249]:
# Load the CSV, build a model sized for it, and run the training loop.
d = Dataset("data.csv")
shaman = Shaman(d)
shaman.training_loop(d)
# Scratch calls left over from interactive debugging (disabled):
# shaman.predict(d.x[0])
# shaman.predict(d.x)
# print(shaman)
# shaman.update_thetas(d)
# print(shaman)
# shaman.compute_gradients(d)

8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.85E-02], LR 2.44E-04
Cost: 8.82e+06, Thetas: [2.13E+03 2.8

KeyboardInterrupt: 

In [115]:
# Scratch check of element-wise scaling: `twos` is `ones` with every entry doubled.
ones = np.array([2.0, 1.0])
twos = 2 * ones

In [195]:
# Display the design matrix with each column divided by its column sum.
normalize(d.x)
# Inline equivalent of the call above:
# d.x / d.x.sum(axis=0)[np.newaxis, :]

array([[0.04166667, 0.098945  ],
       [0.04166667, 0.05763546],
       [0.04166667, 0.06204676],
       [0.04166667, 0.07648861],
       [0.04166667, 0.07255967],
       [0.04166667, 0.04732869],
       [0.04166667, 0.06876677],
       [0.04166667, 0.0366921 ],
       [0.04166667, 0.05957313],
       [0.04166667, 0.03463075],
       [0.04166667, 0.03381816],
       [0.04166667, 0.0259978 ],
       [0.04166667, 0.03050804],
       [0.04166667, 0.04019641],
       [0.04166667, 0.02762215],
       [0.04166667, 0.03134289],
       [0.04166667, 0.01988588],
       [0.04166667, 0.03834119],
       [0.04166667, 0.02512749],
       [0.04166667, 0.02707547],
       [0.04166667, 0.02226262],
       [0.04166667, 0.02824055],
       [0.04166667, 0.00944059],
       [0.04166667, 0.0254738 ]])

In [138]:
# Toy length-4 vector and 4x3 matrix for checking how np.dot combines them.
e = np.arange(1, 5)
inputs = np.array([
    [1, 2, 3],
    [2, 2, 2],
    [3, 3, 3],
    [4, 4, 4],
])

In [139]:
# Show the vector, two blank lines, then the matrix.
print(e, "\n", inputs, sep="\n")

[1 2 3 4]


[[1 2 3]
 [2 2 2]
 [3 3 3]
 [4 4 4]]


In [140]:
# Dot product of the length-4 vector with the 4x3 matrix: one value per matrix column.
np.dot(e, inputs)

array([30, 31, 32])

In [141]:
# Inspect the design matrix built by Dataset (ones column first, then the features).
print(d.x)

[[     1 240000]
 [     1 139800]
 [     1 150500]
 [     1 185530]
 [     1 176000]
 [     1 114800]
 [     1 166800]
 [     1  89000]
 [     1 144500]
 [     1  84000]
 [     1  82029]
 [     1  63060]
 [     1  74000]
 [     1  97500]
 [     1  67000]
 [     1  76025]
 [     1  48235]
 [     1  93000]
 [     1  60949]
 [     1  65674]
 [     1  54000]
 [     1  68500]
 [     1  22899]
 [     1  61789]]
