In [11]:
import csv
import os

In [8]:
def loadData(fileName, inputVariabName, outputVariabName):
    """Load one input column and one output column from a CSV file.

    The first row of the file is treated as the header. A data row is kept
    only when both selected cells are present and non-blank, so the two
    returned sequences always have the same length and stay aligned.

    Args:
        fileName: Path of the CSV file to read.
        inputVariabName: Header name of the input (feature) column.
        outputVariabName: Header name of the output (target) column.

    Returns:
        A pair ``(inputs, outputs)`` of equal-length tuples of floats. Both
        tuples are empty when no row has both values present.

    Raises:
        ValueError: If a requested column name is not in the header, or a
            kept cell cannot be parsed as a float.
    """
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are handled by the reader and not by the file object.
    with open(fileName, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        dataNames = next(csv_reader, [])
        data = list(csv_reader)

    inputColumn = dataNames.index(inputVariabName)
    outputColumn = dataNames.index(outputVariabName)
    lastNeededColumn = max(inputColumn, outputColumn)

    pairs = []
    for row in data:
        # csv.reader yields [] for blank lines; such rows (and any row too
        # short to hold both columns) are treated as missing data.
        if len(row) <= lastNeededColumn:
            continue
        rawInput, rawOutput = row[inputColumn], row[outputColumn]
        if rawInput.strip() == '' or rawOutput.strip() == '':
            continue
        pairs.append((float(rawInput), float(rawOutput)))

    # zip(*[]) produces nothing to unpack, so the empty case is explicit.
    if not pairs:
        return (), ()
    inputs, outputs = zip(*pairs)
    return inputs, outputs


<h3>Propriu - family </h3>

In [32]:
import random

crtDir =  os.getcwd()
filePath = os.path.join(crtDir, 'data', 'v3_world-happiness-report-2017.csv')

inputs, outputs = loadData(filePath, 'Family', 'Happiness.Score')

# Split the data into train (80%) and validation (20%) subsets.
random.seed(5)  # fixed seed so the split is reproducible
indexes = list(range(len(inputs)))
trainSample = random.sample(indexes, int(0.8 * len(inputs)))
# Set membership keeps building the complement linear instead of quadratic;
# iterating over `indexes` keeps the validation rows in ascending order.
trainIndexSet = set(trainSample)
validationSample = [i for i in indexes if i not in trainIndexSet]

trainInputs = [inputs[i] for i in trainSample]
trainOutputs = [outputs[i] for i in trainSample]
validationInputs = [inputs[i] for i in validationSample]
validationOutputs = [outputs[i] for i in validationSample]


# Train the model: closed-form least squares for a single feature,
#   w1 = (n*Sxy - Sx*Sy) / (n*Sxx - Sx^2),   w0 = (Sy - w1*Sx) / n
n = len(trainInputs)
sx = sum(trainInputs)
sy = sum(trainOutputs)
sxy = sum(i * j for i, j in zip(trainInputs, trainOutputs))
sxx = sum(i ** 2 for i in trainInputs)

w1 = (n * sxy - sx * sy) / (n * sxx - sx ** 2)
w0 = (sy - w1 * sx) / n

print('the learnt model: f(x) = ', w0, ' + ', w1, ' * x')

# Predict on the validation data and compute the performance metric
# (mean squared error).
computedValidationOutputs = [w0 + w1 * x for x in validationInputs]

error = 0.0
for t1, t2 in zip(computedValidationOutputs, validationOutputs):
    error += (t1 - t2) ** 2
error = error / len(validationOutputs)
print("prediction error (manual): ", error)


the learnt model: f(x) =  1.808267394166918  +  2.9583363045673554  * x
prediction error (manual):  0.696960935019105


<h3> Propriu - PIB </h3>


In [33]:
import random

crtDir =  os.getcwd()
filePath = os.path.join(crtDir, 'data', 'v3_world-happiness-report-2017.csv')

inputs, outputs = loadData(filePath, 'Economy..GDP.per.Capita.', 'Happiness.Score')

# Split the data into train (80%) and validation (20%) subsets.
random.seed(5)  # fixed seed so the split is reproducible
indexes = list(range(len(inputs)))
trainSample = random.sample(indexes, int(0.8 * len(inputs)))
# Set membership keeps building the complement linear instead of quadratic;
# iterating over `indexes` keeps the validation rows in ascending order.
trainIndexSet = set(trainSample)
validationSample = [i for i in indexes if i not in trainIndexSet]

trainInputs = [inputs[i] for i in trainSample]
trainOutputs = [outputs[i] for i in trainSample]
validationInputs = [inputs[i] for i in validationSample]
validationOutputs = [outputs[i] for i in validationSample]

# Train the model: closed-form least squares for a single feature,
#   w1 = (n*Sxy - Sx*Sy) / (n*Sxx - Sx^2),   w0 = (Sy - w1*Sx) / n
n = len(trainInputs)
sx = sum(trainInputs)
sy = sum(trainOutputs)
sxy = sum(i * j for i, j in zip(trainInputs, trainOutputs))
sxx = sum(i ** 2 for i in trainInputs)

w1 = (n * sxy - sx * sy) / (n * sxx - sx ** 2)
w0 = (sy - w1 * sx) / n

print('the learnt model: f(x) = ', w0, ' + ', w1, ' * x')

# Predict on the validation data and compute the performance metric
# (mean squared error).
computedValidationOutputs = [w0 + w1 * x for x in validationInputs]

error = 0.0
for t1, t2 in zip(computedValidationOutputs, validationOutputs):
    error += (t1 - t2) ** 2
error = error / len(validationOutputs)
print("prediction error (manual): ", error)


the learnt model: f(x) =  3.2432322927841613  +  2.1475159197540834  * x
prediction error (manual):  0.4661599001167347


<h3> Propriu - PIB si Freedom </h3>

In [34]:

def loadData2(fileName, inputVariabName1, inputVariabName2, outputVariabName):
    """Load two input columns and one output column from a CSV file.

    The first row of the file is treated as the header. A data row is kept
    only when all three selected cells are present and non-blank, so the
    returned lists always have the same length and stay aligned.

    Args:
        fileName: Path of the CSV file to read.
        inputVariabName1: Header name of the first input (feature) column.
        inputVariabName2: Header name of the second input (feature) column.
        outputVariabName: Header name of the output (target) column.

    Returns:
        A triple ``(inputs1, inputs2, outputs)`` of equal-length lists of
        floats. All three are empty when no row has every value present.

    Raises:
        ValueError: If a requested column name is not in the header, or a
            kept cell cannot be parsed as a float.
    """
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are handled by the reader and not by the file object.
    with open(fileName, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        dataNames = next(csv_reader, [])
        data = list(csv_reader)

    columns = (
        dataNames.index(inputVariabName1),
        dataNames.index(inputVariabName2),
        dataNames.index(outputVariabName),
    )
    lastNeededColumn = max(columns)

    inputs1, inputs2, outputs = [], [], []
    for row in data:
        # csv.reader yields [] for blank lines; such rows (and any row too
        # short to hold every selected column) are treated as missing data.
        if len(row) <= lastNeededColumn:
            continue
        cells = [row[column] for column in columns]
        if any(cell.strip() == '' for cell in cells):
            continue
        # Convert all three before appending so a parse error cannot leave
        # the lists with different lengths.
        value1, value2, target = (float(cell) for cell in cells)
        inputs1.append(value1)
        inputs2.append(value2)
        outputs.append(target)

    return inputs1, inputs2, outputs



In [38]:
def add_ones_column(matrix):
    """Return a new matrix whose rows are the rows of ``matrix`` prefixed with 1.

    The leading constant column lets the intercept term (w0) be learnt as an
    ordinary weight.
    """
    augmented = []
    for row in matrix:
        augmented.append([1, *row])
    return augmented

def transpose(matrix):
    """Return the transpose of ``matrix`` as a new list of lists.

    Columns are formed with ``zip``, so rows of unequal length are truncated
    to the shortest one.
    """
    return [list(column) for column in zip(*matrix)]

def dot_product(a, b):
    """Return the sum of the pairwise products of ``a`` and ``b``.

    Elements are paired with ``zip``, so any extra trailing elements of the
    longer sequence are ignored.
    """
    products = [left * right for left, right in zip(a, b)]
    return sum(products)

def matrix_multiply(a, b):
    """Return the matrix product ``a * b`` as a list of lists.

    Args:
        a: Left matrix, a sequence of rows.
        b: Right matrix, a sequence of rows.

    Returns:
        A list with one row per row of ``a``; entry ``[r][c]`` is the sum of
        pairwise products of row ``r`` of ``a`` and column ``c`` of ``b``.
    """
    # The columns of b do not depend on the row of a being processed, so they
    # are extracted once up front instead of re-transposing b for every row.
    columns = list(zip(*b))
    return [
        [sum(x * y for x, y in zip(row, column)) for column in columns]
        for row in a
    ]

def inverse(matrix):
    """Invert a square matrix with Gauss-Jordan elimination.

    The input is left untouched: elimination runs on a private copy, which is
    reduced to the identity while the same row operations turn an identity
    matrix into the inverse.

    Rows are swapped only when the diagonal entry is exactly zero, so a
    nearly singular matrix is not detected and may give inaccurate results.

    Args:
        matrix: Square matrix given as a list of rows.

    Returns:
        The inverse as a new list of lists of floats.

    Raises:
        ZeroDivisionError: If no non-zero pivot can be found for some column,
            i.e. the matrix is singular.
    """
    n = len(matrix)
    # Copy the rows so the caller's matrix is not overwritten by elimination.
    work = [list(row) for row in matrix]
    identity = [[float(i == j) for i in range(n)] for j in range(n)]
    for i in range(n):
        if work[i][i] == 0.0:
            # Look below the diagonal for a row with a usable pivot.
            for j in range(i + 1, n):
                if work[j][i] != 0.0:
                    work[i], work[j] = work[j], work[i]
                    identity[i], identity[j] = identity[j], identity[i]
                    break
            else:
                raise ZeroDivisionError('matrix is singular and cannot be inverted')

        # Scale the pivot row so the diagonal entry becomes 1.
        pivot = work[i][i]
        for j in range(i, n):
            work[i][j] /= pivot
        for j in range(n):
            identity[i][j] /= pivot

        # Clear column i in every other row.
        for j in range(n):
            if i != j:
                ratio = work[j][i]
                # Starts at column i: the columns before it are already zero in the pivot row.
                for k in range(i, n):
                    work[j][k] -= ratio * work[i][k]
                for k in range(n):
                    identity[j][k] -= ratio * identity[i][k]
    return identity

def trainModel(trainInputs, trainOutputs):
    """Fit linear-regression weights with the normal equation.

    Computes ``w = (Xt * X)^(-1) * Xt * y``, where ``X`` is ``trainInputs``
    with a leading column of ones and ``y`` is ``trainOutputs``.

    Args:
        trainInputs: Sequence of feature rows, one per training sample.
        trainOutputs: Sequence of target values, one per training sample.

    Returns:
        The list of weights, intercept first, followed by one weight per
        feature column.
    """
    design = add_ones_column(trainInputs)
    design_t = transpose(design)

    # (Xt * X)^(-1)
    gram_inv = inverse(matrix_multiply(design_t, design))
    # (Xt * X)^(-1) * Xt
    gram_inv_xt = matrix_multiply(gram_inv, design_t)

    # Each weight is one row of (Xt * X)^(-1) * Xt dotted with y.
    return [dot_product(row, trainOutputs) for row in gram_inv_xt]


In [42]:
inputVariabNames = ['Economy..GDP.per.Capita.', 'Freedom']
outputVariabName = 'Happiness.Score'
crtDir =  os.getcwd()
filePath = os.path.join(crtDir, 'data', 'v1_world-happiness-report-2017.csv')

inputGDP, inputFreedom, outputs = loadData2(filePath, inputVariabNames[0], inputVariabNames[1],outputVariabName)

# Split the data into training (80%) and validation (20%) subsets.
# The sample count is taken from the data loaded in this cell, not from a
# variable left over from an earlier cell that read a different file.
random.seed(5)  # fixed seed so the split is reproducible
sampleCount = len(outputs)
indexes = list(range(sampleCount))
trainSample = random.sample(indexes, int(0.8 * sampleCount))
# Set membership keeps building the complement linear instead of quadratic;
# iterating over `indexes` keeps the validation rows in ascending order.
trainIndexSet = set(trainSample)
validationSample = [i for i in indexes if i not in trainIndexSet]

# Build the training set.
trainInputsGDP = [inputGDP[i] for i in trainSample]
trainInputsFreedom = [inputFreedom[i] for i in trainSample]
trainOutputs = [outputs[i] for i in trainSample]

# Build the validation set.
validationInputsGDP = [inputGDP[i] for i in validationSample]
validationInputsFreedom = [inputFreedom[i] for i in validationSample]
validationOutputs = [outputs[i] for i in validationSample]

trainInputs = list(zip(trainInputsGDP, trainInputsFreedom))
validationInputs = list(zip(validationInputsGDP, validationInputsFreedom))

# Train the model.
w = trainModel(trainInputs, trainOutputs)
print('the learnt model: f(x) = ', w[0], ' + ', w[1], ' * x1 + ', w[2], ' * x2')

# Predict on the validation set: prepend the constant 1 to every row so the
# intercept w[0] takes part in the dot product with the weights.
X = add_ones_column(validationInputs)

computedValidationOutputs = [dot_product(row, w) for row in X]

# Mean squared error on the validation set.
error = sum((t1 - t2) ** 2 for t1, t2 in zip(computedValidationOutputs, validationOutputs)) / len(validationOutputs)
print('error: ', error)


the learnt model: f(x) =  2.6328794321899767  +  1.7743842047694993  * x1 +  2.3801762253152527  * x2
error:  0.33518060140920786
