In [1]:
# Libraries
import csv
import random
from math import sqrt
from random import randrange

import matplotlib.pyplot as plt

In [3]:
def str_column_to_float(data):
    """Convert every cell of a row-oriented table to ``float``.

    Args:
        data: Iterable of rows; each row is an iterable of values that
            ``float()`` accepts (e.g. numeric strings read from a CSV).

    Returns:
        A new list of lists with the converted values. ``data`` itself is
        not modified.

    Raises:
        Whatever ``float()`` raises for a cell it cannot convert
        (``ValueError`` for a non-numeric string).
    """
    return [list(map(float, row)) for row in data]

def load_csv(filename):
    """Read ``<filename>.csv`` and return its rows as lists of floats.

    Args:
        filename: Path of the file *without* the ``.csv`` extension; the
            extension is appended here.

    Returns:
        A list of rows, each a list of ``float``. Completely blank lines in
        the file are skipped.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: from ``str_column_to_float`` when a cell is not numeric
            (for example a header row).
    """
    # The context manager guarantees the handle is closed even if parsing
    # fails; newline="" lets the csv module handle line endings itself.
    with open(filename + ".csv", "r", newline="") as csv_file:
        # csv.reader yields [] for a blank line; drop those so every
        # returned row actually has columns.
        data = [row for row in csv.reader(csv_file) if row]
    return str_column_to_float(data)

def split_train_test(data, test_size=0.4, seed=None):
    """Randomly split ``data`` into a training part and a test part.

    Args:
        data: Iterable of rows. It is copied into a new outer list first, so
            the caller's container is not modified (the row objects
            themselves are shared, not copied).
        test_size: Fraction of rows, between 0 and 1 inclusive, moved to the
            test part. The count is ``int(len(data) * test_size)``, i.e.
            rounded towards zero.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            drives the selection so the same seed always produces the same
            split. When ``None`` (default) the module-level ``randrange`` is
            used, exactly as before.

    Returns:
        ``[train_rows, test_rows]``.

    Raises:
        ValueError: if ``test_size`` is outside ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be between 0 and 1")

    # A private generator keeps a seeded split from disturbing (or being
    # disturbed by) the global random state.
    pick = randrange if seed is None else random.Random(seed).randrange

    dataTrain = list(data)
    dataTest = []
    size = int(len(dataTrain) * test_size)
    for _ in range(size):
        # Draw without replacement: each chosen row leaves the training pool.
        idx = pick(len(dataTrain))
        dataTest.append(dataTrain.pop(idx))
    return [dataTrain, dataTest]

def mean(values):
    """Return the arithmetic mean of ``values``.

    ``values`` must support both ``sum()`` and ``len()``. An empty
    collection raises ``ZeroDivisionError``.
    """
    total = sum(values)
    count = len(values)
    return total / count

def variance(values, mean):
    """Return the sum of squared deviations of ``values`` from ``mean``.

    Note that the sum is *not* divided by the number of values, so this is
    the un-normalised quantity rather than the statistical variance.

    Args:
        values: Iterable of numbers.
        mean: The value to measure deviations from (inside this function the
            parameter shadows the module-level ``mean`` function).

    Returns:
        The accumulated squared deviation; ``0`` for an empty iterable.
    """
    squared_deviations = ((value - mean) ** 2 for value in values)
    return sum(squared_deviations)

def covariance(x, mean_x, y, mean_y):
    """Return the sum of products of paired deviations of ``x`` and ``y``.

    The sum is not divided by the number of points, so this is the
    un-normalised co-deviation rather than the statistical covariance.

    Args:
        x: Sequence of numbers; its length decides how many pairs are used.
        mean_x: Value subtracted from each element of ``x``.
        y: Sequence indexed in step with ``x``; it must be at least as long
            as ``x`` or an ``IndexError`` is raised.
        mean_y: Value subtracted from each element of ``y``.

    Returns:
        The accumulated sum as a float (``0.0`` when ``x`` is empty).
    """
    total = 0.0
    for position, x_value in enumerate(x):
        total += (x_value - mean_x) * (y[position] - mean_y)
    return total

def coefficients(dataset):
    """Estimate the intercept and slope of the line ``y = b0 + b1 * x``.

    Column 0 of every row is used as ``x`` and column 1 as ``y``. The slope
    is the ratio ``covariance(...) / variance(...)`` computed over the x
    values, and the intercept follows from the two means.

    Args:
        dataset: Sequence of rows with at least two numeric columns.

    Returns:
        ``[b0, b1]``: intercept first, slope second.

    Raises:
        ZeroDivisionError: if ``dataset`` is empty or every x value is the
            same (the denominator is then zero).
    """
    inputs = [row[0] for row in dataset]
    targets = [row[1] for row in dataset]
    input_mean = mean(inputs)
    target_mean = mean(targets)
    co_deviation = covariance(inputs, input_mean, targets, target_mean)
    slope = co_deviation / variance(inputs, input_mean)
    intercept = target_mean - slope * input_mean
    return [intercept, slope]

def simple_linear_regression(train, test):
    """Fit a line on ``train`` and predict a value for every row of ``test``.

    Args:
        train: Rows passed to ``coefficients`` to obtain intercept and slope.
        test: Rows to predict for; only column 0 of each row is read.

    Returns:
        A list with one prediction ``b0 + b1 * row[0]`` per test row, in the
        same order as ``test``.
    """
    intercept, slope = coefficients(train)
    return [intercept + slope * row[0] for row in test]

def rmse_metric(actual, predicted):
    """Return the root-mean-squared error between two sequences.

    Args:
        actual: Sequence of observed values; its length sets how many pairs
            are compared and is the divisor of the mean.
        predicted: Sequence of predicted values, indexed in step with
            ``actual`` (must be at least as long).

    Returns:
        ``sqrt(mean((predicted[i] - actual[i]) ** 2))`` as a float.

    Raises:
        ZeroDivisionError: if ``actual`` is empty.
        IndexError: if ``predicted`` is shorter than ``actual``.
    """
    squared_error_total = 0.0
    for position, observed in enumerate(actual):
        residual = predicted[position] - observed
        squared_error_total += residual ** 2
    return sqrt(squared_error_total / float(len(actual)))

def evaluate_algorithm(train_set, test_set, algorithm):
    """Run ``algorithm`` and score its predictions with RMSE.

    Args:
        train_set: Rows handed to ``algorithm`` as training data.
        test_set: Rows handed to ``algorithm`` as test data. The last column
            of each row is taken as the true value to compare against.
        algorithm: Callable ``algorithm(train_set, test_set)`` returning one
            prediction per test row.

    Returns:
        The value of ``rmse_metric`` for the true values and the predictions.
    """
    # Predict first, then collect the ground truth, mirroring the call order
    # callers may rely on.
    predictions = algorithm(train_set, test_set)
    ground_truth = []
    for row in test_set:
        ground_truth.append(row[-1])
    return rmse_metric(ground_truth, predictions)



In [8]:
# Seed the global RNG so the random train/test split, and therefore the
# reported RMSE, is the same on every Restart & Run All.
SEED = 42
random.seed(SEED)

# 'insurance' resolves to insurance.csv (load_csv appends the extension).
dataset = load_csv('insurance')
# Hold out 40% of the rows for testing.
train, test = split_train_test(dataset, 0.4)
# Predictions are kept in their own variable; evaluate_algorithm below fits
# and predicts again internally and does not consume this list.
predictions = simple_linear_regression(train, test)
rmse = evaluate_algorithm(train, test, simple_linear_regression)
print('\nRMSE:', rmse)


RMSE: 31.985410547621942
