Funkcja regresji liniowej

Z modułu math importowana jest funkcja sqrt, która zwraca pierwiastek kwadratowy danej liczby.
reader -> zwraca obiekt reader, który będzie iterował po liniach w podanym pliku csv.
Każdy wiersz odczytany z pliku csv jest zwracany jako lista ciągów znaków.

In [11]:
from csv import reader
from math import sqrt
def load_csv(filename):
    """Read a CSV file into a list of rows, skipping empty records.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty record; each entry is the list
        of string fields produced by ``csv.reader``.
    """
    # newline='' hands line-ending handling to the csv module, so newlines
    # embedded in quoted fields are kept as written instead of being
    # translated by the text layer.
    with open(filename, 'r', newline='') as file:
        return [row for row in reader(file) if row]

# Converts the data to numbers.
def str_column_to_float(dataset, column):
    """Replace the value at index ``column`` of every row with a float.

    Each value is stripped of surrounding whitespace before conversion.
    The rows are modified in place and nothing is returned.
    """
    for record in dataset:
        raw_value = record[column]
        record[column] = float(raw_value.strip())
def mean(values):
    """Return the arithmetic mean of ``values`` as a float."""
    total = sum(values)
    count = float(len(values))
    return total / count
 
def variance(values, mean):
    """Return the sum of squared deviations of ``values`` from ``mean``.

    Note that the total is not divided by the number of values, so this is
    the un-normalised sum of squares rather than an averaged variance.
    """
    squared_deviations = [(value - mean) ** 2 for value in values]
    return sum(squared_deviations)

def covariance(x, mean_x, y, mean_y):
    """Return the sum of products of paired deviations of ``x`` and ``y``.

    For every position in ``x`` the deviation from ``mean_x`` is multiplied
    by the deviation of the matching ``y`` element from ``mean_y``. The
    total is not divided by the number of elements.
    """
    total = 0.0
    for idx, x_value in enumerate(x):
        # y is indexed by position, so a shorter y raises IndexError.
        total += (x_value - mean_x) * (y[idx] - mean_y)
    return total

def coefficients(dataset):
    """Estimate the intercept and slope of a simple linear regression.

    The first element of each row is taken as the input value and the
    second as the target value.

    Returns:
        A two-element list ``[b0, b1]`` holding the intercept and the slope.
    """
    inputs = [record[0] for record in dataset]
    targets = [record[1] for record in dataset]
    input_mean = mean(inputs)
    target_mean = mean(targets)
    # Slope is the ratio of the two un-normalised sums.
    co_moment = covariance(inputs, input_mean, targets, target_mean)
    spread = variance(inputs, input_mean)
    slope = co_moment / spread
    intercept = target_mean - slope * input_mean
    return [intercept, slope]

# Root mean squared error.
def rmse_metric(actual, predicted):
    """Return the root mean squared error between two sequences.

    ``predicted`` is read at every index of ``actual``; the squared
    differences are averaged over ``len(actual)`` and the square root of
    that average is returned.
    """
    squared_total = 0.0
    for idx, observed in enumerate(actual):
        squared_total += (predicted[idx] - observed) ** 2
    return sqrt(squared_total / float(len(actual)))
 

def evaluate_algorithm(dataset, algorithm):
    """Run ``algorithm`` on ``dataset`` and return the RMSE of its predictions.

    A copy of every row is made with its last element replaced by ``None``
    so the algorithm cannot read the target value. The algorithm is called
    as ``algorithm(dataset, test_set)``; its predictions are printed and
    then scored against the last element of each original row.
    """
    # Copy the rows first so the caller's dataset is left untouched.
    test_set = [list(row) for row in dataset]
    for masked_row in test_set:
        masked_row[-1] = None
    predicted = algorithm(dataset, test_set)
    print(predicted)
    targets = [row[-1] for row in dataset]
    return rmse_metric(targets, predicted)

def simple_linear_regression(train, test):
    """Fit a line on ``train`` and predict a value for every row of ``test``.

    The intercept and slope come from ``coefficients(train)``; each
    prediction is computed from the first element of the test row.

    Returns:
        A list of predictions, one per row of ``test``.
    """
    intercept, slope = coefficients(train)
    return [intercept + slope * record[0] for record in test]


# Load the data set and convert every column from text to float.
filename = 'insurance.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

# Column 0 holds the inputs, column 1 the targets.
x = [row[0] for row in dataset]
y = [row[1] for row in dataset]

# Summary statistics; the "variance" and covariance values are the
# un-normalised sums returned by the helper functions.
mean_x = mean(x)
mean_y = mean(y)
var_x = variance(x, mean_x)
var_y = variance(y, mean_y)
covar = covariance(x, mean_x, y, mean_y)

# Fit the line, then score it on the same data it was fitted on.
b0, b1 = coefficients(dataset)
rmse = evaluate_algorithm(dataset, simple_linear_regression)

print(f'x stats: mean={mean_x}, variance={var_x}')
print(f'y stats: mean={mean_y}, variance={var_y}')
print('Covariance:', covar)
print(f'Coefficients: b0={b0}, b1={b1}')
print('RMSE:', rmse)

[375.3173278822568, 72.02206541291909, 429.84231888798035, 143.58611610793136, 201.5189190515127, 85.65331316435, 54.983005723630455, 160.62517579721998, 41.35175797219955]
x stats: mean=48.888888888888886, variance=13588.888888888889
y stats: mean=173.8777777777778, variance=165109.31555555554
Covariance: 46308.37777777777
Coefficients: b0=7.27363859362228, b1=3.4078119378577267
RMSE: 28.478204027366786


Kod z użyciem biblioteki numpy

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Load the data set; the file is expected to provide the 'x' and 'y'
# columns that are read below.
data = pd.read_csv('insurance.csv')

X = data['x'].values
Y = data['y'].values

mean_x = np.mean(X)
mean_y = np.mean(Y)

m = len(X)

# Deviations from the mean for all samples at once (vectorised instead of
# a Python-level loop over the array elements).
dev_x = X - mean_x
dev_y = Y - mean_y

# Least-squares slope: sum(dx * dy) / sum(dx ** 2); the intercept follows
# from the two means.
numer = np.sum(dev_x * dev_y)
denom = np.sum(dev_x ** 2)
b1 = numer / denom
b0 = mean_y - (b1 * mean_x)
print(f'b0:{b0}, b1:{b1}')

# Root mean squared error of the fitted line on the same data.
# NOTE: NumPy's summation order differs from a sequential loop, so the
# printed values may differ from a loop-based result in the last digits.
y_pred = b0 + b1 * X
rmse = np.sqrt(np.sum((Y - y_pred) ** 2) / m)
print('RMSE:',rmse)

b0:7.273638593622252, b1:3.4078119378577267
RMSE: 28.478204027366786
