In [26]:
import pandas as pd



#Quick head on the dataset to help my own visualization and understanding of borrowed code.
df = pd.read_csv("insurance.csv")
df.head()

Unnamed: 0,108,392.5
0,19,46.2
1,13,15.7
2,124,422.2
3,40,119.4
4,57,170.9


In [27]:
# Simple Linear Regression on the Swedish Insurance Dataset
#imports to read a csv, cal square root, and make random seeds for the test_train_split
from random import seed
from random import randrange
from csv import reader
from math import sqrt

# Load a CSV file
# analogous to pandas.read_csv() that we've all used hundreds of times
#reads in in line by line
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
# Less abstract version of reading a pandas df column as a data type
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Split a dataset into a train and test set
#equivalent to train_test_split method from scikit learn we've all used
#This is what the random seed generator imports are for
def train_test_split(dataset, split):
    train = list()
    train_size = split * len(dataset)
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy

# Calculate root mean squared error
#Needed for the y=b0+b1x+e calculation's metric. Covered this in class
def rmse_metric(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        prediction_error = predicted[i] - actual[i]
        sum_error += (prediction_error ** 2)
    mean_error = sum_error / float(len(actual))
    return sqrt(mean_error)

# Evaluate an algorithm using a train/test split
#Does exactly what it says. no need to comment more
def evaluate_algorithm(dataset, algorithm, split, *args):
    train, test = train_test_split(dataset, split)
    test_set = list()
    for row in test:
        row_copy = list(row)
        row_copy[-1] = None
        test_set.append(row_copy)
    predicted = algorithm(train, test_set, *args)
    actual = [row[-1] for row in test]
    rmse = rmse_metric(actual, predicted)
    return rmse

# Calculate the mean value of a list of numbers'
#Needed for mse, obvious calculation
def mean(values):
    return sum(values) / float(len(values))

# Calculate covariance between x and y
def covariance(x, mean_x, y, mean_y):
    covar = 0.0
    for i in range(len(x)):
        covar += (x[i] - mean_x) * (y[i] - mean_y)
    return covar

# Calculate the variance of a list of numbers
#Note the distinction of variance vs covariance
def variance(values, mean):
    return sum([(x-mean)**2 for x in values])

# Calculate coefficients
#Coefficients, aka b0 and b1
def coefficients(dataset):
    x = [row[0] for row in dataset]
    y = [row[1] for row in dataset]
    x_mean, y_mean = mean(x), mean(y)
    b1 = covariance(x, x_mean, y, y_mean) / variance(x, x_mean)
    b0 = y_mean - b1 * x_mean
    return [b0, b1]

# Simple linear regression algorithm
#Time for the guest of honor, actually calculates y=b0+b1x+e
def simple_linear_regression(train, test):
    predictions = list()
    b0, b1 = coefficients(train)
    for row in test:
        yhat = b0 + b1 * row[0]
        predictions.append(yhat)
    return predictions

# Simple linear regression on insurance dataset
seed(1)
# load and prepare data
filename =  "insurance.csv"
dataset = load_csv(filename)
for i in range(len(dataset[0])):
    #note the nested for loops because reads in then does another for loop here, simple code but could be improved.
    #Abstract scikit does it more efficiently typically
    str_column_to_float(dataset, i)
# evaluate algorithm
split = 0.6
rmse = evaluate_algorithm(dataset, simple_linear_regression, split)
#Remember RMSE is Relative Mean Squared Error, the main metric for accuracy of linear regression
print('RMSE: %.3f' % (rmse))

RMSE: 33.630
