# In [1]:
import sys
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
from math import sqrt
from random import seed
from random import randrange

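"""
    Simple linear regression:
        B1 = covariance(x, y) / variance(x)
        B0 = mean(y) - B1 * mean(x)
    Equation:
        y = B0 + B1 * x

    Note: covariance and variance are computed below as plain sums of
    products / squares (no division by n); the common factor cancels in B1.
"""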
def mean(values):
    """Return the arithmetic mean of ``values`` as a float.

    ``values`` must support both ``sum()`` and ``len()``. An empty
    sequence raises ``ZeroDivisionError``.
    """
    total = sum(values)
    count = float(len(values))
    return total / count

def variance(values, mean_val):
    """Return the sum of squared deviations of ``values`` from ``mean_val``.

    Despite the name, the total is NOT divided by the number of values;
    it is the raw sum of ``(x - mean_val) ** 2``. An empty input yields 0.
    """
    return sum((item - mean_val) ** 2 for item in values)
    
def covariance(values_x, values_y, mean_x, mean_y):
    """Return the sum of co-deviations of two equally sized groups.

    Describes how the two groups of numbers change together. The result
    is the raw sum of ``(x - mean_x) * (y - mean_y)`` over paired
    elements; it is not divided by the number of pairs.

    Args:
        values_x: First group of numbers (must support ``len()``).
        values_y: Second group of numbers, same length as ``values_x``.
        mean_x: Mean of ``values_x``.
        mean_y: Mean of ``values_y``.

    Raises:
        ValueError: If the two groups differ in length.
    """
    # Raise instead of print + sys.exit(): a helper must not terminate the
    # interpreter (and with a success status) on bad input.
    if len(values_x) != len(values_y):
        raise ValueError("Groups must have same dimensions.")
    return sum(
        (x - mean_x) * (y - mean_y) for x, y in zip(values_x, values_y)
    )

def coefficients(dataset):
    """Fit ``y = b0 + b1 * x`` to ``dataset`` and return ``[b0, b1]``.

    Args:
        dataset: Iterable of rows; ``row[0]`` is read as x and ``row[1]``
            as y.

    Returns:
        A two-element list ``[b0, b1]`` (intercept, slope).

    Raises:
        ValueError: If the spread of the x values is exactly zero, so no
            slope can be computed.
    """
    x = [row[0] for row in dataset]
    y = [row[1] for row in dataset]

    mean_x, mean_y = mean(x), mean(y)

    spread_x = variance(x, mean_x)
    # Guard the division: numpy scalars would otherwise yield inf/nan
    # silently and plain numbers would raise a bare ZeroDivisionError.
    if spread_x == 0:
        raise ValueError("Cannot fit a line: x values have zero variance.")

    b1 = covariance(x, y, mean_x, mean_y) / spread_x
    # Reuse the means computed above rather than recomputing them.
    b0 = mean_y - b1 * mean_x

    return [b0, b1]

def rmse_metric(y, predictions):
    """Return the root mean square error between ``y`` and ``predictions``.

    Args:
        y: Actual values (must support ``len()``).
        predictions: Predicted values, same length as ``y``.

    Returns:
        ``sqrt(mean((y[i] - predictions[i]) ** 2))`` as a float.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    # Raise instead of print + sys.exit(): a metric helper must not
    # terminate the interpreter on bad input.
    if len(y) != len(predictions):
        raise ValueError("Groups must have same dimensions.")
    if len(y) == 0:
        raise ValueError("Cannot compute RMSE of empty sequences.")

    sum_error = sum(
        (actual - predicted) ** 2 for actual, predicted in zip(y, predictions)
    )
    return sqrt(sum_error / float(len(y)))

def simple_linear_regression(train, test):
    """Fit a line on ``train`` and predict y for every x value in ``test``.

    ``train`` is passed to ``coefficients`` to obtain the intercept and
    slope; ``test`` is an iterable of x values. Returns the list of
    predictions in the same order as ``test``.
    """
    intercept, slope = coefficients(train)
    return [intercept + slope * x_value for x_value in test]

def evaluate_algorithm(dataset, split):
    """Split ``dataset``, fit on the train part, plot, and return the RMSE.

    The test rows' first column is used as x and their last column as the
    actual y. The test points are shown as a green scatter and the
    predictions as a red line before the error is computed.
    """
    train, test = train_test_split(dataset, split)

    # x values of the held-out rows, fed to the fitted model
    test_set = [row[0] for row in test]

    predicted = simple_linear_regression(train, test_set)
    actual = [row[-1] for row in test]

    plt.scatter(test_set, actual, color='green')
    plt.plot(test_set, predicted, color='red')
    plt.show()

    return rmse_metric(actual, predicted)

def train_test_split(dataset, split):
    """Randomly split ``dataset`` into ``[train, test]`` lists.

    Rows are drawn without replacement via ``randrange`` until the train
    list holds at least ``split * len(dataset)`` rows; whatever is left
    becomes the test list. ``dataset`` itself is not modified.
    """
    remaining = list(dataset)  # work on a copy so the caller's data is untouched
    target = split * len(dataset)
    chosen = []
    while len(chosen) < target:
        pick = randrange(len(remaining))
        chosen.append(remaining.pop(pick))
    return [chosen, remaining]


"""
    Dataset:
        http://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset
"""
seed(2)  # fix the RNG so the randrange-based train/test split is reproducible
data = pd.read_csv('day.csv')

xVariable = 'windspeed'
yVariable = 'temp'

# Build the [x, y] rows in one step from the two selected columns instead of
# doing a label-based Series lookup per row and per column.
dataset = data[[xVariable, yVariable]].values.tolist()

split = 0.8  # fraction of rows used for training
rmse = evaluate_algorithm(dataset, split)
print('RMSE: %.3f' % (rmse))

# Output:
# <Figure size 640x480 with 1 Axes>
#
# RMSE: 0.186
