In [1]:
## Baseline Models - random prediction algorithm, zero rule prediction algorithm
from random import seed
from random import randrange
from csv import reader
from math import sqrt

# Generate random predictions
def random_algorithm(train, test):
    """Predict a randomly chosen training label for every test row.

    The candidate labels are the distinct values found in the last column
    of ``train``. They are kept in first-seen order rather than in ``set``
    iteration order, so a fixed ``seed()`` yields the same predictions on
    every interpreter run (the iteration order of a set of strings changes
    with hash randomization).

    Args:
        train: rows whose last element is the output value.
        test: rows to predict for; only the number of rows is used.

    Returns:
        A list with one label per row of ``test``.

    Raises:
        ValueError: raised by ``randrange`` when ``train`` is empty while
            ``test`` is not.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique = list(dict.fromkeys(row[-1] for row in train))
    return [unique[randrange(len(unique))] for _ in test]
    
# Fix the random seed before drawing predictions.
seed(1)
# Contrived training rows: each row holds a single output value (0 or 1).
train = [[0], [1], [0], [1], [0], [1]]
# Four test rows whose output value is unknown (None).
test = [[None],[None],[None],[None]]
predictions = random_algorithm(train, test)
print(predictions)



[0, 0, 1, 0]


In [2]:
# zero rule algo.
def zero_rule_algo_classification(train, test):
    """Predict the most frequent training class for every test row.

    Class frequencies are counted in a single pass over ``train``. When
    several classes share the highest count, the one that appears first in
    ``train`` wins, so the result does not depend on ``set`` iteration
    order (which varies between runs for string labels).

    Args:
        train: rows whose last element is the class label.
        test: rows to predict for; only ``len(test)`` is used.

    Returns:
        A list of ``len(test)`` copies of the chosen class.

    Raises:
        ValueError: if ``train`` contains no rows.
    """
    counts = {}
    for row in train:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    if not counts:
        raise ValueError("train must contain at least one row")
    # max() returns the first maximal key, and dicts iterate in insertion
    # (first-seen) order, which makes tie-breaking deterministic.
    prediction = max(counts, key=counts.get)
    # The chosen class is echoed to stdout, as the notebook output shows.
    print (prediction)
    return [prediction] * len(test)

# Reuses the train/test lists defined in the first cell.
result = zero_rule_algo_classification(train, test)
print(result)

0
[0, 0, 0, 0]


In [3]:
# training & test dataset split
def train_test_split(dataset, split=0.60):
    """Randomly partition ``dataset`` into a training part and a test part.

    Rows are drawn without replacement (via ``randrange``) from a shallow
    copy of ``dataset`` until the training part holds at least
    ``split * len(dataset)`` rows; whatever is left forms the test part.
    The input list itself is not modified.

    Args:
        dataset: list of rows to split.
        split: fraction of rows that go to the training part.

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    target_size = split * len(dataset)
    remaining = list(dataset)
    chosen = []
    while len(chosen) < target_size:
        chosen.append(remaining.pop(randrange(len(remaining))))
    return chosen, remaining

In [4]:
# Calculate accuracy percentage between two lists
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where the two lists agree.

    Args:
        actual: list of true values.
        predicted: list of predicted values, indexed in step with ``actual``.

    Returns:
        A float between 0 and 100.

    Raises:
        ZeroDivisionError: if ``actual`` is empty.
        IndexError: if ``predicted`` is shorter than ``actual``.
    """
    total = len(actual)
    hits = sum(1 for pos in range(total) if actual[pos] == predicted[pos])
    return hits / float(total) * 100

In [5]:
# Evaluate an algorithm by RMSE on the same dataset it is trained on (no train/test split is performed here)
def evaluate_algorithm(dataset, algorithm):
    """Score ``algorithm`` by RMSE on the dataset it is trained on.

    A copy of every row is made with its last element replaced by ``None``
    and handed to ``algorithm`` as the test set, while the untouched
    ``dataset`` is passed as the training set. The predictions are printed
    and then compared with the real last-column values.

    Args:
        dataset: list of rows whose last element is the target value.
        algorithm: callable invoked as ``algorithm(train, test)`` that
            returns one prediction per test row.

    Returns:
        The value returned by ``rmse_metric`` for the actual and predicted
        values.
    """
    def hide_target(row):
        # Copy the row so the caller's data keeps its real target value.
        masked = list(row)
        masked[-1] = None
        return masked

    test_set = [hide_target(row) for row in dataset]
    predicted = algorithm(dataset, test_set)
    print(predicted)
    actual = [row[-1] for row in dataset]
    return rmse_metric(actual, predicted)

In [6]:
# Calculate the mean value of a list of numbers
def mean(values):
    """Return the arithmetic mean of ``values`` as a float.

    Raises:
        ZeroDivisionError: if ``values`` is empty.
    """
    total = sum(values)
    count = float(len(values))
    return total / count

# Small contrived dataset of [x, y] pairs; later cells reuse these names.
dataset = [[1,1], [2,3],[4,3],[3,2],[5,5]]
# Split the pairs into a column of inputs and a column of outputs.
x = [row[0] for row in dataset]
y = [row[1] for row in dataset]
m_x = mean(x)
m_y = mean(y)
print(m_x, m_y)



3.0 2.8


In [7]:
# Calculate the sum of squared deviations from the mean (variance without the division by n)
def variance(values, mean):
    """Return the sum of squared deviations of ``values`` from ``mean``.

    Note that the total is NOT divided by the number of values, so this is
    the unnormalized form of the variance.

    Args:
        values: iterable of numbers.
        mean: the value the deviations are measured from.

    Returns:
        The summed squared deviations (``0`` for an empty input).
    """
    squared_deviations = ((value - mean) ** 2 for value in values)
    return sum(squared_deviations)

# Uses x, y and their means m_x, m_y computed in the previous cell.
v_x, v_y = variance(x, m_x), variance(y, m_y)
print(v_x, v_y)

10.0 8.8


In [8]:
# calculate covariance between x and y
def covariance(x, mean_x, y, mean_y):
    """Return the sum of products of paired deviations of ``x`` and ``y``.

    For each index of ``x`` the deviation of ``x`` from ``mean_x`` is
    multiplied by the deviation of ``y`` from ``mean_y``, and the products
    are added up. The total is not divided by the number of pairs.

    Args:
        x: sequence of numbers; its length decides how many pairs are used.
        mean_x: value the ``x`` deviations are measured from.
        y: sequence of numbers indexed in step with ``x``.
        mean_y: value the ``y`` deviations are measured from.

    Returns:
        The accumulated total as a float (``0.0`` for empty ``x``).
    """
    total = 0.0
    for pos in range(len(x)):
        dev_x = x[pos] - mean_x
        dev_y = y[pos] - mean_y
        total += dev_x * dev_y
    return total

# Uses x, y and their means m_x, m_y computed in an earlier cell.
covar = covariance(x, m_x, y, m_y)
print(covar)


8.0


In [9]:
# Calculate root mean squared error
def rmse_metric(actual, predicted):
    """Return the root mean squared error between two lists of numbers.

    Each (predicted, actual) pair is printed as it is processed.

    Args:
        actual: list of true values; its length decides how many pairs are
            compared.
        predicted: list of predicted values indexed in step with ``actual``.

    Returns:
        The square root of the mean of the squared differences.

    Raises:
        ZeroDivisionError: if ``actual`` is empty.
    """
    count = len(actual)
    squared_total = 0.0
    for pos in range(count):
        guess, truth = predicted[pos], actual[pos]
        print(guess, truth)
        squared_total += (guess - truth) ** 2
    return sqrt(squared_total / float(count))


In [10]:
# calculate coefficients
def coefficients(dataset):
    """Estimate the intercept and slope of a line fitted to ``dataset``.

    The first element of each row is treated as the input and the second
    as the output. The slope is the ``covariance`` of inputs and outputs
    divided by the ``variance`` of the inputs; the intercept follows from
    the two means.

    Args:
        dataset: list of rows with at least two numeric elements.

    Returns:
        ``[b0, b1]`` where ``b0`` is the intercept and ``b1`` the slope.
    """
    inputs = [row[0] for row in dataset]
    targets = [row[1] for row in dataset]
    input_mean = mean(inputs)
    target_mean = mean(targets)
    spread = covariance(inputs, input_mean, targets, target_mean)
    slope = spread / variance(inputs, input_mean)
    intercept = target_mean - slope * input_mean
    return [intercept, slope]
# Fit the line to the contrived [x, y] dataset defined in an earlier cell.
b0, b1 = coefficients(dataset)
print(b0, b1)

0.39999999999999947 0.8


In [11]:
# make a prediction
def simple_linear_regression(train, test):
    """Fit a line on ``train`` and predict an output for each ``test`` row.

    The intercept and slope come from ``coefficients(train)``. For every
    test row the prediction is ``b0 + b1 * row[0]``; each prediction is
    printed with three decimals as it is produced.

    Args:
        train: rows used to estimate the coefficients.
        test: rows whose first element is the input value.

    Returns:
        A list with one prediction per test row.
    """
    intercept, slope = coefficients(train)

    def predict_one(row):
        yhat = intercept + slope * row[0]
        print('yhat %.3f' %yhat)
        return yhat

    return [predict_one(row) for row in test]

# Same values as the [x, y] dataset defined in an earlier cell.
dataset = [[1,1], [2,3],[4,3],[3,2],[5,5]]
# evaluate_algorithm fits and scores on these same rows.
rmse = evaluate_algorithm(dataset, simple_linear_regression)
# Show the RMSE rounded to three decimals, then at full precision.
print('%.3f' % rmse)
print(rmse)


yhat 1.200
yhat 2.000
yhat 3.600
yhat 2.800
yhat 4.400
[1.1999999999999995, 1.9999999999999996, 3.5999999999999996, 2.8, 4.3999999999999995]
1.1999999999999995 1
1.9999999999999996 3
3.5999999999999996 3
2.8 2
4.3999999999999995 5
0.693
0.692820323027551
