In [3]:
# Refrence : https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/

In [4]:
# importing required library
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math

In [5]:
# Calculate the mean value of the list of numbers
def mean(values):
    return sum(values) / float(len(values))

# Calculate the variance of a list of numbers
def variance(values, mean):
    return sum([(x-mean)**2 for x in values])

# calculate mean and variance
data = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]

X = [row[0] for row in data]
y = [row[1] for row in data]

mean_x, mean_y = mean(X), mean(y)
var_x, var_y = variance(X, mean_x), variance(y, mean_y)

print('x stats: mean=%.3f variance=%.3f' % (mean_x, var_x))
print('y stats: mean=%.3f variance=%.3f' % (mean_y, var_y))

x stats: mean=3.000 variance=10.000
y stats: mean=2.800 variance=8.800


In [6]:
# Calculate covariance between x and y
def covariance(x, mean_x, y, mean_y):
    covar = 0.0
    for i in range(len(x)):
        covar += (x[i] - mean_x) * (y[i] - mean_y)
    return covar

In [7]:

covar = covariance(X, mean_x, y, mean_y)
print('Covariance: %.3f' % (covar))

Covariance: 8.000


In [8]:
 
# Calculate coefficients
def coefficients(dataset):
    x = [row[0] for row in dataset]
    y = [row[1] for row in dataset]
    x_mean, y_mean = mean(x), mean(y)
    b1 = covariance(x, x_mean, y, y_mean) / variance(x, x_mean)
    b0 = y_mean - b1 * x_mean
    return [b0, b1]
 
# calculate coefficients
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
b0, b1 = coefficients(dataset)
print('Coefficients: B0=%.3f, B1=%.3f' % (b0, b1))

Coefficients: B0=0.400, B1=0.800


In [9]:
# Make prediction
def simple_linear_regression(train, test):
    predictions = list()
    b0, b1 = coefficients(train)
    for row in test:
        y_hat = b0 + b1 * row[0]
        predictions.append(y_hat)
    return predictions

In [10]:
# Calculate root mean squared error
def rmse_metric(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        prediction_error = predicted[i] - actual[i]
        sum_error += pow(prediction_error, 2)
        mean_error = sum_error / float(len(actual))
    return np.sqrt(mean_error)

In [11]:
# Evaluate regression algorithm on training dataset
def evaluate_algorithm(dataset, algorithm):
	test_set = list()
	for row in dataset:
		row_copy = list(row)
		row_copy[-1] = None
		test_set.append(row_copy)
	predicted = algorithm(dataset, test_set)
	print(predicted)
	actual = [row[-1] for row in dataset]
	rmse = rmse_metric(actual, predicted)
	return rmse

In [12]:
# Test simple linear regression
rmse = evaluate_algorithm(data, simple_linear_regression)
print('RMSE: %.3f' % (rmse))

[1.1999999999999995, 1.9999999999999996, 3.5999999999999996, 2.8, 4.3999999999999995]
RMSE: 0.693


In [13]:
class SimpleLinearRegression:
    def __init__(self):
        self.b0 = None
        self.b1 = None
    
    #Calculating the mean of data
    def mean(self, data):
        return sum(data)/float(len(data))
    
    # Calculating the variance of data
    def variance(self, data, mean):
        return sum([(x-mean)**2 for x in data])
    
    # Calculating co-variance of data
    def covariance(self, x, x_mean, y, y_mean):
        cov = 0
        for i in range(len(x)):
            cov += (x[i] - x_mean) * (y[i] - y_mean)
        return cov
    
    def _intercepts(self):
        return self.b0
    
    def _coefficeint(self):
        return self.b1
    
    def fit(self, dataset):
        X = [row[0] for row in dataset]
        y = [row[1] for row in dataset]
        x_mean, y_mean = self.mean(X), self.mean(y)
        self.b1 = self.covariance(X, x_mean, y, y_mean) / self.variance(X, x_mean)
        self.b0 = self.mean(y) - b1*self.mean(X)
    
    def predict(self, test_data):
        predected_data = []
        for row in test_data:
            predected_value = self.b0 + self.b1*row[0]
            predected_data.append(predected_value)
        return predected_data

In [14]:
# calculate mean and variance
data = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]

X = [row[0] for row in data]
y = [row[1] for row in data]

regression = SimpleLinearRegression()

regression.fit(data)
y_pred = regression.predict(data)
y_pred

[1.1999999999999995,
 1.9999999999999996,
 3.5999999999999996,
 2.8,
 4.3999999999999995]

In [15]:
regression._intercepts(), regression._coefficeint()

(0.39999999999999947, 0.8)

In [16]:
dataset = pd.read_csv("insurance.csv")
dataset.head()

Unnamed: 0,Claims,Amount
0,19,46.2
1,13,15.7
2,40,119.4
3,57,170.9
4,23,56.9


In [21]:
# Implementing mean absolute error
def mean_absolute_error(actual_data, predicted_data):
    total_absolute_error = np.sum(np.abs(actual_data - predicted_data))
    error_mean_value = total_absolute_error / len(actual_data)
    return error_mean_value

In [22]:
mae = mean_absolute_error(np.asarray(y), np.asarray(y_pred))
mae

0.64

In [26]:
# Implementing mean absolute percentage error
def mean_absolute_percentage_error(actual, predicted):
    abs_error = (np.abs(actual - predicted)) / actual
    sum_abs_error = np.sum(abs_error)
    mape_loss = (sum_abs_error / actual.size) * 100
    return mape_loss

In [29]:
mape = mean_absolute_percentage_error(np.asarray(y), np.asarray(y_pred))
mape

25.06666666666666

In [34]:
# Implementing mean squared error
def mean_squared_error(actual, predicted):
    squared_error = np.square(actual - predicted) 
    sum_squared_error = np.sum(squared_error)
    mse_loss = sum_squared_error / actual.size
    return mse_loss

In [35]:
mse = mean_squared_error(np.asarray(y), np.asarray(y_pred))
mse

0.48000000000000015

In [36]:
def r_squared(y_true, y_pred):
    """
    Returns the R-squared value of the predicted and true target variables.
    
    Parameters:
    y_true: numpy array of true target variables.
    y_pred: numpy array of predicted target variables.
    
    Returns:
    R-squared value as a float.
    """
    ss_res = np.sum(np.square(y_true - y_pred))
    ss_tot = np.sum(np.square(y_true - np.mean(y_true)))
    r_squared = 1 - (ss_res / ss_tot)
    
    return r_squared



In [39]:
r2 = r_squared(np.asarray(y), np.asarray(y_pred))
r2

0.7272727272727272

In [40]:

def adj_r_squared(y_true, y_pred, n_features):
    """
    Returns the adjusted R-squared value of the predicted and true target variables.
    
    Parameters:
    y_true: numpy array of true target variables.
    y_pred: numpy array of predicted target variables.
    n_features: int, number of features used in the model.
    
    Returns:
    Adjusted R-squared value as a float.
    """
    r_squared_value = r_squared(y_true, y_pred)
    n = len(y_true)
    adj_r_squared = 1 - ((1 - r_squared_value) * ((n - 1) / (n - n_features - 1)))
    
    return adj_r_squared

In [41]:
adj_r2 = adj_r_squared(np.asarray(y), np.asarray(y_pred), 1)
adj_r2

0.6363636363636362

In [63]:
import random
def train_test_split(X, y, test_size=0.2, random_state=None):
    """
    Splits the data into training and testing sets.
    
    Parameters:
    X: numpy array or pandas dataframe of features.
    y: numpy array or pandas dataframe of target variable.
    test_size: float, represents the proportion of data to be used for testing.
    random_state: int or None, represents the seed value for random number generation.
    
    Returns:
    X_train: numpy array or pandas dataframe of training features.
    X_test: numpy array or pandas dataframe of testing features.
    y_train: numpy array or pandas dataframe of training target variable.
    y_test: numpy array or pandas dataframe of testing target variable.
    """
    
    
    return X_train, X_test, y_train, y_test

In [64]:
X_train, X_test, y_train, y_test = train_test_split(np.asarray(dataset["Claims"]), np.asarray(dataset["Amount"]), 
                                                    test_size=0.2, random_state=42)

In [65]:
X_train.shape, X_test.shape

((60,), ())