# Simple Linear Regression from Scratch
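A step-by-step implementation of linear regression using the normal equation, without relying on scikit-learn's `LinearRegression` (scikit-learn is used only for the train/test split and the R² score, and NumPy only for the matrix inverse).  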
This project includes:
- Matrix operations (transpose, multiplication)
- Normal equation calculation
- Model prediction
- Error evaluation with RMSE and R²
- Train-test split evaluation


## Setup and Imports
Import necessary libraries and modules used in the notebook.


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import math


In [1]:
# Finding transpose of the independent variable X.
def transpose_of_matrix(x):
    """Return the transpose of a matrix stored as a list of rows.

    The column count is taken from the first row, so ``x`` must contain at
    least one row. Output row ``c`` collects element ``c`` of every input row.
    """
    row_count, col_count = len(x), len(x[0])
    return [[x[r][c] for r in range(row_count)] for c in range(col_count)]

# Finding product of two matrices.
def matix_product(x, y):
    """Return the matrix product of ``x`` and ``y``.

    Both operands are indexable sequences of rows. Their dimensions are read
    from the number of rows and the length of the first row, so each operand
    must contain at least one row.

    Returns:
        A list of lists with ``len(x)`` rows and ``len(y[0])`` columns.

    Raises:
        ValueError: If the column count of ``x`` differs from the row count
            of ``y``.
    """
    # Dimensions of the left operand.
    rows_x = len(x)
    cols_x = len(x[0])

    # Dimensions of the right operand.
    rows_y = len(y)
    cols_y = len(y[0])

    # The inner dimensions must agree: columns of x against rows of y.
    if cols_x != rows_y:
        raise ValueError("Columns of 1st matrix must be equal to rows of 2nd matrix for multiplication.")

    # Start from zeros and accumulate each dot product in place.
    product_matrix = [[0 for _ in range(cols_y)] for _ in range(rows_x)]

    for i in range(rows_x):
        for j in range(cols_y):
            for k in range(cols_x):
                product_matrix[i][j] += x[i][k] * y[k][j]

    return product_matrix


## Adds an intercept term (a column of 1's) to the feature matrix X.
 This allows the linear regression model to learn the intercept (bias) coefficient.

 For example, if X is:
   [[x11],
    [x21],
    [x31]]

 After adding intercept, it becomes:
   [[1, x11],
    [1, x21],
    [1, x31]]

 This transforms the feature matrix to include the bias term β0 in the model:
   y = β0*1 + β1*x1

In [20]:
def add_intercept(X):
    """Return a copy of ``X`` with a leading 1 prepended to every row.

    Each row may be any iterable (list, tuple, array row, ...). Unpacking is
    used instead of ``[1] + row`` because list concatenation fails for tuples
    and is not a prepend for array-like rows that overload ``+``.

    Returns:
        A new list of lists; ``X`` itself is not modified.
    """
    return [[1, *row] for row in X]


## Normal Equation
Calculate the beta coefficients using the normal equation: β = (XᵀX)^(-1)Xᵀy


In [None]:
import numpy as np

def normal_equation(X, y):
    """Solve for the regression coefficients with the normal equation.

    Evaluates β = (X^T X)^(-1) X^T y using the hand-written transpose and
    product helpers; only the inversion of the Gram matrix is delegated to
    ``np.linalg.inv``.

    ``y`` may be a flat list of ``int``/``float`` values (it is promoted to a
    column vector) or an already column-shaped list of rows.

    Returns:
        The coefficients as a list of rows, one row per column of ``X``.
    """
    # Promote a flat target list to a column vector; decided by the first item.
    targets = [[value] for value in y] if isinstance(y[0], (int, float)) else y

    design_T = transpose_of_matrix(X)

    # (X^T X)^(-1): the Gram matrix is built by hand and inverted with NumPy.
    gram_inverse = np.linalg.inv(matix_product(design_T, X))

    # (X^T X)^(-1) (X^T y)
    return matix_product(gram_inverse, matix_product(design_T, targets))

## Prediction Function
Using the beta coefficients to predict target values for input features.


In [13]:
def _is_coefficient_vector(x, y):
    """Return True when ``y`` is shaped like a beta column vector for ``x``."""
    if len(x) == 0 or len(y) == 0:
        return False
    is_column = all(
        isinstance(entry, (list, tuple)) and len(entry) == 1 for entry in y
    )
    # One coefficient per feature column. When the sample count equals the
    # feature count the shape is ambiguous, so it is left to the fitting path.
    return is_column and len(y) == len(x[0]) and len(y) != len(x)


def predict(x, y):
    """Predict a target value for every row of ``x``.

    The second argument is interpreted in one of two ways:

    * Coefficients: a column vector ``[[b0], [b1], ...]`` with exactly one
      single-element row per column of ``x`` and a row count different from
      the number of samples. The coefficients are applied directly, which is
      what is needed to score held-out data with an already fitted model.
      (A target column of this shape could never be fitted, because its row
      count does not match ``x``.)
    * Targets: any other value is treated as the targets belonging to ``x``;
      the coefficients are first computed with ``normal_equation(x, y)`` and
      then applied to the same rows.

    Returns:
        A flat list with one prediction per row of ``x``.
    """
    if _is_coefficient_vector(x, y):
        beta = y
    else:
        beta = normal_equation(x, y)

    # Flatten the column vector once instead of once per sample.
    coefficients = [b[0] for b in beta]

    predictions = []
    for row in x:
        # Dot product of the sample's features with the coefficients.
        pred = 0
        for feature, coef in zip(row, coefficients):
            pred += feature * coef
        predictions.append(pred)

    return predictions

## Error Metrics
Calculate Root Mean Squared Error (RMSE) and R-squared (R²) to evaluate model performance.


In [14]:
import math

def error_rmse(prediction, y):
    """Return the root mean squared error between predictions and targets.

    Args:
        prediction: Sequence of predicted values.
        y: Sequence of actual values, aligned element-wise with
            ``prediction``.

    Raises:
        ValueError: If the two sequences differ in length, or are empty
            (the mean of zero squared errors is undefined).
    """
    sample_count = len(y)
    if len(prediction) != sample_count:
        raise ValueError("prediction and y must have the same length.")
    if sample_count == 0:
        raise ValueError("RMSE is undefined for empty input.")

    # Sum of squared errors.
    sse = 0
    for actual, predicted in zip(y, prediction):
        sse += (actual - predicted) ** 2

    # Mean squared error, then its square root.
    return math.sqrt(sse / sample_count)


## Data Loading and Preprocessing
Load the dataset and prepare feature and target variables.


In [15]:
# Read the salary data from the CSV file in the working directory.
dataframe = pd.read_csv('Salary_dataset.csv')

# Features: the 'YearsExperience' column as a list of single-element rows.
x = dataframe[['YearsExperience']].to_numpy().tolist()
# Targets: the 'Salary' column as a flat list.
y = dataframe['Salary'].to_numpy().tolist()



## Train-Test Split
Split data into training and testing sets for model validation.


In [16]:
# Hold out 20% of the samples for testing; random_state is fixed at 42.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)



## Model Training
Calculate beta coefficients using the training data.


In [21]:
# Fit on the training split only: prepend the bias column, then compute beta.
x_train_intercept = add_intercept(x_train)
beta = normal_equation(X=x_train_intercept, y=y_train)



## Prediction and Evaluation
Predict on test data and evaluate model performance using RMSE and R².


In [22]:
# Give the held-out features the same bias column used for training.
x_test_intercept = add_intercept(x_test)
prediction = predict(x_test_intercept, beta)

# Score the held-out predictions against the true targets.
rmse = error_rmse(prediction, y_test)
r2 = r2_score(y_test, prediction)

print("R² score on test set:", r2)
print("RMSE on test set:", rmse)



R² score on test set: 0.9024461774180498
RMSE on test set: 7059.043621901507


## Conclusion
- Summarize results and model performance.
- Discuss potential improvements or next steps, such as:
  - Adding more features
  - Implementing gradient descent
  - Comparing with scikit-learn’s LinearRegression


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np
import math

# -------- Matrix Operations --------
def transpose_of_matrix(x):
    """Swap the rows and columns of a matrix given as a list of rows.

    The width is read from the first row, so ``x`` needs at least one row.
    """
    height = len(x)
    width = len(x[0])
    flipped = []
    for col in range(width):
        # Column ``col`` of the input becomes row ``col`` of the output.
        flipped.append([x[row][col] for row in range(height)])
    return flipped

def matrix_product(x, y):
    """Multiply two matrices given as indexable sequences of rows.

    Dimensions are taken from the row count and the length of the first row
    of each operand, so both must contain at least one row.

    Returns:
        A list of lists with ``len(x)`` rows and ``len(y[0])`` columns.

    Raises:
        ValueError: If ``x`` has a different number of columns than ``y``
            has rows.
    """
    rows_x, cols_x = len(x), len(x[0])
    rows_y, cols_y = len(y), len(y[0])

    # Inner dimensions must match: columns of x against rows of y.
    if cols_x != rows_y:
        raise ValueError("Columns of 1st matrix must be equal to rows of 2nd matrix for multiplication.")

    product_matrix = []
    for i in range(rows_x):
        product_row = []
        for j in range(cols_y):
            # Dot product of row i of x with column j of y.
            cell = 0
            for k in range(cols_x):
                cell += x[i][k] * y[k][j]
            product_row.append(cell)
        product_matrix.append(product_row)

    return product_matrix

# -------- Add intercept term (bias) --------
def add_intercept(X):
    """Prepend a constant 1 to every row of ``X`` so a bias term can be fitted.

    Rows are unpacked rather than concatenated with ``[1] + row``, so tuples
    and other iterables are accepted and array-like rows that overload ``+``
    are prepended to rather than added to.

    Returns:
        A new list of lists; the input is left unchanged.
    """
    return [[1, *row] for row in X]

# -------- Normal Equation --------
def normal_equation(X, y):
    """Compute β = (X^T X)^(-1) X^T y.

    ``y`` may be a flat list of ``int``/``float`` values, which is promoted
    to a column vector, or a list of rows that is used as given. The Gram
    matrix is inverted with ``np.linalg.inv`` and converted back to nested
    lists so that every product goes through ``matrix_product``.

    Returns:
        The coefficients as a list of rows, one row per column of ``X``.
    """
    # Only the first target is inspected to decide whether y is flat.
    if isinstance(y[0], (int, float)):
        targets = [[scalar] for scalar in y]
    else:
        targets = y

    design_T = transpose_of_matrix(X)

    # (X^T X)^(-1), converted back to plain nested lists.
    gram_inverse = np.linalg.inv(np.array(matrix_product(design_T, X))).tolist()

    # X^T y
    moment = matrix_product(design_T, targets)

    return matrix_product(gram_inverse, moment)

# -------- Prediction --------
def predict(X, beta):
    """Apply a fitted coefficient column vector to every row of ``X``.

    ``beta`` is a list of single-element rows; only the first element of each
    row is used. Features and coefficients are paired positionally and any
    surplus on either side is ignored.

    Returns:
        A flat list with one prediction per row of ``X``.
    """
    weights = [entry[0] for entry in beta]

    def linear_response(sample):
        # Running dot product of one sample with the coefficients.
        total = 0
        for value, weight in zip(sample, weights):
            total += value * weight
        return total

    return [linear_response(sample) for sample in X]

# -------- Error metrics --------
def error_rmse(prediction, y):
    """Compute the root mean squared error of ``prediction`` against ``y``.

    Args:
        prediction: Sequence of predicted values.
        y: Sequence of true values, paired element-wise with ``prediction``.

    Raises:
        ValueError: If the sequences have different lengths, or if they are
            empty (there is no mean to take).
    """
    if len(prediction) != len(y):
        raise ValueError("prediction and y must have the same length.")
    if len(y) == 0:
        raise ValueError("RMSE is undefined for empty input.")

    sse = 0
    for predicted, actual in zip(prediction, y):
        sse += (predicted - actual) ** 2

    mse = sse / len(y)
    return math.sqrt(mse)

# -------- Main workflow --------
if __name__ == "__main__":
    # Load the dataset: one feature column and one target column.
    dataframe = pd.read_csv('Salary_dataset.csv')
    x = dataframe[['YearsExperience']].values.tolist()
    y = dataframe['Salary'].values.tolist()

    # Hold out 20% of the samples; random_state is fixed at 42.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42
    )

    # Fit on the training split (bias column added first).
    x_train_intercept = add_intercept(x_train)
    beta = normal_equation(x_train_intercept, y_train)

    # Predict on the held-out split using the same bias-column transform.
    x_test_intercept = add_intercept(x_test)
    prediction = predict(x_test_intercept, beta)

    # Score the held-out predictions.
    rmse = error_rmse(prediction, y_test)
    r2 = r2_score(y_test, prediction)

    # Row 0 of beta belongs to the bias column; the rest map to features.
    print("Beta coefficients:")
    for i, b in enumerate(beta):
        if i:
            print(f"Coefficient for feature {i}: {b[0]}")
        else:
            print(f"Intercept: {b[0]}")

    print(f"\nR² score on test set: {r2:.4f}")
    print(f"RMSE on test set: {rmse:.4f}")


Beta coefficients:
Intercept: 24380.20147947379
Coefficient for feature 1: 9423.815323030962

R² score on test set: 0.9024
RMSE on test set: 7059.0436
