Load and Inspect the Dataset

In [4]:
import pandas as pd
import numpy as np

# Load dataset
data = pd.read_csv("multiple_linear_regression_dataset.csv")

print(data.head())
print(data.columns)
print(data.shape)


   age  experience  income
0   25           1   30450
1   30           3   35670
2   47           2   31580
3   32           5   40130
4   43          10   47830
Index(['age', 'experience', 'income'], dtype='object')
(20, 3)


In [None]:
# Inputs: Age, Experience
# Output: Income
# Number of features = 2

 Separate Inputs and Output

In [3]:
# Inputs (features)
X = data[["age","experience"]]

# Output (target)
y = data["income"]

In [12]:
#shape of x and y
print(X.shape)
print(y.shape)

(20, 2)
(20,)


 Initialize the Model Parameters

In [5]:
n_features = X.shape[1]
w = np.zeros(n_features)
b = 0.0

In [None]:
# Why need one weight per feature?-Each feature affects the output differently, so each needs its own weight.
# Why is bias separate from weights?- Bias shifts the prediction and allows output even when inputs are zero.
# Would initializing with large values be risky?- Yes, it can cause unstable learning and divergence.

Forward Pass (Prediction)

In [6]:
def predict(X, w, b):
    y_hat = X.dot(w) + b
    return y_hat

In [None]:
# Why is there no activation function?- Regression needs continuous outputs, so no activation is required.
# What kind of values can yÌ‚ take?- any real value.
# How is this different from logistic regression?- Logistic regression outputs probabilities, while linear regression outputs real values.

 Loss Function (MSE)

In [7]:
def mean_squared_error(y, y_hat):
    loss = ((y_hat - y) ** 2).mean()
    return loss

In [None]:
# Why square the error?- To penalize large errors and keep the loss differentiable.
# What happens if one prediction is very wrong?- Its loss becomes very large, strongly affecting training.
# Why not just take absolute error?- Because it is not differentiable at zero.

Compute Gradients

In [8]:
def compute_gradients(X, y, y_hat):
    N = len(y)

    dw = (2 / N) * X.T.dot(y_hat - y)
    db = (2 / N) * (y_hat - y).sum()

    return dw, db


In [None]:
# Why does X appear in dw but not in db?- Weights depend on inputs; bias does not.
# Why does the error term appear everywhere?- Because learning aims to reduce prediction error.
# What happens if error is zero?- Gradients become zero and learning stops.

Update Parameters (Gradient Descent)

In [9]:
def update_parameters(w, b, dw, db, lr):
    w = w - lr * dw
    b = b - lr * db
    return w, b

In [None]:
# Does loss decrease over time?- Yes, if learning is correct.
# What happens if loss increases?- Learning rate or gradients are wrong.
# How do learning rate and epochs interact?- Lower LR needs more epochs; higher LR is faster but unstable.

 Training Loop

In [10]:
lr = 0.0001
epochs = 1000

for epoch in range(epochs):
    y_hat = predict(X, w, b)
    loss = mean_squared_error(y, y_hat)
    dw, db = compute_gradients(X, y, y_hat)
    w, b = update_parameters(w, b, dw, db, lr)

    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss}")


Epoch 0, Loss: 1727049635.0
Epoch 100, Loss: 66491868.55311352
Epoch 200, Loss: 61752567.201190114
Epoch 300, Loss: 58616531.07847049
Epoch 400, Loss: 56528801.53951118
Epoch 500, Loss: 55126542.02946697
Epoch 600, Loss: 54172526.94885703
Epoch 700, Loss: 53511656.14292054
Epoch 800, Loss: 53042523.72795741
Epoch 900, Loss: 52698829.56325033


Final Evaluation

In [11]:
print("Final Weights:", w)
print("Final Bias:", b)

# Predict for new candidate
new_candidate = np.array([4.5, 68])
predicted_salary = new_candidate.dot(w) + b
print("Predicted Salary:", predicted_salary)


Final Weights: age            764.754059
experience    1371.034304
dtype: float64
Final Bias: 321.73641174472493
Predicted Salary: 96993.4623777421


In [None]:
# Is the prediction reasonable?- Yes, it matches the data trend.
# Does it interpolate smoothly?- Yes, predictions are continuous.
# Why is this better than threshold rules?- It learns from data and generalizes better.