In [None]:
import kagglehub

# Fetch the latest version of the Kaggle salary dataset. The returned value is
# the local directory holding the downloaded files (it is listed in the next cell).
path = kagglehub.dataset_download(
    "abhishek14398/salary-dataset-simple-linear-regression"
)

# Show where the files ended up so the location can be checked by hand.
print("Path to dataset files:", path)

In [None]:
import os
# List everything found in the dataset download directory.
files = os.listdir(path)
print("Files in folder:", files)

# os.listdir returns names in arbitrary order, so sort the CSV candidates to make
# the selection reproducible, and fail with a descriptive error (instead of a
# bare IndexError) when the directory contains no CSV file at all.
csv_candidates = sorted(f for f in files if f.endswith(".csv"))
if not csv_candidates:
    raise FileNotFoundError(
        f"No .csv file found in {path!r}; directory contents: {files}"
    )

csv_file = csv_candidates[0]  # Get the CSV file
csv_path = os.path.join(path, csv_file)


In [None]:
import pandas as pd
# Read the CSV located in the previous cell into a DataFrame; later cells use
# its 'YearsExperience' and 'Salary' columns.
data = pd.read_csv(csv_path)

# Plain-text preview of the first five rows as a quick sanity check.
print(data.head(n=5))

In [None]:
import matplotlib.pyplot as plt

def loss_function(m, b, data):
    """Return the mean squared error of the line ``y = m * x + b`` on ``data``.

    Parameters
    ----------
    m : float
        Slope of the candidate regression line.
    b : float
        Y-intercept of the candidate regression line.
    data : pandas.DataFrame
        Table with a ``'YearsExperience'`` column (inputs) and a ``'Salary'``
        column (targets).

    Returns
    -------
    float
        Mean of the squared residuals ``(y - (m * x + b)) ** 2`` over all rows.

    Raises
    ------
    ValueError
        If ``data`` has no rows, because the mean would be undefined.
    """
    n = len(data)
    if n == 0:
        raise ValueError("data must contain at least one row to compute a loss")

    # Operate on whole columns at once rather than looking rows up one by one;
    # the arithmetic is the same, only vectorized.
    x = data['YearsExperience'].to_numpy(dtype=float)
    y = data['Salary'].to_numpy(dtype=float)

    residuals = y - (m * x + b)  # actual minus predicted, one entry per row
    return (residuals ** 2).sum() / n  # Mean Squared Error

def gradient_descent(data, learning_rate = 0.01, epochs = 1000):
    """Fit ``y = m * x + b`` to ``data`` with batch gradient descent on the MSE.

    Starting from ``m = 0`` and ``b = 0``, every epoch computes the gradient of
    the mean squared error over all rows and moves both parameters one step
    against it. On every 10th epoch the loss is printed and recorded, and the
    recorded losses are plotted once the loop has finished.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a ``'YearsExperience'`` column (inputs) and a ``'Salary'``
        column (targets).
    learning_rate : float, default 0.01
        Step size applied to each gradient.
    epochs : int, default 1000
        Index of the last epoch. The loop covers epochs ``0..epochs``
        inclusive, i.e. it performs ``epochs + 1`` parameter updates; as a
        consequence the last epoch is logged whenever ``epochs`` is a
        multiple of 10.

    Returns
    -------
    tuple
        ``(m, b)``: the fitted slope and y-intercept.

    Raises
    ------
    ValueError
        If ``data`` has no rows.
    """
    n = len(data)
    if n == 0:
        raise ValueError("data must contain at least one row to run gradient descent")

    # Extract the two columns once as arrays so each epoch is a handful of
    # vectorized operations rather than per-row DataFrame lookups.
    x = data['YearsExperience'].to_numpy(dtype=float)
    y = data['Salary'].to_numpy(dtype=float)

    m = 0 # slope
    b = 0  # y-intercept
    losses = []

    for epoch in range(epochs+1):
        residuals = y - (m * x + b)  # actual minus predicted for every row

        # Gradients of the MSE loss:
        #   dL/dm = (-2/n) * sum(x * (y - y_pred))
        #   dL/db = (-2/n) * sum(y - y_pred)
        m_grad = (-2/n) * (x * residuals).sum()
        b_grad = (-2/n) * residuals.sum()

        # Update parameters: subtract because the gradient points towards
        # increasing loss and we want the direction of steepest descent.
        m -= learning_rate * m_grad
        b -= learning_rate * b_grad

        # Log (and keep for the plot) the loss after every 10th update.
        if epoch % 10 == 0:
            current_loss = loss_function(m, b, data)
            losses.append(current_loss)
            print(f"Epoch {epoch}: Loss = {current_loss}, m = {m}, b = {b}")

    # The x axis is the index into ``losses``, i.e. the epoch number divided by 10.
    plt.plot(range(0,len(losses)), losses, color='red')
    # \u00d7 is the multiplication sign; it is written as an escape so the label
    # cannot be garbled if the file is saved or read with the wrong encoding.
    plt.xlabel('Epoch (\u00d710)')
    plt.ylabel('Loss')
    plt.title('Loss over Epochs')
    plt.show()

    return m, b


In [None]:
def linear_regression(data, learning_rate=0.01, epochs=1000):
    """Fit a straight line to the salary data and plot it over the raw points.

    Calls ``gradient_descent`` to obtain the slope and intercept, then draws a
    scatter plot of ``'YearsExperience'`` against ``'Salary'`` with the fitted
    line ``y = m * x + b`` on top.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a ``'YearsExperience'`` column (inputs) and a ``'Salary'``
        column (targets).
    learning_rate : float, default 0.01
        Step size forwarded to ``gradient_descent``.
    epochs : int, default 1000
        Epoch count forwarded to ``gradient_descent``.

    Returns
    -------
    tuple
        ``(m, b)``: the fitted slope and y-intercept, so the model can be
        reused (e.g. for predictions) instead of being discarded after plotting.
    """
    m, b = gradient_descent(data, learning_rate, epochs)

    # Raw observations.
    plt.scatter(data['YearsExperience'], data['Salary'], color='blue')
    plt.xlabel('Years of Experience')
    plt.ylabel('Salary')
    plt.title('Experience vs Salary')

    # Fitted line evaluated at the observed x values.
    x_vals = data['YearsExperience']
    y_vals = m * x_vals + b
    plt.plot(x_vals, y_vals, color='green')
    plt.show()

    return m, b

# Train on the loaded salary data and show the loss curve and the fitted line.
linear_regression(data, epochs=1000, learning_rate=0.01)