In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [68]:
# Load the training and test splits from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("Train.csv", "Test.csv")
)

# A look at the data

In [69]:
# Announce the table, then let the notebook render the first five rows
# (the bare last expression is what the cell displays).
print("This is the training data")
train_data.head(n=5)

This is the training data


Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,0,Tata Tiago 1.2 Revotron XZ WO Alloy,Hyderabad,2017,16500,Petrol,Manual,First,23.84 kmpl,1199 CC,84 bhp,5.0,5.0
1,1,Mahindra TUV 300 2015-2019 mHAWK100 T8 AMT,Kochi,2017,47357,Diesel,Automatic,First,18.49 kmpl,1493 CC,100 bhp,7.0,8.37
2,2,Skoda Rapid 2013-2016 1.6 MPI Ambition,Mumbai,2014,35000,Petrol,Manual,First,15.0 kmpl,1598 CC,103.52 bhp,5.0,4.5
3,3,Tata Indica V2 DLS BSII,Jaipur,2007,200000,Diesel,Manual,Second,17.2 kmpl,1396 CC,53.5 bhp,5.0,0.85
4,4,Tata Manza Club Class Quadrajet90 LX,Mumbai,2013,64000,Diesel,Manual,Second,21.02 kmpl,1248 CC,88.76 bhp,5.0,2.65


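## The following function converts a string that starts with a number (for example `23.84 kmpl`) into a floating-point number by parsing its first whitespace-separated token. Values that cannot be parsed become NaN.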

In [None]:
def extract(value):
    """Parse the leading numeric token of a unit-suffixed value.

    Parameters
    ----------
    value : str or number
        Typically a string such as ``"23.84 kmpl"``, ``"1199 CC"`` or
        ``"84 bhp"``. Values that are already numeric are passed through as
        floats, so applying this function to a column a second time does not
        turn every entry into NaN.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the value is missing, empty, or
        its first whitespace-separated token is not numeric.
    """
    # Already numeric (this includes float NaN): nothing to strip, just
    # normalize the type. bool is excluded because it is an int subclass but
    # not a measurement.
    if isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool):
        return float(value)
    try:
        return float(value.split()[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError: value has no .split (e.g. None)
        # IndexError:     empty / whitespace-only string
        # ValueError:     first token is not a number
        # TypeError:      first token is not something float() accepts
        return np.nan

This code applies the `extract` function to the `Mileage`, `Engine`, and `Power` columns, and then removes every row that has a missing value in any column.

In [71]:
# Columns stored as "<number> <unit>" text that must become plain floats.
unit_columns = ['Mileage', 'Engine', 'Power']
parsed_columns = {
    column: train_data[column].apply(extract) for column in unit_columns
}
# Swap in the parsed columns, then drop every row that still has a missing
# value in any column.
train_data = train_data.assign(**parsed_columns).dropna()

This code stores the selected numeric feature columns as the model inputs (`X_train`) and the `Price` column as the target output (`y_train`).

In [72]:
# Feature columns fed to the model; Price is the regression target.
xf = ['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats']
X_train = train_data.loc[:, xf].to_numpy()
y_train = train_data.loc[:, 'Price'].to_numpy()

The following cell rescales each input feature to the range [0, 1] (min-max normalization), using the minimum and maximum observed in the training data.

In [73]:
# Min-max scaling: map every feature column onto [0, 1] using the per-column
# minimum and maximum of the training inputs.
X_train_min = X_train.min(axis=0)
X_train_max = X_train.max(axis=0)
# A constant column has max == min; dividing by that zero range would turn
# the whole column into NaN (0/0) and poison training. Use a divisor of 1
# for such columns so they simply become 0.
X_train_range = np.where(X_train_max == X_train_min, 1.0, X_train_max - X_train_min)
X_train = (X_train - X_train_min) / X_train_range
# X_test = (X_test - X_train_min) / X_train_range # rescaled using the same values as the training data for consistency

This code initializes values for a linear regression model

In [74]:
# Fixing the global NumPy seed makes the random initial weights identical on
# every run, so results are reproducible.
np.random.seed(0)
n_features = X_train.shape[1]
weights = np.random.rand(n_features)  # one coefficient per input feature; it scales that feature's contribution to the output
bias = 0.0  # intercept term; updated during training
learning_rate = 0.01  # step size: how far the weights move on each update
num_iterations = 10000  # number of gradient-descent updates to run
m = y_train.shape[0]  # number of training examples (one per entry of y_train)

Gradient Descent code to train the linear regression model

In [75]:
# Batch gradient descent on the mean-squared-error loss.
grad_scale = 2 / m  # constant factor from differentiating the mean of squared errors
for step in range(num_iterations):
    # Forward pass: current predictions and their signed errors.
    y_pred = X_train.dot(weights) + bias
    error = y_pred - y_train

    # Gradients of the MSE with respect to the weights and the bias.
    weights_gradient = grad_scale * X_train.T.dot(error)
    bias_gradient = grad_scale * error.sum()

    # Move both parameters a small step against their gradients.
    weights -= learning_rate * weights_gradient
    bias -= learning_rate * bias_gradient

# Predictions of the fitted model on the training inputs.
y_train_pred = X_train.dot(weights) + bias

If the actual or predicted outputs contain any NaN values, this cell replaces them with the mean of the remaining (non-NaN) values.

In [76]:
# Only touch the arrays when a NaN is actually present in either of them.
# The guard matters: nan_to_num also replaces infinities with large finite
# numbers, so calling it unconditionally would not be a no-op.
nan_in_outputs = np.isnan(y_train).any() or np.isnan(y_train_pred).any()
if nan_in_outputs:
    # Each array is filled with its own mean computed over non-NaN entries.
    target_fill = np.nanmean(y_train)
    prediction_fill = np.nanmean(y_train_pred)
    y_train = np.nan_to_num(y_train, nan=target_fill)
    y_train_pred = np.nan_to_num(y_train_pred, nan=prediction_fill)

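Calculates three standard regression evaluation metrics on the training data: mean absolute error (MAE), mean squared error (MSE), and root mean squared error (RMSE).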

In [None]:
# Signed difference between actual and predicted prices, shared by all metrics.
train_diff = y_train - y_train_pred
mae = np.abs(train_diff).mean()   # average absolute error
mse = (train_diff ** 2).mean()    # average squared error
rmse = np.sqrt(mse)               # square root of the MSE

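Calculates the R-squared value, $R^2 = 1 - \frac{SS_{res}}{SS_{tot}}$, which measures the fraction of the variance in the actual prices that the linear regression model explains.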

In [78]:
# R^2 = 1 - SS_res / SS_tot
y_mean = y_train.mean()
total = ((y_train - y_mean) ** 2).sum()           # SS_tot: spread of the targets around their mean
residual = ((y_train - y_train_pred) ** 2).sum()  # SS_res: squared error left after the fit
r2 = 1 - residual / total

In [79]:
# Report every training-set metric as "<label> <value>", one per line.
metric_report = (
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("R-squared (R²):", r2),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

Mean Absolute Error (MAE): 3.98800048268129
Mean Squared Error (MSE): 42.927333871838535
Root Mean Squared Error (RMSE): 6.551895441155829
R-squared (R²): 0.6623040709526522
