In [59]:
# Importing the necessary libraries

import pandas as pd 
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [60]:
cars=pd.read_csv("C:\\Users\\HARSH\\OneDrive\\Desktop\\Biz_Machine_Learning\\cars.csv")
cars

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
6,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [61]:
# Cleaning data

In [62]:
cars.isnull().sum()

Unnamed: 0    0
mpg           0
cyl           0
disp          0
hp            0
drat          0
wt            0
qsec          0
vs            0
am            0
gear          0
carb          0
dtype: int64

In [63]:
cars.duplicated().sum()

0

In [64]:
# Detecting outliers using IQR

numeric_features = ['disp', 'hp', 'drat', 'wt', 'qsec', 'mpg']

def detect_outliers_iqr(data, feature):
    Q1 = data[feature].quantile(0.25)
    Q3 = data[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outliers = data[(data[feature] < lower) | (data[feature] > upper)]
    return outliers

for feature in numeric_features:
    outliers = detect_outliers_iqr(cars, feature)
    print(f"{feature}: {len(outliers)} outlier(s)")

disp: 0 outlier(s)
hp: 1 outlier(s)
drat: 0 outlier(s)
wt: 3 outlier(s)
qsec: 1 outlier(s)
mpg: 1 outlier(s)


In [65]:
# Removing outliers

def remove_outliers_iqr(data, features):
    cleaned_data = data.copy()
    for feature in features:
        Q1 = cleaned_data[feature].quantile(0.25)
        Q3 = cleaned_data[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        cleaned_data = cleaned_data[(cleaned_data[feature] >= lower) & (cleaned_data[feature] <= upper)]
    return cleaned_data

# Define numeric features
numeric_features = ['disp', 'hp', 'drat', 'wt', 'qsec', 'mpg']

# Remove outliers from the DataFrame
cars_cleaned = remove_outliers_iqr(cars, numeric_features)

# Check result
print("Original shape:", cars.shape)
print("Cleaned shape:", cars_cleaned.shape)

Original shape: (32, 12)
Cleaned shape: (27, 12)


In [66]:
# Training and testing the data 

from sklearn.model_selection import train_test_split

X = cars_cleaned.drop(columns=['Unnamed: 0', 'mpg'])  # Features
y = cars_cleaned['mpg']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=35)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

Training set size: (20, 10)
Test set size: (7, 10)


In [67]:
# Using the standardScaler for scaling

from sklearn.preprocessing import StandardScaler

features_to_scale = ['disp', 'hp', 'drat', 'wt', 'qsec']

scaler = StandardScaler()
X_scaled = X.copy()
X_scaled[features_to_scale] = scaler.fit_transform(X[features_to_scale])
X_scaled[features_to_scale]

Unnamed: 0,disp,hp,drat,wt,qsec
0,-0.545591,-0.509393,0.55685,-0.586999,-0.860969
1,-0.545591,-0.509393,0.55685,-0.273414,-0.479214
2,-1.009783,-0.807621,0.46427,-0.955924,0.604698
3,0.329232,-0.509393,-0.961458,0.1447,1.170514
4,1.239763,0.630893,-0.831846,0.421393,-0.479214
5,0.034649,-0.597107,-1.553968,0.445988,1.702244
6,1.239763,1.858894,-0.72075,0.581261,-1.283627
7,-0.664317,-1.35145,0.168015,0.113957,1.552269
9,-0.477747,-0.281336,0.593882,0.421393,0.393369
10,-0.477747,-0.281336,0.593882,0.421393,0.802393


In [68]:
# Using Linear Regression model

model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Model Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
print("Root Mean Squared Error (RMSE):", rmse)
print("R² Score:", r2)

Model Coefficients: [ 3.04894616e-02  2.74150083e-02 -3.49156682e-02  1.23823244e-01
 -6.03440908e+00  1.38857677e+00  4.16393249e-03  6.65015273e-01
  1.77303585e+00  1.56361527e-01]
Intercept: 5.189569300514162
Root Mean Squared Error (RMSE): 1.8941470116928787
R² Score: 0.7032579633492875


In [71]:
# Using k-fold cross validation

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

X = cars_cleaned.drop(columns=["Unnamed: 0", "mpg"])  # Drop car names and target
y = cars_cleaned["mpg"]

# Converting to NumPy arrays
X = X.values
y = y.values

kf = KFold(n_splits=5, shuffle=True, random_state=42)

rmse_scores = []
r2_scores = []

# Performing K-Fold cross validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    rmse_scores.append(rmse)
    r2_scores.append(r2)

print("Average RMSE:", np.mean(rmse_scores))
print("Average R² Score:", np.mean(r2_scores))

Average RMSE: 3.480458479794155
Average R² Score: 0.2624866667289248
