In [319]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

import warnings 
# NOTE(review): this silences every warning category for the rest of the
# session, so any warning raised by pandas or scikit-learn in later cells
# is hidden. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [320]:
# Read the raw table from the CSV that sits next to the notebook,
# then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="auto-mpg.csv")
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [321]:
# Clean the frame without hard-coded row positions or in-place mutation, so
# the cell produces the same result no matter how many times it is run.

# 'car name' is dropped before modelling; errors='ignore' keeps a re-run of
# this cell from failing once the column is already gone.
df = df.drop(columns=['car name'], errors='ignore')

# Remove every row that still carries the '?' placeholder.
# NOTE(review): presumably these are the same six rows that were previously
# removed by position (32, 126, 330, 336, 354, 374) - confirm on the raw file.
df = df.loc[~df.isin(['?']).any(axis=1)]

# Any column that was read as text because of the placeholder becomes numeric
# here; a remaining non-numeric value raises instead of reaching the model.
df = df.apply(pd.to_numeric)

In [322]:
# Sanity check: confirm that no '?' placeholder is left anywhere in the frame.
mask = df.isin(['?'])
# nonzero() returns positional (row, column) offsets for every True cell
locations = list(zip(*mask.to_numpy().nonzero()))
print("Locations of '?':", locations)

# Translate the positional offsets into the actual index and column labels.
# After rows have been dropped the position no longer equals the index label,
# so the row has to be looked up through df.index just like the column is
# looked up through df.columns.
for row, col in locations:
    print(f"'?' found at row {df.index[row]}, column {df.columns[col]}")

Locations of '?': []


In [323]:
# Show the (rows, columns) dimensions of the frame at this point.
df.shape

(392, 8)

In [324]:
# Feature matrix: every column except the target.
X = df.drop('mpg', axis='columns')
# Target kept as a single-column DataFrame (two-dimensional).
y = df.loc[:, ['mpg']]
X.shape

(392, 7)

In [325]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Computing the parameters and R² score using Linear Regression

In [326]:
# Least-squares baseline, fitted on the raw (unscaled) training features.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [327]:
# Learned weights first, intercept on the following line.
print(lr.coef_, lr.intercept_, sep='\n')

[[-0.34578883  0.01510871 -0.02130175 -0.00614163  0.03795001  0.76774258
   1.61345707]]
[-18.49936113]


In [328]:
# Predict the target for the held-out rows with the fitted linear model.
# NOTE(review): the name y_pred is assigned again further down for the SGD
# model, so the score cells depend on being run in notebook order.
y_pred = lr.predict(X_test)

In [329]:
# Coefficient of determination of the current predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred)


0.7901500386760352

## Computing the parameters and R² score using Stochastic Gradient Descent

In [330]:
# Standardize the features. The scaler learns its mean and scale from the
# training split only and then applies the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [331]:
# Linear model trained by stochastic gradient descent on the standardized
# features, using a constant learning rate of 0.01.
# random_state pins the estimator's randomness so the coefficients and the
# score reported below are reproducible from run to run.
sgd = SGDRegressor(
    max_iter=1000,
    learning_rate='constant',
    eta0=0.01,
    random_state=42,
)

# y_train is a single-column DataFrame; SGDRegressor expects a 1-D target,
# so flatten it explicitly instead of relying on an implicit conversion.
sgd.fit(X_train_scaled, y_train.to_numpy().ravel())

In [332]:
# Weights in standardized-feature space first, intercept on the following line.
print(sgd.coef_, sgd.intercept_, sep='\n')

[-0.55673277  1.37971155 -0.76384233 -5.19369349 -0.04782318  2.87399098
  1.08704554]
[23.56188479]


In [333]:
# Recover the coefficients on the original (unscaled) feature scale.
# The model was fitted on z = (x - mean) / scale, so
#     w . z + b  =  (w / scale) . x  +  (b - (w / scale) . mean)
# which gives the per-feature weights and the intercept below.
coef_original = sgd.coef_ / scaler.scale_
# np.dot performs the weighted sum in one vectorized call instead of
# iterating over the array with the builtin sum().
intercept_original = sgd.intercept_ - np.dot(coef_original, scaler.mean_)
print(coef_original)
print(intercept_original)

[-0.3279282   0.01331759 -0.01998412 -0.00618452 -0.0169986   0.79297106
  1.34890571]
[-18.95114725]


In [334]:
# Predict the target for the held-out rows with the SGD model; the input must
# be the scaled test features because the model was fitted on scaled data.
# NOTE(review): this overwrites the y_pred produced earlier by the linear
# regression model.
y_pred = sgd.predict(X_test_scaled)

In [335]:
# Coefficient of determination of the current predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.7926624586283864