# Implementing Linear Regression

In [15]:
from ucimlrepo import fetch_ucirepo

# Fetch the Auto MPG dataset, which has id 9 in the UCI repository.
auto_mpg = fetch_ucirepo(id=9)

# Feature and target tables (as pandas dataframes); later cells split these.
X = auto_mpg.data.features
y = auto_mpg.data.targets

# Show the dataset metadata as this cell's output.
auto_mpg.metadata

{'uci_id': 9,
 'name': 'Auto MPG',
 'repository_url': 'https://archive.ics.uci.edu/dataset/9/auto+mpg',
 'data_url': 'https://archive.ics.uci.edu/static/public/9/data.csv',
 'abstract': 'Revised from CMU StatLib library, data concerns city-cycle fuel consumption',
 'area': 'Other',
 'tasks': ['Regression'],
 'characteristics': ['Multivariate'],
 'num_instances': 398,
 'num_features': 7,
 'feature_types': ['Real', 'Categorical', 'Integer'],
 'demographics': [],
 'target_col': ['mpg'],
 'index_col': ['car_name'],
 'has_missing_values': 'yes',
 'missing_values_symbol': 'NaN',
 'year_of_dataset_creation': 1993,
 'last_updated': 'Thu Aug 10 2023',
 'dataset_doi': '10.24432/C5859H',
 'creators': ['R. Quinlan'],
 'intro_paper': None,
 'additional_info': {'summary': 'This dataset is a slightly modified version of the dataset provided in the StatLib library.  In line with the use by Ross Quinlan (1993) in predicting the attribute "mpg", 8 of the original instances were removed because they had 

In [16]:
# Per-variable information: name, role, type and whether values are missing
auto_mpg.variables


Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,displacement,Feature,Continuous,,,,no
1,mpg,Target,Continuous,,,,no
2,cylinders,Feature,Integer,,,,no
3,horsepower,Feature,Continuous,,,,yes
4,weight,Feature,Continuous,,,,no
5,acceleration,Feature,Continuous,,,,no
6,model_year,Feature,Integer,,,,no
7,origin,Feature,Integer,,,,no
8,car_name,ID,Categorical,,,,no


In [17]:
# Dimensions of the feature frame as (rows, columns)
X.shape

(398, 7)

In [22]:
# Preview the first five feature rows
X.head(5)

Unnamed: 0,displacement,cylinders,horsepower,weight,acceleration,model_year,origin
0,307.0,8,130.0,3504,12.0,70,1
1,350.0,8,165.0,3693,11.5,70,1
2,318.0,8,150.0,3436,11.0,70,1
3,304.0,8,150.0,3433,12.0,70,1
4,302.0,8,140.0,3449,10.5,70,1


In [24]:
# Preview the first three target values
y.head(3)

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0


In [None]:
import matplotlib.pyplot as plt
# plt.hist(X['horsepower'])

# plt.hist(X['origin'])

In [29]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below modify independent
# frames instead of triggering pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# 'horsepower' is the only feature reported as having missing values.
imputer = SimpleImputer(strategy='median')  # or 'mean' if you prefer
feat = 'horsepower'

# Fit the median on the training split only, then fill the training column.
# The imputer returns an (n, 1) array; ravel() flattens it to match the column.
X_train[feat] = imputer.fit_transform(X_train[[feat]]).ravel()

# Fill the test column using the statistic learned from the training split.
X_test[feat] = imputer.transform(X_test[[feat]]).ravel()

# Standardize: scaling statistics are fit on train and reused for test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Create the estimator and fit it on the standardized training features
# (fit() returns the estimator itself, so the two steps can be chained).
model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for both splits
y_test_pred = model.predict(X_test_scaled)
y_train_pred = model.predict(X_train_scaled)

# Error and goodness-of-fit metrics
r2 = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mse = mean_squared_error(y_train, y_train_pred)

print(f"Train MSE: {train_mse:.4f}")
print(f"Test MSE:  {test_mse:.4f}")
print(f"R² Score (test): {r2:.4f}")

# Learned parameters, for comparison with the closed-form solution
print("\nModel Coefficients:")
print(model.coef_)
print("Intercept:", model.intercept_)


Train MSE: 11.6598
Test MSE:  8.1982
R² Score (test): 0.8475

Model Coefficients:
[[ 1.46510599 -0.26427258 -0.4949466  -5.65910982  0.18758242  2.859148
   1.07320745]]
Intercept: [23.6081761]


## Naive NumPy implementation (normal equation)

In [35]:
import numpy as np  # needed on a fresh kernel: this is the first cell that uses numpy

# Scratch: insert a singleton axis, (n_samples, n_features) -> (n_samples, 1, n_features).
t = np.expand_dims(X_train_scaled, axis=1)

# TODO: form X^T X, check that it is invertible, then do the matrix multiplications.

In [36]:
# Confirm the singleton axis inserted by np.expand_dims
t.shape

(318, 1, 7)

In [37]:
# Shape of the standardized training matrix, for comparison with t
X_train_scaled.shape

(318, 7)

In [39]:
import numpy as np

# X_train_scaled is the (n_samples, n_features) standardized training matrix;
# y_train holds the matching targets.

# 1. Prepend a column of ones so the first parameter acts as the intercept.
X_b = np.c_[np.ones((X_train_scaled.shape[0], 1)), X_train_scaled]   # (n_samples, n_features + 1)

# np.asarray (rather than .to_numpy()) keeps this cell re-runnable: it also
# accepts y_train when it is already an ndarray from a previous run.
y_train = np.asarray(y_train)

# 2. Solve the normal equation (X_b^T X_b) theta = X_b^T y. Solving the linear
# system avoids forming an explicit inverse, which is less accurate.
theta_best = np.linalg.solve(X_b.T @ X_b, X_b.T @ y_train)

# 3. Split the parameter vector into the intercept and the feature coefficients.
intercept = theta_best[0]
coefficients = theta_best[1:]

print("Intercept:", intercept)
print("Coefficients:", coefficients)


Intercept: [23.6081761]
Coefficients: [[ 1.46510599]
 [-0.26427258]
 [-0.4949466 ]
 [-5.65910982]
 [ 0.18758242]
 [ 2.859148  ]
 [ 1.07320745]]


In [41]:
# The coefficients form a column vector with one row per feature
print(coefficients.shape)

(7, 1)


In [43]:
# Sample and feature counts of the standardized training matrix
print(X_train_scaled.shape)

(318, 7)


In [44]:
# First row of the standardized test matrix, used as a single-sample example
a = X_test_scaled[0,:]

def predict_one(x, coefficients, intercept):
    """Return the linear prediction for a single sample.

    Computes the dot product of the feature vector ``x`` with
    ``coefficients`` and adds ``intercept`` to the result.
    """
    weighted_sum = np.dot(x, coefficients)
    return weighted_sum + intercept

# Smoke-test on one sample; the value is not displayed because this is not the cell's last expression
predict_one(a,coefficients, intercept)

def predict_all(X, coefficients, intercept):
    """Return linear predictions for every row of ``X``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix, one sample per row.
    coefficients : ndarray of shape (n_features, 1) or (n_features,)
        Weights applied to the features.
    intercept : scalar or ndarray
        Bias term added to every prediction.

    Returns
    -------
    ndarray
        ``X @ coefficients + intercept``; shape (n_samples, 1) for column-vector
        coefficients, (n_samples,) for 1-D coefficients with a scalar intercept.
    """
    # A single matrix product replaces the per-row Python loop.
    return np.asarray(X) @ coefficients + intercept

# Closed-form predictions for every training row
y_train_pred_s = predict_all(X_train_scaled, coefficients, intercept)

In [46]:
# First five training predictions from the NumPy model
y_train_pred_s[:5,:]

array([[15.1652094 ],
       [25.27688518],
       [35.955334  ],
       [34.27794257],
       [24.32045547]])

In [47]:
# Closed-form predictions for every test row
y_test_pred_s = predict_all(X_test_scaled, coefficients, intercept)

In [48]:
# 4️⃣ Evaluate performance
train_mse_s = mean_squared_error(y_train, y_train_pred_s)
test_mse_s = mean_squared_error(y_test, y_test_pred)
r2_test_s = r2_score(y_test, y_test_pred)
r2_train_s = r2_score(y_train, y_train_pred)

In [49]:
# Report the *_s metrics computed in the previous cell
print(f"Train MSE: {train_mse_s:.4f}")
print(f"Test MSE:  {test_mse_s:.4f}")
print(f"R² Score (train): {r2_train_s:.4f}")
print(f"R² Score (test): {r2_test_s:.4f}")


Train MSE: 11.6598
Test MSE:  8.1982
R² Score (train): 0.8140
R² Score (test): 0.8475
