# Implementing Linear Regression

In [None]:
from ucimlrepo import fetch_ucirepo 
  
# Fetch the dataset registered under id 9 via ucimlrepo (bound to `auto_mpg`)
auto_mpg = fetch_ucirepo(id=9) 
  
# Feature table and target table (as pandas dataframes); the later cells
# refer to them as X and y
X = auto_mpg.data.features 
y = auto_mpg.data.targets 
  
# Last expression of the cell, so the notebook displays the dataset metadata
auto_mpg.metadata

In [None]:
# Display the variable information that accompanies the fetched dataset
auto_mpg.variables


In [None]:
# Dimensions of the feature table as (rows, columns)
X.shape

In [None]:
# Preview the first five rows of the feature table
X.head(5)

In [None]:
# Preview the first three rows of the target table
y.head(3)

In [None]:
import matplotlib.pyplot as plt
# Optional exploratory histograms, left disabled; uncomment one line at a
# time to plot the distribution of that feature column.
# plt.hist(X['horsepower'])

# plt.hist(X['origin'])

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The split frames are row subsets of X. Take explicit copies so the column
# assignments below write to independent frames and cannot trigger pandas'
# SettingWithCopyWarning (or silently fail to propagate).
X_train = X_train.copy()
X_test = X_test.copy()

# Imputer for the numeric 'horsepower' column (presumably the column with
# missing values in this dataset -- confirm with X.isna().sum()).
imputer = SimpleImputer(strategy='median')  # or 'mean' if you prefer
feat = 'horsepower'

# Fit on the training split only, then reuse the learned statistic for the
# test split so no information from the test data leaks into preprocessing.
X_train[feat] = imputer.fit_transform(X_train[[feat]])
X_test[feat] = imputer.transform(X_test[[feat]])

# Standardize every feature with the mean/variance learned from the training
# split; the test split is transformed with those same statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Build and train the model on the standardized training features.
# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for both splits.
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Test-set R² plus the mean squared error on each split.
r2 = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mse = mean_squared_error(y_train, y_train_pred)

print(
    f"Train MSE: {train_mse:.4f}",
    f"Test MSE:  {test_mse:.4f}",
    f"R² Score (test): {r2:.4f}",
    sep="\n",
)

# Learned parameters of the fitted model.
print("\nModel Coefficients:", model.coef_, sep="\n")
print("Intercept:", model.intercept_)


## Naive NumPy implementation

In [None]:
# numpy must be imported here: this is the first cell that uses `np`, and no
# earlier cell imports it, so running the notebook top to bottom would
# otherwise raise NameError.
import numpy as np

# Scratch experiment: insert a length-1 axis at position 1 of the
# standardized training matrix.
t = np.expand_dims(X_train_scaled, axis=1)

# TODO: compute XtX = X^T X,
# check if it is invertible,
# then do the matrix multiplications

In [None]:
# Shape of the array after np.expand_dims inserted the extra axis at position 1
t.shape

In [None]:
# Shape of the standardized training matrix, for comparison with t.shape
X_train_scaled.shape

In [None]:
import numpy as np

# X_train_scaled is the standardized training feature matrix
# (n_samples × n_features) and y_train holds the matching targets.

# 1️⃣ Add a column of 1s for the bias/intercept term
X_b = np.c_[np.ones((X_train_scaled.shape[0], 1)), X_train_scaled]   # shape becomes (n_samples, n_features + 1)

# Convert the targets to a plain ndarray. np.asarray leaves an existing
# ndarray untouched, so this cell can be re-run safely (calling .to_numpy()
# a second time would fail because ndarrays have no such method).
y_train = np.asarray(y_train)

# 2️⃣ Compute the Normal Equation: solve (XᵀX) θ = Xᵀy for θ.
# Solving the linear system directly is more accurate and cheaper than
# forming the explicit inverse of XᵀX; like np.linalg.inv, it raises
# numpy.linalg.LinAlgError when XᵀX is singular.
theta_best = np.linalg.solve(X_b.T @ X_b, X_b.T @ y_train)

# 3️⃣ Optional: Separate intercept and coefficients
# Row 0 belongs to the column of ones (the bias); the remaining rows are the
# per-feature weights.
intercept = theta_best[0]
coefficients = theta_best[1:]

print("Intercept:", intercept)
print("Coefficients:", coefficients)


In [None]:
# Shape of the weight array sliced from theta_best (bias row excluded)
print(coefficients.shape)

In [None]:
# Shape of the standardized training matrix; its column count should match
# the number of entries in `coefficients`
print(X_train_scaled.shape)

In [None]:
# First row of the standardized test matrix, used as a single example input
a = X_test_scaled[0,:]

def predict_one(x, coefficients, intercept):
    """Return the linear-model prediction for a single example.

    Parameters
    ----------
    x : array-like
        Feature values of one example.
    coefficients : array-like
        Model weights; combined with ``x`` through ``np.dot``.
    intercept : scalar or array-like
        Bias term added to the weighted sum.

    Returns
    -------
    The value ``np.dot(x, coefficients) + intercept``.
    """
    weighted_sum = np.dot(x, coefficients)
    return weighted_sum + intercept

# Smoke-test call on the single example `a`; the return value is not stored,
# and it is not the cell's last expression, so a notebook will not display it.
predict_one(a,coefficients, intercept)

def predict_all(X, coefficients, intercept):
    """Return linear-model predictions for every row of ``X``.

    Computes ``X @ coefficients + intercept`` as a single matrix product
    instead of looping over the rows in Python.

    Parameters
    ----------
    X : array-like
        2-D feature matrix with one example per row.
    coefficients : array-like
        Model weights; the leading dimension must equal the number of
        columns of ``X``.
    intercept : scalar or array-like
        Bias term, broadcast onto every prediction.

    Returns
    -------
    numpy.ndarray
        One prediction per row of ``X``. Column-vector coefficients of shape
        ``(n_features, 1)`` give an ``(n_examples, 1)`` result; 1-D
        coefficients give an ``(n_examples,)`` result.
    """
    # np.asarray also accepts non-ndarray inputs such as nested lists.
    return np.asarray(X) @ coefficients + intercept

# Training-set predictions from the normal-equation parameters
y_train_pred_s = predict_all(X_train_scaled, coefficients, intercept)

In [None]:
# Peek at the first five training predictions; the two-index slice requires
# y_train_pred_s to be a 2-D array
y_train_pred_s[:5,:]

In [None]:
# Test-set predictions from the same normal-equation parameters
y_test_pred_s = predict_all(X_test_scaled, coefficients, intercept)

In [None]:
# 4️⃣ Evaluate performance of the NumPy (normal-equation) model.
# Every metric must be computed from the *_pred_s arrays returned by
# predict_all; using y_train_pred / y_test_pred here would merely re-score
# the scikit-learn model and hide any error in the NumPy implementation.
train_mse_s = mean_squared_error(y_train, y_train_pred_s)
test_mse_s = mean_squared_error(y_test, y_test_pred_s)
r2_test_s = r2_score(y_test, y_test_pred_s)
r2_train_s = r2_score(y_train, y_train_pred_s)

In [None]:
# Report the NumPy model's metrics, one per line, rounded to four decimals.
print(
    f"Train MSE: {train_mse_s:.4f}",
    f"Test MSE:  {test_mse_s:.4f}",
    f"R² Score (train): {r2_train_s:.4f}",
    f"R² Score (test): {r2_test_s:.4f}",
    sep="\n",
)
