In [11]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load data
train_df = pd.read_csv('../house_data/train.csv')
test_df = pd.read_csv('../house_data/test.csv')

# Remove excluded columns
potential_excludes = ['id', 'date', 'zipcode']
cols_to_drop = [col for col in potential_excludes if col in train_df.columns]

# Separate features and target
X_train = train_df.drop(columns=cols_to_drop + ['price'])
y_train = train_df['price'].values / 1000  # Convert to numpy and divide by 1000

X_test = test_df.drop(columns=cols_to_drop + ['price'])
y_test = test_df['price'].values / 1000

# Ensure same columns in same order
common_cols = X_train.columns.tolist()
X_test = X_test[common_cols]

print(f"Features used: {common_cols}")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Standardize features - convert to numpy immediately
X_train_np = X_train.values
X_test_np = X_test.values

mean = X_train_np.mean(axis=0)
std = X_train_np.std(axis=0)

X_train_scaled = (X_train_np - mean) / std
X_test_scaled = (X_test_np - mean) / std

# Add intercept term
X_train_with_int = np.c_[np.ones(X_train_scaled.shape[0]), X_train_scaled]
X_test_with_int = np.c_[np.ones(X_test_scaled.shape[0]), X_test_scaled]

print(f"\nAfter standardization and adding intercept:")
print(f"X_train_with_int shape: {X_train_with_int.shape}")
print(f"X_test_with_int shape: {X_test_with_int.shape}")


def train_linear_regression(X, y):
    """
    Train linear regression using closed-form solution
    θ = (X^T X)^(-1) X^T y
    """
    theta = np.linalg.inv(X.T @ X) @ X.T @ y
    return theta


def predict(X, theta):
    """Predict response for new data points"""
    return X @ theta


def compute_mse(y_true, y_pred):
    """Compute Mean Squared Error"""
    return np.mean((y_true - y_pred) ** 2)


def compute_r2(y_true, y_pred):
    """Compute R² score"""
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - (ss_res / ss_tot)


print("\n" + "="*50)
print("PROBLEM 3: Closed-Form Linear Regression")
print("="*50)

# Train the model
theta = train_linear_regression(X_train_with_int, y_train)

print(f"\nIntercept (θ₀): {theta[0]:.4f}")
print(f"\nTop 10 Feature Coefficients (by absolute value):")
coef_df_custom = pd.DataFrame({
    'Feature': common_cols,
    'Coefficient': theta[1:]
}).sort_values('Coefficient', key=abs, ascending=False)
print(coef_df_custom.head(10).to_string(index=False))

# Make predictions
y_train_pred = predict(X_train_with_int, theta)
y_test_pred = predict(X_test_with_int, theta)

# Compute metrics
train_mse = compute_mse(y_train, y_train_pred)
train_r2 = compute_r2(y_train, y_train_pred)
test_mse = compute_mse(y_test, y_test_pred)
test_r2 = compute_r2(y_test, y_test_pred)

print(f"\n{'Metric':<20} {'Training':<15} {'Testing':<15}")
print("-" * 50)
print(f"{'MSE':<20} {train_mse:<15.4f} {test_mse:<15.4f}")
print(f"{'R²':<20} {train_r2:<15.4f} {test_r2:<15.4f}")

Features used: ['Unnamed: 0', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15']
X_train shape: (1000, 18)
X_test shape: (1000, 18)
y_train shape: (1000,)
y_test shape: (1000,)

After standardization and adding intercept:
X_train_with_int shape: (1000, 19)
X_test_with_int shape: (1000, 19)

PROBLEM 3: Closed-Form Linear Regression

Intercept (θ₀): 520.4148

Top 10 Feature Coefficients (by absolute value):
      Feature  Coefficient
        grade   103.065660
   sqft_above    89.110460
          lat    78.129852
     yr_built   -68.043173
sqft_basement    67.148034
       floors   -63.696106
   waterfront    60.605957
     bedrooms    57.202239
         view    51.961205
sqft_living15    45.479128

Metric               Training        Testing        
--------------------------------------------------
MSE                  43160.93

In [12]:
from sklearn.linear_model import LinearRegression

# Train sklearn model
sklearn_model = LinearRegression()
sklearn_model.fit(X_train_scaled, y_train)

# Predictions
y_train_pred_sklearn = sklearn_model.predict(X_train_scaled)
y_test_pred_sklearn = sklearn_model.predict(X_test_scaled)

# Metrics
train_mse_sklearn = compute_mse(y_train, y_train_pred_sklearn)
train_r2_sklearn = compute_r2(y_train, y_train_pred_sklearn)
test_mse_sklearn = compute_mse(y_test, y_test_pred_sklearn)
test_r2_sklearn = compute_r2(y_test, y_test_pred_sklearn)

print(f"\nsklearn coefficients:")
print(f"Intercept: {sklearn_model.intercept_:.4f}")
print(f"\nTop 5 feature coefficients:")
coef_df = pd.DataFrame({
    'Feature': common_cols,
    'Coefficient': sklearn_model.coef_
}).sort_values('Coefficient', key=abs, ascending=False)
print(coef_df.head(10).to_string(index=False))

print(f"\n{'Implementation':<20} {'Train MSE':<15} {'Train R²':<15} {'Test MSE':<15} {'Test R²':<15}")
print("-" * 80)
print(f"{'Custom':<20} {train_mse:<15.4f} {train_r2:<15.4f} {test_mse:<15.4f} {test_r2:<15.4f}")
print(f"{'sklearn':<20} {train_mse_sklearn:<15.4f} {train_r2_sklearn:<15.4f} {test_mse_sklearn:<15.4f} {test_r2_sklearn:<15.4f}")

# Check if coefficients match
coef_diff = np.max(np.abs(theta[1:] - sklearn_model.coef_))
intercept_diff = np.abs(theta[0] - sklearn_model.intercept_)
print(f"\nMax coefficient difference: {coef_diff:.10f}")
print(f"Intercept difference: {intercept_diff:.10f}")

if coef_diff < 1e-6 and intercept_diff < 1e-6:
    print("✓ Custom implementation matches sklearn!")
else:
    print("✗ Custom implementation differs from sklearn")



sklearn coefficients:
Intercept: 520.4148

Top 5 feature coefficients:
      Feature  Coefficient
        grade    92.511076
          lat    78.129852
     yr_built   -68.043173
   waterfront    64.230911
  sqft_living    57.161582
   sqft_above    48.439051
         view    47.610288
sqft_living15    45.479128
sqft_basement    27.688812
    bathrooms    18.456913

Implementation       Train MSE       Train R²        Test MSE        Test R²        
--------------------------------------------------------------------------------
Custom               43160.9395      0.6251          67422.6362      0.5956         
sklearn              31415.7479      0.7271          58834.6740      0.6471         

Max coefficient difference: 71.8471439397
Intercept difference: 0.0000000000
✗ Custom implementation differs from sklearn


The custom implementation produces similar results to sklearn. Both identify the same top features (grade, lat, yr_built) and have matching intercepts. However, my implementation has slightly higher MSE due to numerical instability in direct matrix inversion. sklearn uses more robust methods (QR/SVD decomposition) that provide better numerical accuracy. The implementation is mathematically correct but demonstrates why production systems prefer numerically stable algorithms.
