In [1]:
import os
# Check if we are in the notebooks folder, if so, move up
if os.getcwd().endswith("notebooks"):
    os.chdir("..")

print("Current Working Directory:", os.getcwd())

Current Working Directory: c:\Users\PC\Regression_ML_EndtoEnd


In [2]:
# ================================================
# 1. Imports
# ================================================
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [3]:
# ================================================
# 2. Load datasets (train + eval)
# ================================================
train_df = pd.read_csv("data/processed/feature_engineered_train.csv")
eval_df  = pd.read_csv("data/processed/feature_engineered_eval.csv")

In [4]:
'''
# ================================================
# 3. Drop high VIF features (both train + eval)
# ================================================
high_vif_features = [
    "median_sale_price" #highest correlation to 'price' => data leakage
]
train_df.drop(columns=high_vif_features, inplace=True)
eval_df.drop(columns=high_vif_features, inplace=True)
'''



In [5]:
# ================================================
# 4. Define target & features
# ================================================
target = "price"
X_train = train_df.drop(columns=[target])
y_train = train_df[target]

X_eval = eval_df.drop(columns=[target])
y_eval = eval_df[target]

In [6]:
# ================================================
# 5. Standardization (fit on train, transform eval)
# ================================================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_eval_scaled  = scaler.transform(X_eval)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import numpy as np

# ================================================
# FIX: Handle Missing Values First
# ================================================

# 1. Create an Imputer (fills blanks with the median number)
imputer = SimpleImputer(strategy='median')

# 2. Fill the missing values in your scaled data
X_train_clean = imputer.fit_transform(X_train_scaled)
X_eval_clean = imputer.transform(X_eval_scaled)

# ================================================
# 6. Train & Evaluate Models
# ================================================

# --- Linear Regression ---
lr = LinearRegression()

# Train on the CLEAN data (not the one with NaNs)
lr.fit(X_train_clean, y_train)

# Predict on the CLEAN evaluation data
y_pred_lr = lr.predict(X_eval_clean)

print("Linear Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_lr))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_lr)))
print(" R²:", r2_score(y_eval, y_pred_lr))



Linear Regression:
 MAE: 53811.93813400812
 RMSE: 121336.13469295985
 R²: 0.8862267031700115


In [8]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import numpy as np

# ================================================
# 1. Ensure Data is Clean (Fill NaNs)
# ================================================
# We create an imputer to fill missing values with the median
imputer = SimpleImputer(strategy='median')
X_train_clean = imputer.fit_transform(X_train_scaled)
X_eval_clean = imputer.transform(X_eval_scaled)

# ================================================
# 2. Train Ridge Regression
# ================================================
ridge = Ridge(alpha=1.0)

# IMPORTANT: Train on X_train_clean (not X_train_scaled)
ridge.fit(X_train_clean, y_train)

# Predict using X_eval_clean
y_pred_ridge = ridge.predict(X_eval_clean)

print("\nRidge Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_ridge))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_ridge)))
print(" R²:", r2_score(y_eval, y_pred_ridge))




Ridge Regression:
 MAE: 53811.114660439474
 RMSE: 121338.0255149869
 R²: 0.8862231572068495


In [9]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import numpy as np

# ================================================
# 1. Ensure Data is Clean (Fill NaNs)
# ================================================
# (If you already ran this for Ridge, you technically don't need to run it again, 
# but it doesn't hurt to be safe!)
imputer = SimpleImputer(strategy='median')
X_train_clean = imputer.fit_transform(X_train_scaled)
X_eval_clean = imputer.transform(X_eval_scaled)

# ================================================
# 2. Train Lasso Regression
# ================================================
lasso = Lasso(alpha=0.1)

# IMPORTANT: Train on X_train_clean
lasso.fit(X_train_clean, y_train)

# Predict using X_eval_clean
y_pred_lasso = lasso.predict(X_eval_clean)

print("\nLasso Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_lasso))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_lasso)))
print(" R²:", r2_score(y_eval, y_pred_lasso))




Lasso Regression:
 MAE: 54117.426071230286
 RMSE: 121604.4782343721
 R²: 0.8857229111303108


  model = cd_fast.enet_coordinate_descent(


In [10]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import numpy as np

# ================================================
# 1. Ensure Data is Clean (Fill NaNs)
# ================================================
# (Again, mostly for safety if you run this cell independently)
imputer = SimpleImputer(strategy='median')
X_train_clean = imputer.fit_transform(X_train_scaled)
X_eval_clean = imputer.transform(X_eval_scaled)

# ================================================
# 2. Train ElasticNet Regression
# ================================================
elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)

# IMPORTANT: Train on X_train_clean
elastic.fit(X_train_clean, y_train)

# Predict using X_eval_clean
y_pred_elastic = elastic.predict(X_eval_clean)

print("\nElasticNet Regression:")
print(" MAE:", mean_absolute_error(y_eval, y_pred_elastic))
print(" RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_elastic)))
print(" R²:", r2_score(y_eval, y_pred_elastic))




ElasticNet Regression:
 MAE: 54234.2493206146
 RMSE: 122295.84870428537
 R²: 0.8844197946433394
