In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate, RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
df = pd.read_csv("combined_B_&_C.csv")

In [3]:
print(df.head())

         date      price  bedrooms  bathrooms  sqft_living  sqft_lot  floors  \
0  2014-05-02   313000.0       3.0       1.50         1340      7912     1.5   
1  2014-05-02  2384000.0       5.0       2.50         3650      9050     2.0   
2  2014-05-02   342000.0       3.0       2.00         1930     11947     1.0   
3  2014-05-02   420000.0       3.0       2.25         2000      8030     1.0   
4  2014-05-02   550000.0       4.0       2.50         1940     10500     1.0   

   waterfront  view  condition  ...  city_grouped_SeaTac  \
0           0     0          3  ...                False   
1           0     4          5  ...                False   
2           0     0          4  ...                False   
3           0     0          4  ...                False   
4           0     0          4  ...                False   

   city_grouped_Seattle  city_grouped_Shoreline  city_grouped_Snoqualmie  \
0                 False                    True                    False   
1     

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9200 entries, 0 to 9199
Data columns (total 79 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   date                           9200 non-null   object 
 1   price                          9200 non-null   float64
 2   bedrooms                       9200 non-null   float64
 3   bathrooms                      9200 non-null   float64
 4   sqft_living                    9200 non-null   int64  
 5   sqft_lot                       9200 non-null   int64  
 6   floors                         9200 non-null   float64
 7   waterfront                     9200 non-null   int64  
 8   view                           9200 non-null   int64  
 9   condition                      9200 non-null   int64  
 10  sqft_above                     9200 non-null   int64  
 11  sqft_basement                  9200 non-null   int64  
 12  yr_built                       9200 non-null   i

In [5]:
# Replace infinite values with NaN in the entire DataFrame
df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [6]:
# 3. Define target and features
# We use log_price as the target and remove both 'price' and 'log_price' from the features
target_col = 'log_price'
df.dropna(subset=[target_col], inplace=True)  # ensure no missing target values

In [7]:
# 2. Data Cleaning
# Drop non-numeric columns that are not needed for regression
cols_to_drop = ['date', 'street', 'city', 'statezip', 'country', 'bedrooms_bin', 'bathrooms_bin', 'yr_built_bin']
df.drop(columns=cols_to_drop, errors='ignore', inplace=True)

In [9]:
# 3. Definefeatures

X = df.drop(columns=['price', 'log_price'], errors='ignore')
y = df[target_col]

In [10]:
if 'yr_renovated_bin' in X.columns:
    X.drop(columns=['yr_renovated_bin'], inplace=True)

In [11]:
# One-hot encode the floors_bin column if it exists (since it has values like '1', '2', '3+')
if 'floors_bin' in df.columns:
    df = pd.get_dummies(df, columns=['floors_bin'], drop_first=True)

In [12]:
# Convert boolean columns to integers (0/1)
df = df.apply(lambda x: x.astype(int) if x.dtype == 'bool' else x)

In [15]:
# Optional sanity check
print(X.select_dtypes(exclude='number').columns)

Index([], dtype='object')


In [14]:
# Just in case, Ensure X contains only numeric columns (drop any remaining non-numeric features)
X = X.select_dtypes(include=[np.number]).copy()

In [16]:
# 4. Split the data into training and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# 5. Setup Cross-Validation and scoring metrics
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)
# Using sklearn's built-in scorers for MAE and RMSE.
scoring = {
    'MAE': 'neg_mean_absolute_error', 
    'RMSE': 'neg_root_mean_squared_error'
}

In [19]:
from sklearn.impute import SimpleImputer
# 6. Build Pipelines for the three models

# Baseline: OLS (Linear Regression)
ols_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('reg', LinearRegression())
])

# Ridge Regression with built-in cross-validation for alpha tuning
ridge_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('reg', RidgeCV(alphas=[0.01, 0.1, 1, 10, 100], cv=5))
])

# Lasso Regression with built-in cross-validation for alpha tuning
lasso_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('reg', LassoCV(alphas=[0.01, 0.1, 1, 10, 100], cv=5, random_state=42, max_iter=10000))
])

In [20]:
# 7. Evaluation function: performs CV and computes test set errors
def evaluate_model(pipeline, X_train, y_train, X_test, y_test, scoring, cv):
    # Cross-validation on training set
    cv_results = cross_validate(pipeline, X_train, y_train, cv=cv, scoring=scoring, return_train_score=True)
    
    # Fit the pipeline on the full training set
    pipeline.fit(X_train, y_train)
    # Predict on the test set
    y_pred_test = pipeline.predict(X_test)
    
    # Compute test errors
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    # Convert negative scores to positive values
    train_mae = -np.mean(cv_results['train_MAE'])
    train_rmse = -np.mean(cv_results['train_RMSE'])
    cv_mae = -np.mean(cv_results['test_MAE'])
    cv_rmse = -np.mean(cv_results['test_RMSE'])
    
    return {
        'Train MAE': train_mae,
        'Train RMSE': train_rmse,
        'CV MAE': cv_mae,
        'CV RMSE': cv_rmse,
        'Test MAE': test_mae,
        'Test RMSE': test_rmse
    }

In [21]:
# 8. Evaluate each model
results_ols   = evaluate_model(ols_pipeline,   X_train, y_train, X_test, y_test, scoring, cv)
results_ridge = evaluate_model(ridge_pipeline, X_train, y_train, X_test, y_test, scoring, cv)
results_lasso = evaluate_model(lasso_pipeline, X_train, y_train, X_test, y_test, scoring, cv)

In [25]:
# 9. Compile results into a summary table
results_df = pd.DataFrame({
    'Model': ['OLS', 'Ridge', 'Lasso'],
    'Train MAE': [results_ols['Train MAE'], results_ridge['Train MAE'], results_lasso['Train MAE']],
    'CV MAE': [results_ols['CV MAE'], results_ridge['CV MAE'], results_lasso['CV MAE']],
    'Test MAE': [results_ols['Test MAE'], results_ridge['Test MAE'], results_lasso['Test MAE']],
    'Train RMSE': [results_ols['Train RMSE'], results_ridge['Train RMSE'], results_lasso['Train RMSE']],
    'CV RMSE': [results_ols['CV RMSE'], results_ridge['CV RMSE'], results_lasso['CV RMSE']],
    'Test RMSE': [results_ols['Test RMSE'], results_ridge['Test RMSE'], results_lasso['Test RMSE']]
})

print("Model Evaluation Results:\n", results_df)

Model Evaluation Results:
    Model  Train MAE    CV MAE  Test MAE  Train RMSE   CV RMSE  Test RMSE
0    OLS   0.159973  0.164084  0.172419    0.238016  0.361477   0.259211
1  Ridge   0.160866  0.164578  0.172815    0.238457  0.357001   0.258614
2  Lasso   0.209395  0.213556  0.233379    0.291195  0.420187   0.306902


In [26]:
# 10. Save the evaluation table to a CSV file
results_df.to_csv('linear_model_evaluation.csv', index=False)
print("Evaluation results saved to 'linear_model_evaluation.csv'")

Evaluation results saved to 'linear_model_evaluation.csv'


In [27]:
from sklearn.metrics import r2_score

# Predict on the test set for each model using the already fitted pipelines
y_pred_ols   = ols_pipeline.predict(X_test)
y_pred_ridge = ridge_pipeline.predict(X_test)
y_pred_lasso = lasso_pipeline.predict(X_test)

# Calculate R^2 score for each model
r2_ols   = r2_score(y_test, y_pred_ols)
r2_ridge = r2_score(y_test, y_pred_ridge)
r2_lasso = r2_score(y_test, y_pred_lasso)

# Print the previously calculated evaluation metrics along with the R^2 score
print("Detailed Model Evaluation Results:")
print("OLS Results:")
print(f"  Train MAE: {results_ols['Train MAE']:.4f}")
print(f"  Train RMSE: {results_ols['Train RMSE']:.4f}")
print(f"  CV MAE: {results_ols['CV MAE']:.4f}")
print(f"  CV RMSE: {results_ols['CV RMSE']:.4f}")
print(f"  Test MAE: {results_ols['Test MAE']:.4f}")
print(f"  Test RMSE: {results_ols['Test RMSE']:.4f}")
print(f"  Test R^2: {r2_ols:.4f}\n")

print("Ridge Results:")
print(f"  Train MAE: {results_ridge['Train MAE']:.4f}")
print(f"  Train RMSE: {results_ridge['Train RMSE']:.4f}")
print(f"  CV MAE: {results_ridge['CV MAE']:.4f}")
print(f"  CV RMSE: {results_ridge['CV RMSE']:.4f}")
print(f"  Test MAE: {results_ridge['Test MAE']:.4f}")
print(f"  Test RMSE: {results_ridge['Test RMSE']:.4f}")
print(f"  Test R^2: {r2_ridge:.4f}\n")

print("Lasso Results:")
print(f"  Train MAE: {results_lasso['Train MAE']:.4f}")
print(f"  Train RMSE: {results_lasso['Train RMSE']:.4f}")
print(f"  CV MAE: {results_lasso['CV MAE']:.4f}")
print(f"  CV RMSE: {results_lasso['CV RMSE']:.4f}")
print(f"  Test MAE: {results_lasso['Test MAE']:.4f}")
print(f"  Test RMSE: {results_lasso['Test RMSE']:.4f}")
print(f"  Test R^2: {r2_lasso:.4f}")


Detailed Model Evaluation Results:
OLS Results:
  Train MAE: 0.1600
  Train RMSE: 0.2380
  CV MAE: 0.1641
  CV RMSE: 0.3615
  Test MAE: 0.1724
  Test RMSE: 0.2592
  Test R^2: 0.7624

Ridge Results:
  Train MAE: 0.1609
  Train RMSE: 0.2385
  CV MAE: 0.1646
  CV RMSE: 0.3570
  Test MAE: 0.1728
  Test RMSE: 0.2586
  Test R^2: 0.7635

Lasso Results:
  Train MAE: 0.2094
  Train RMSE: 0.2912
  CV MAE: 0.2136
  CV RMSE: 0.4202
  Test MAE: 0.2334
  Test RMSE: 0.3069
  Test R^2: 0.6669


OLS better

In [30]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 9102 entries, 0 to 9199
Data columns (total 72 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   price                          9102 non-null   float64
 1   bedrooms                       9102 non-null   float64
 2   bathrooms                      9102 non-null   float64
 3   sqft_living                    9102 non-null   int64  
 4   sqft_lot                       9102 non-null   int64  
 5   floors                         9102 non-null   float64
 6   waterfront                     9102 non-null   int64  
 7   view                           9102 non-null   int64  
 8   condition                      9102 non-null   int64  
 9   sqft_above                     9102 non-null   int64  
 10  sqft_basement                  9102 non-null   int64  
 11  yr_built                       9102 non-null   int64  
 12  yr_renovated                   9102 non-null   int64 