In [None]:
# Ridge Regression for Property Price Prediction
from sklearn.linear_model import Ridge
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from joblib import dump
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
import os
from datetime import datetime

# Read the preprocessed dataset from disk, then separate the target column
# ('price') from the remaining columns, which serve as predictors.
file_path = r'...\data\standardized_data.csv'
df = pd.read_csv(file_path)
y = df['price']
X = df.drop('price', axis=1)

# Split data into training and testing sets (80% train, 20% test).
# The split happens BEFORE imputation so that the held-out rows cannot
# influence any statistic learned during preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values. The imputer is fitted on the training rows only and
# then applied unchanged to the test rows; fitting it on the full dataset
# would leak test-set column means into training and bias the test metrics.
imputer = SimpleImputer(strategy='mean')  # Can also use 'median' or 'most_frequent'
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Fully imputed feature matrix, kept so that code still referring to
# X_imputed keeps working. It only applies the train-fitted imputer.
X_imputed = imputer.transform(X)

# Candidate regularisation strengths explored by the grid search
param_grid = {'alpha': [0.1, 1.0, 10.0, 100.0]}  # Adjust these values based on your needs

# Base estimator: Ridge regression with default settings
model = Ridge()

# Scorer built from MSE. greater_is_better=False makes the score the negated
# error, so the search's "higher is better" selection picks the lowest MSE.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# 5-fold cross-validated search over the alpha grid
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    verbose=1,
)

# Run the search on the training split and keep the best estimator it found
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Metric functions applied, in this order, to each split: MSE, MAE, R2
# Training-split performance of the selected model
y_pred_train = best_model.predict(X_train)
mse_train, mae_train, r2_train = (
    metric(y_train, y_pred_train)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Held-out (test) split performance of the selected model
y_pred_test = best_model.predict(X_test)
mse_test, mae_test, r2_test = (
    metric(y_test, y_pred_test)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report both splits and the hyperparameters chosen by the grid search
print(f'MSE (Train): {mse_train:.4f}, MAE (Train): {mae_train:.4f}, R2 Score (Train): {r2_train:.4f}')
print(f'MSE (Test): {mse_test:.4f}, MAE (Test): {mae_test:.4f}, R2 Score (Test): {r2_test:.4f}')
print(f'Best Hyperparameters: {grid_search.best_params_}')

# Persist the fitted model and its evaluation metrics
results_path = r'...\results'
os.makedirs(results_path, exist_ok=True)  # no error if the directory is already there

# A single timestamp is shared by both output files so they can be paired up
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_file = os.path.join(results_path, f'ridge_model_{timestamp}.joblib')
results_file = os.path.join(results_path, f'ridge_results_{timestamp}.csv')

# Serialise the best estimator with joblib
dump(best_model, model_file)

# Write a one-row CSV holding the train and test metrics
results_row = {
    'Model': 'Ridge Regression',
    'Train MSE': mse_train,
    'Train MAE': mae_train,
    'Train R2': r2_train,
    'Test MSE': mse_test,
    'Test MAE': mae_test,
    'Test R2': r2_test,
}
pd.DataFrame([results_row]).to_csv(results_file, index=False)

print(f"\nModel and results saved to {results_path}.")




Fitting 5 folds for each of 4 candidates, totalling 20 fits
MSE (Train): 0.6869, MAE (Train): 0.5666, R2 Score (Train): 0.3098
MSE (Test): 0.7132, MAE (Test): 0.5771, R2 Score (Test): 0.2998
Best Hyperparameters: {'alpha': 10.0}

Model and results saved to C:\Users\izama\Desktop\machine learning\results.
