In [None]:
import os
from datetime import datetime

import pandas as pd
from joblib import dump
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline

# Train and evaluate a linear-regression baseline on the preprocessed data.
#
# Steps: load the CSV, hold out 20% of the rows, mean-impute missing feature
# values using statistics from the training rows only, estimate RMSE with
# 5-fold cross-validation, report train/test MSE, MAE and R2, then persist the
# fitted model and a one-row metrics CSV under a timestamped name.

# Load Preprocessed Data
file_path = r'...\data\standardized_data.csv'
df = pd.read_csv(file_path)
X = df.drop(columns=['price'])
y = df['price']

# Split BEFORE imputing so the held-out rows never contribute to the
# imputation means (fitting the imputer on the full data leaks test-set
# information into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Impute missing values; the statistics are learned from the training rows only
imputer = SimpleImputer(strategy='mean')  # You can also use 'median' or 'most_frequent'
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Perform 5-fold cross-validation on the full dataset.
# The imputer lives inside the pipeline so it is re-fit on each fold's training
# part. The built-in 'neg_root_mean_squared_error' scorer replaces
# make_scorer(mean_squared_error, squared=False): the `squared` keyword was
# removed from mean_squared_error in recent scikit-learn releases. The scorer
# returns negated RMSE (higher is better), so flip the sign to report RMSE.
cv_pipeline = make_pipeline(SimpleImputer(strategy='mean'), LinearRegression())
cv_scores = -cross_val_score(
    cv_pipeline, X, y, cv=5, scoring='neg_root_mean_squared_error'
)
cv_rmse_mean = cv_scores.mean()
cv_rmse_std = cv_scores.std()

# Fit the model on the standard train-test split for comparison
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate Model Performance on the training set
y_train_pred = model.predict(X_train)
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

# Evaluate Model Performance on the test split
y_test_pred = model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Print Cross-Validation Results
print("Cross-Validation RMSE Scores:", cv_scores)
print("Mean CV RMSE:", cv_rmse_mean)
print("Standard Deviation of CV RMSE:", cv_rmse_std)

# Print Train and Test Set Results
print(f'Train Set Results - MSE: {mse_train}, MAE: {mae_train}, R2 Score: {r2_train}')
print(f'Test Set Results - MSE: {mse_test}, MAE: {mae_test}, R2 Score: {r2_test}')

# Save the Model and Results
results_path = r'...\results'
os.makedirs(results_path, exist_ok=True)  # Create directory if it doesn't exist

# Save model with timestamp.
# NOTE: only the fitted LinearRegression is persisted (same artifact type as
# before); the fitted `imputer` is not saved, so inputs must already be free
# of missing values when this model is loaded for prediction.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_file = os.path.join(results_path, f'linear_regression_model_{timestamp}.joblib')
dump(model, model_file)

# Save results to CSV with timestamp
results_file = os.path.join(results_path, f'linear_regression_results_{timestamp}.csv')
pd.DataFrame([{
    'Model': 'Linear Regression',
    'Train MSE': mse_train,
    'Train MAE': mae_train,
    'Train R2': r2_train,
    'Test MSE': mse_test,
    'Test MAE': mae_test,
    'Test R2': r2_test,
    'CV Mean RMSE': cv_rmse_mean,
    'CV Std RMSE': cv_rmse_std
}]).to_csv(results_file, index=False)

print(f"\nModel and results saved to {results_path}.")




Cross-Validation RMSE Scores: [0.83030271 0.84095125 0.82410886 0.84324833 0.82269665]
Mean CV RMSE: 0.832261560347105
Standard Deviation of CV RMSE: 0.00846180766474405
Train Set Results - MSE: 0.6869076570921281, MAE: 0.5666027469701783, R2 Score: 0.30984254264940014
Test Set Results - MSE: 0.7132382241803303, MAE: 0.5771176644339918, R2 Score: 0.29979137506532316

Model and results saved to C:\Users\izama\Desktop\machine learning\results.


