In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error,mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_log_error, mean_squared_log_error
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from utils.pipeline import create_pipeline
from utils.data_cleaning import load_and_clean
from utils import save_figure

import warnings
# Suppress every warning (all categories, all modules) for the rest of the
# kernel session so the cell outputs below stay uncluttered.
# NOTE(review): this also hides deprecation and numerical warnings; consider
# narrowing it to the specific category that was noisy — confirm intent.
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset through the project's load_and_clean helper. The resulting
# `df` is the modelling table used below and must contain a 'price' column.
# NOTE(review): verbose=True presumably prints cleaning diagnostics — confirm
# in utils.data_cleaning.
df = load_and_clean(verbose=True)

In [None]:
# Separate the regression target from the predictor columns.
y = df['price']  # Target: the column the model learns to predict
X = df.drop(columns='price')  # Features: every remaining column

In [None]:
# Estimator with scikit-learn's default settings.
model = LinearRegression() #Create linear regression model
# Wrap the estimator in the project's pipeline via the create_pipeline helper.
# NOTE(review): the full `df` (still containing the 'price' target) is passed
# here rather than X — presumably the helper only inspects column types;
# confirm it does not register 'price' as an input feature.
pipeline = create_pipeline(df, model=model) #Create pipeline for linear regression

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train on the training split, then predict prices for the held-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Sanity check: both shapes should report the same number of rows.
print(f"y shape: {y_test.shape}")
print(f"y_pred shape: {y_pred.shape}")

In [None]:
#Evaluation
# Mean squared log error takes logarithms of the values, so scikit-learn raises
# ValueError when y_test or y_pred fall outside its domain (e.g. negative
# prices, which an unconstrained linear model can predict). Compute it up front
# so that case is reported as "n/a" instead of aborting the whole report.
try:
    msle_text = f"{mean_squared_log_error(y_test, y_pred):.5f}"
except ValueError as err:
    msle_text = f"n/a ({err})"

# Print the first ten predicted/actual prices followed by the test-set metrics.
# Slicing happens before .tolist() so only the ten shown values are converted.
print(f"""
      Values of Target 'price':
      {'-' * 80}
      predicted y values:
      {y_pred[:10].tolist()}

      actual y values:
      {y_test.iloc[:10].tolist()}



      Model Performance Metrics:
      {'-' * 80}
      {'mean squared error:': <30} {mean_squared_error(y_test, y_pred):.5f}
      {'root mean squared error:': <30} {root_mean_squared_error(y_test, y_pred):.5f}
      {'mean squared log error:': <30} {msle_text}
      {'mean absolute error:': <30} {mean_absolute_error(y_test, y_pred):.5f}
      {'r2 score:': <30} {r2_score(y_test, y_pred):.5f}
      """)



In [None]:
#Eval in table
model_name = "Linear Regression"

# Score the held-out predictions; RMSE is derived from the MSE computed here.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # Root Mean Squared Error
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One (label, value) pair per table row, in display order.
metric_rows = [
    ("Model Name", model_name),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("Mean Absolute Error (MAE)", mae),
    ("R² Score", r2),
]
metrics_df = pd.DataFrame({
    "Metric": [label for label, _ in metric_rows],
    "Score": [value for _, value in metric_rows],
})

# Render the frame as a table on an axis-free figure so it can be saved as an image.
fig, ax = plt.subplots(figsize=(8, 4))  # Adjust the figure size as needed
ax.axis('tight')
ax.axis('off')
table = ax.table(
    cellText=metrics_df.values,
    colLabels=metrics_df.columns,
    loc='center',
    cellLoc='center',
)

save_figure(fig, "linear regression metrics", subfolder="linear regression")

In [None]:
# Residual = actual price minus predicted price for each held-out row.
residuals = y_test - y_pred

# Scatter the residuals against the predictions; the dashed red line marks
# zero error.
fig = plt.figure(figsize=(10, 6))
residual_ax = sns.scatterplot(x=y_pred, y=residuals, alpha=0.5)
residual_ax.axhline(0, color='red', linestyle='--')
residual_ax.set(
    xlabel="Predicted Values",
    ylabel="Residuals",
    title="Residual Plot",
)

# Persist the figure through the project helper before displaying it.
save_figure(fig, "linear regression residuals", subfolder="linear regression")

plt.show()

In [None]:
# Actual vs predicted prices; points on the dashed red diagonal are exact
# predictions.
fig = plt.figure(figsize=(8, 6))
parity_ax = sns.scatterplot(x=y_test, y=y_pred)

# The diagonal spans the range of the actual values on both axes.
actual_min, actual_max = y_test.min(), y_test.max()
parity_ax.plot(
    [actual_min, actual_max],
    [actual_min, actual_max],
    color='red',
    linestyle='--',
)
parity_ax.set(
    xlabel="Actual Values",
    ylabel="Predicted Values",
    title="Actual vs Predicted Values",
)

# Persist the figure through the project helper before displaying it.
save_figure(fig, "linear regression actual vs predicted", subfolder="linear regression")

plt.show()

In [None]:
# Histogram (30 bins) of the residuals with a KDE overlay.
# Relies on `residuals` computed in the residual-plot cell.
fig = plt.figure(figsize=(8, 6))
hist_ax = sns.histplot(residuals, kde=True, bins=30)
hist_ax.set(xlabel="Residuals", title="Distribution of Residuals")

# Persist the figure through the project helper before displaying it.
save_figure(fig, "linear regression residuals distribution", subfolder="linear regression")

plt.show()

In [None]:
from sklearn.model_selection import learning_curve
import numpy as np

In [None]:
# Score the pipeline with 5-fold CV at ten training-set sizes (10% .. 100% of
# the training split), using all available cores.
train_sizes, train_scores, test_scores = learning_curve(
    pipeline,
    X_train,
    y_train,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10),
)

# Collapse the per-fold scores (axis 1) into a mean and spread per size.
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)

# (legend label, colour, mean, std) for each curve, in drawing order.
curves = (
    ("Training Score", "blue", train_mean, train_std),
    ("Cross-Validation Score", "orange", test_mean, test_std),
)

fig = plt.figure(figsize=(8, 6))
# Mean lines first, then the +/- one standard deviation bands.
for curve_label, curve_color, curve_mean, _ in curves:
    plt.plot(train_sizes, curve_mean, label=curve_label, color=curve_color)
for _, curve_color, curve_mean, curve_std in curves:
    plt.fill_between(
        train_sizes,
        curve_mean - curve_std,
        curve_mean + curve_std,
        color=curve_color,
        alpha=0.2,
    )
plt.xlabel("Training Set Size")
plt.ylabel("R² Score")
plt.title("Learning Curve")
plt.legend()

# Persist the figure through the project helper before displaying it.
save_figure(fig, "linear regression learning curve", subfolder="linear regression")

plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings, keyed as "<pipeline step>__<estimator parameter>".
# NOTE(review): this assumes create_pipeline names the estimator step
# 'model_selection' — confirm against utils.pipeline.
param_grid = {'model_selection__fit_intercept': [True, False]}

# Exhaustive search over the grid: 5-fold CV, ranked by R², on all cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning setting and its mean cross-validated score.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best R² Score: {grid_search.best_score_}")

In [None]:
from joblib import dump

# Serialise the pipeline that was fitted on the training split and evaluated
# in the cells above.
# NOTE(review): this saves `pipeline`, not grid_search.best_estimator_, so the
# grid-search result is not reflected in the saved file — confirm that is intended.
dump(value=pipeline, filename='linear_regression.joblib')