# Modeling and Evaluation - Predict House Price

## Objectives

* Fit and evaluate a regression model to predict a house sale price when the house features are known.

## Inputs
* outputs/datasets/collection/housing_data_cleaned.csv
* Feature engineering/data cleaning from the feature engineering notebook.

## Outputs 
* Train set
* Test set
* ML pipeline to predict house sale prices



---

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [1]:
import os
current_dir = os.getcwd()
current_dir

'/workspace/PP5-ML-PROJECT/jupyter_notebooks'

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [2]:
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory


Confirm the new current directory

In [3]:
current_dir = os.getcwd()
current_dir

'/workspace/PP5-ML-PROJECT'

# Load Data

In [4]:
import pandas as pd
import numpy as np
df = pd.read_csv("outputs/datasets/collection/housing_data_cleaned.csv")
# The CSV was written together with its row index, which comes back as a
# column named 'Unnamed: 0'. It is a row counter, not a house attribute, so
# drop it before it can be used as a model feature. errors="ignore" keeps the
# cell working if the file is later re-exported without the index.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df.head(5)

Unnamed: 0,1stFlrSF,2ndFlrSF,BedroomAbvGr,BsmtExposure,BsmtFinSF1,BsmtFinType1,BsmtUnfSF,GarageArea,GarageFinish,GarageYrBlt,...,LotArea,LotFrontage,MasVnrArea,OpenPorchSF,OverallCond,OverallQual,TotalBsmtSF,YearBuilt,YearRemodAdd,SalePrice
0,856.0,854.0,3.0,1.0,706.0,6.0,150.0,548.0,2.0,2003.0,...,8450.0,65.0,196.0,61.0,5,7,856.0,2003,2003,208500.0
1,1262.0,0.0,3.0,4.0,978.0,5.0,284.0,460.0,2.0,1976.0,...,9600.0,80.0,0.0,0.0,8,6,1262.0,1976,1976,181500.0
2,920.0,866.0,3.0,2.0,486.0,6.0,434.0,608.0,2.0,2001.0,...,11250.0,68.0,162.0,42.0,5,7,920.0,2001,2002,223500.0
3,961.0,0.0,3.0,1.0,216.0,5.0,540.0,642.0,1.0,1998.0,...,9550.0,60.0,0.0,35.0,5,7,756.0,1915,1970,140000.0
4,1145.0,0.0,4.0,3.0,655.0,6.0,490.0,836.0,2.0,2000.0,...,14260.0,84.0,350.0,84.0,5,8,1145.0,2000,2000,250000.0


## Create ML Pipeline

In [5]:
### Custom Encoder
from sklearn.base import BaseEstimator, TransformerMixin
class MyCustomEncoder(BaseEstimator, TransformerMixin):
    """Replace category labels with numbers using explicit per-column mappings.

    Parameters
    ----------
    variables : list or single column name
        Column(s) to encode. A single name is wrapped in a list.
    dic : dict
        Maps each column name to a ``{label: number}`` dictionary that is
        handed to ``Series.replace`` for that column.
    """

    def __init__(self, variables, dic):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables
        self.dic = dic

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured object columns encoded.

        Columns whose dtype is not ``object`` are left untouched and a
        warning is printed for each of them.
        """
        # Work on a copy so the caller's DataFrame is never modified in place.
        X = X.copy()
        for col in self.variables:
            if X[col].dtype == 'object':
                # Use the mapping given to this instance, not a global that
                # happens to be called `dic`.
                X[col] = X[col].replace(self.dic[col])
            else:
                print(f"Warning: {col} data type should be object to use MyCustomEncoder()")

        return X

- **Import all necessary ML algorithms**

### Bring in the pipeline from the feature engineering notebook with feature scaling, feature selection and model added:

In [6]:
from sklearn.pipeline import Pipeline
from feature_engine.selection import DropFeatures
from feature_engine.imputation import MeanMedianImputer
from feature_engine.selection import SmartCorrelatedSelection
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

# ML algorithms 
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor

def PipelineOptimization(dic, vars_with_missing_data, model):
    """Assemble the full modelling pipeline around ``model``.

    Steps, in order: custom label encoding, median imputation, Spearman
    correlation filtering, standard scaling, model-based feature selection
    and finally the estimator itself.

    Parameters
    ----------
    dic : dict
        Per-column label mappings passed to ``MyCustomEncoder``.
    vars_with_missing_data : list
        Columns passed to the median imputer.
    model : estimator
        Used both inside ``SelectFromModel`` and as the final step.

    Returns
    -------
    Pipeline
        The unfitted pipeline.
    """
    # A DropFeatures step for 'EnclosedPorch' and 'WoodDeckSF' existed here
    # but is currently disabled.
    encoded_columns = ['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual']

    encoder = MyCustomEncoder(variables=encoded_columns, dic=dic)
    imputer = MeanMedianImputer(imputation_method='median', variables=vars_with_missing_data)
    correlation_filter = SmartCorrelatedSelection(
        method="spearman", threshold=0.9, selection_method="variance"
    )

    steps = [
        ('custom_encoder', encoder),
        ('median_imputer', imputer),
        ('corr_sel', correlation_filter),
        ("feat_scaling", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ]
    return Pipeline(steps)


- **Hyperparameter optimization**: Custom hyperparameter optimization to fit my model

In [7]:
from sklearn.model_selection import GridSearchCV

class HyperparameterOptimizationSearch:
    """Run one GridSearchCV per candidate estimator and summarise the scores.

    Parameters
    ----------
    dic : dict
        Label mappings forwarded to ``PipelineOptimization``.
    vars_with_missing_data : list
        Columns forwarded to ``PipelineOptimization`` for imputation.
    models : dict
        ``{name: estimator}`` of candidates to search over.
    params : dict
        ``{name: param_grid}`` with one grid per entry in ``models``.
    """

    def __init__(self, dic, vars_with_missing_data, models, params):
        self.dic = dic
        self.vars_with_missing_data = vars_with_missing_data
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """Fit a grid search for every model and store it in ``grid_searches``.

        ``refit`` is accepted for interface compatibility only. It is not
        forwarded, so GridSearchCV keeps its own default; the notebook relies
        on ``best_estimator_`` being available afterwards.
        """
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")
            # Build the pipeline from the values given to this instance
            # instead of relying on same-named globals being defined.
            model = PipelineOptimization(
                self.dic, self.vars_with_missing_data, self.models[key]
            )

            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs, verbose=verbose, scoring=scoring)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return per-candidate score statistics and the fitted searches.

        Parameters
        ----------
        sort_by : str
            Summary column used for the descending sort.

        Returns
        -------
        tuple
            ``(summary, grid_searches)`` where ``summary`` has the columns
            estimator, min_score, mean_score, max_score, std_score followed
            by the hyperparameter columns.

        Raises
        ------
        ValueError
            If no grid search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError("No grid search results available; call fit() first.")

        def row(key, scores, params):
            stats = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **stats})

        rows = []
        for name, gs in self.grid_searches.items():
            results = gs.cv_results_
            # n_splits_ is the number of folds actually used, so this also
            # works when `cv` was a splitter object rather than an int.
            all_scores = np.column_stack([
                results[f"split{i}_test_score"] for i in range(gs.n_splits_)
            ])
            # One row of all_scores per hyperparameter candidate.
            rows.extend(
                row(name, scores, params)
                for params, scores in zip(results['params'], all_scores)
            )

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns], self.grid_searches
        

## Split the data into Train and Test sets

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['SalePrice'], axis=1),
    df['SalePrice'],
    test_size=0.2,
    random_state=0,
)

print('Train Set:', X_train.shape, y_train.shape, 'Test Set:', X_test.shape, y_test.shape)

Train Set: (1168, 21) (1168,) Test Set: (292, 21) (292,)


## Grid Search CV from Scikit Learn

Lets try 7 ML models to find the best one

In [9]:
models_quick_search = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=0),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
    "XGBRegressor": XGBRegressor(random_state=0),
}

params_quick_search = {
    "LinearRegression": {},
    "DecisionTreeRegressor": {},
    "RandomForestRegressor": {},
    "ExtraTreesRegressor": {},
    "AdaBoostRegressor": {},
    "GradientBoostingRegressor": {},
    "XGBRegressor": {},
}

In [10]:
dic = {'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0}, 'BsmtFinType1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0}, 'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0}, 'KitchenQual': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}}
vars_with_missing_data = ['2ndFlrSF', 'BedroomAbvGr', 'BsmtFinType1', 'GarageFinish', 'GarageYrBlt', 'LotFrontage', 'MasVnrArea']     


# Train 5 models, 1 for each of 5 cross validations for each algorithm and the hyperparameters

In [None]:
search = HyperparameterOptimizationSearch(dic=dic, vars_with_missing_data=vars_with_missing_data, models=models_quick_search, params=params_quick_search)
search.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=5)


Running GridSearchCV for LinearRegression 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for DecisionTreeRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for RandomForestRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for ExtraTreesRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for AdaBoostRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for GradientBoostingRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for XGBRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):




  if is_sparse(data):




  if is_sparse(data):
  if is_sparse(data):


## Lets check the results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

## Use hyperparameter optimization on the top 4 models using hyperparameter combinations

In [13]:
models_search = {
    "LinearRegression": LinearRegression(),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=0),
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
}

params_search = {
    "LinearRegression":{},
    "GradientBoostingRegressor":{'model__n_estimators': [100,50,140],
                                 'model__learning_rate':[0.1, 0.01, 0.001],
                                 'model__max_depth': [3,15, None],
                                 'model__min_samples_split': [2,50],
                                 'model__min_samples_leaf': [1,50],
                                 'model__max_leaf_nodes': [None,50],
    },
    "ExtraTreesRegressor":{'model__n_estimators': [100,50,150],
                           'model__max_depth': [None, 3, 15],
                           'model__min_samples_split': [2, 50],
                           'model__min_samples_leaf': [1,50],
    },
    "RandomForestRegressor":{'model__n_estimators': [100,50, 140],
                             'model__max_depth': [None,4, 15],
                             'model__min_samples_split': [2,50],
                             'model__min_samples_leaf': [1,50],
                             'model__max_leaf_nodes': [None,50],
    },
}

## Train our models with Grid Search CV

* Totalling 1625 models

In [None]:
search = HyperparameterOptimizationSearch(dic=dic, vars_with_missing_data=vars_with_missing_data, models=models_search, params=params_search)

search.fit(X_train, y_train, scoring= 'r2', n_jobs=1, cv=5)

## Lets check the results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary.head(50)

### Now we will look into all 4 algorithms

## **1. ExtraTreesRegressor**

In [None]:
# Pick this section's estimator by name. A fixed row position in the
# score-sorted summary points at a different estimator whenever the scores
# (and therefore the ordering) change between runs.
best_model = "ExtraTreesRegressor"
best_model

* Best model parameters

In [None]:
grid_search_pipelines[best_model].best_params_

* Best model pipeline

In [None]:
best_pipeline = grid_search_pipelines[best_model].best_estimator_
best_pipeline

## Assess the Feature Importance

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.pipeline import Pipeline

sns.set_style('whitegrid')

# The 'feat_selection' mask describes the columns that actually reach that
# step, i.e. those left after the steps preceding 'feat_scaling' (the
# correlation filter may drop columns) - not X_train.columns. Pairing the mask
# with X_train.columns can silently mislabel features, so recover the real
# names by re-applying only those earlier, already fitted steps.
# NOTE(review): relies on those steps returning a DataFrame - confirm.
step_names = [name for name, _ in best_pipeline.steps]
n_prep_steps = step_names.index('feat_scaling')
columns_after_prep = Pipeline(best_pipeline.steps[:n_prep_steps]).transform(X_train).columns

# Keep only the columns retained by the model-based feature selection
selected_mask = best_pipeline['feat_selection'].get_support()
selected_features = columns_after_prep[selected_mask].to_list()

# Extract final selected features
best_features = selected_features

# Create DataFrame with selected feature importance
df_feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': best_pipeline['model'].feature_importances_
}).sort_values(by='Importance', ascending=False)

# Print the most important features
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

# Plot feature importance
df_feature_importance.plot(kind='bar', x='Feature', y='Importance')

plt.show()


### Evaluate Train and Test Sets

In [24]:
# Following code is from a SKlearn lesson from Code Institute
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error 
import numpy as np

def regression_performance(X_train, y_train, X_test, y_test, pipeline):
    """Print the regression metrics of ``pipeline`` for the train and test sets.

    The train set is reported first, then the test set; each report is
    produced by ``regression_evaluation``.
    """
    print("Model Evaluation \n")
    datasets = (
        ("* Train Set", X_train, y_train),
        ("* Test Set", X_test, y_test),
    )
    for heading, features, target in datasets:
        print(heading)
        regression_evaluation(features, target, pipeline)

def regression_evaluation(X, y, pipeline):
    """Print R2, MAE, MSE and RMSE of ``pipeline``'s predictions on ``X``.

    Parameters
    ----------
    X : features passed to ``pipeline.predict``.
    y : true target values the predictions are compared against.
    pipeline : fitted object exposing ``predict``.
    """
    prediction = pipeline.predict(X)
    # Compute the MSE once and reuse it for the RMSE.
    mse = mean_squared_error(y, prediction)

    # Built-in round() works whether a metric comes back as a NumPy scalar or
    # as a plain Python float (which has no .round() method).
    print('R2 Score:', round(r2_score(y, prediction), 3))
    print('Mean Absolute Error:', round(mean_absolute_error(y, prediction), 3))
    print('Mean Squared Error:', round(mse, 3))
    print('Root Mean Squared Error:', round(np.sqrt(mse), 3))
    print("\n")


def regression_evaluation_plots(X_train, y_train, X_test, y_test, pipeline, alpha_scatter=0.5):
    """Plot actual vs. predicted values for the train and test sets side by side.

    Each panel shows the predictions as a scatter plot and the actual values
    against themselves as a red reference line.

    Parameters
    ----------
    X_train, y_train : train features and target.
    X_test, y_test : test features and target.
    pipeline : fitted object exposing ``predict``.
    alpha_scatter : float
        Transparency of the scatter points.
    """
    pred_train = pipeline.predict(X_train)
    pred_test = pipeline.predict(X_test)

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
    panels = (
        (axes[0], y_train, pred_train, "Train Set"),
        (axes[1], y_test, pred_test, "Test Set"),
    )
    for ax, actual, predicted, title in panels:
        sns.scatterplot(x=actual, y=predicted, alpha=alpha_scatter, ax=ax)
        sns.lineplot(x=actual, y=actual, color='red', ax=ax)
        ax.set_xlabel("Actual")
        ax.tick_params(axis='x', rotation=90)
        ax.set_ylabel("Predictions")
        ax.set_title(title)

    # Render the figure as part of the call, not when the function is defined.
    plt.show()

### Evaluate Model Performance

In [None]:
regression_performance(X_train, y_train, X_test, y_test,best_pipeline)
regression_evaluation_plots(X_train, y_train, X_test, y_test, best_pipeline)


**The R2 score on the test set is 0.77, this means it passes the goal to perform above an R2 score of at least 0.75. This model is trained on 5 features.**

## **2. RandomForestRegressor**

In [None]:
# Pick this section's estimator by name. A fixed row position in the
# score-sorted summary points at a different estimator whenever the scores
# (and therefore the ordering) change between runs.
best_model = "RandomForestRegressor"
best_model

* Best model parameters

In [None]:
grid_search_pipelines[best_model].best_params_

* Best model pipeline

In [None]:
best_pipeline = grid_search_pipelines[best_model].best_estimator_
best_pipeline

## Assess the Feature Importance

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.pipeline import Pipeline

sns.set_style('whitegrid')

# The 'feat_selection' mask describes the columns that actually reach that
# step, i.e. those left after the steps preceding 'feat_scaling' (the
# correlation filter may drop columns) - not X_train.columns. Pairing the mask
# with X_train.columns can silently mislabel features, so recover the real
# names by re-applying only those earlier, already fitted steps.
# NOTE(review): relies on those steps returning a DataFrame - confirm.
step_names = [name for name, _ in best_pipeline.steps]
n_prep_steps = step_names.index('feat_scaling')
columns_after_prep = Pipeline(best_pipeline.steps[:n_prep_steps]).transform(X_train).columns

# Keep only the columns retained by the model-based feature selection
selected_mask = best_pipeline['feat_selection'].get_support()
selected_features = columns_after_prep[selected_mask].to_list()

# Extract final selected features
best_features = selected_features

# Create DataFrame with selected feature importance
df_feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': best_pipeline['model'].feature_importances_
}).sort_values(by='Importance', ascending=False)

# Print the most important features
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

# Plot feature importance
df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()


### Evaluate Model Performance

In [None]:
regression_performance(X_train, y_train, X_test, y_test,best_pipeline)
regression_evaluation_plots(X_train, y_train, X_test, y_test, best_pipeline)


**The R2 score on the test set is 0.753, this means it passes the goal to perform above an R2 score of at least 0.75. This is a great result and may be the best model, however it may have trouble predicting price above £400,000**

## 3. GradientBoostingRegressor

In [None]:
# Pick this section's estimator by name. A fixed row position in the
# score-sorted summary points at a different estimator whenever the scores
# (and therefore the ordering) change between runs.
best_model = "GradientBoostingRegressor"
best_model

* Best model parameters

In [None]:
grid_search_pipelines[best_model].best_params_

* Best model pipeline

In [None]:
best_pipeline = grid_search_pipelines[best_model].best_estimator_
best_pipeline

## Assess Feature Importance

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.pipeline import Pipeline

sns.set_style('whitegrid')

# The 'feat_selection' mask describes the columns that actually reach that
# step, i.e. those left after the steps preceding 'feat_scaling' (the
# correlation filter may drop columns) - not X_train.columns. Pairing the mask
# with X_train.columns can silently mislabel features, so recover the real
# names by re-applying only those earlier, already fitted steps.
# NOTE(review): relies on those steps returning a DataFrame - confirm.
step_names = [name for name, _ in best_pipeline.steps]
n_prep_steps = step_names.index('feat_scaling')
columns_after_prep = Pipeline(best_pipeline.steps[:n_prep_steps]).transform(X_train).columns

# Keep only the columns retained by the model-based feature selection
selected_mask = best_pipeline['feat_selection'].get_support()
selected_features = columns_after_prep[selected_mask].to_list()

# Extract final selected features
best_features = selected_features

# Create DataFrame with selected feature importance
df_feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': best_pipeline['model'].feature_importances_
}).sort_values(by='Importance', ascending=False)

# Print the most important features
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

# Plot feature importance
df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()

### Evaluate Model Performance

In [None]:
regression_performance(X_train, y_train, X_test, y_test,best_pipeline)
regression_evaluation_plots(X_train, y_train, X_test, y_test, best_pipeline)

**The R2 score on the test set is 0.76. It passes the R2 performance requirement of 0.75; however, the model has not predicted any values above circa £450,000.**

## 4. LinearRegression

In [None]:
# Pick this section's estimator by name. A fixed row position in the
# score-sorted summary points at a different estimator whenever the scores
# (and therefore the ordering) change between runs.
best_model = "LinearRegression"
best_model

* Best model parameters

In [None]:
grid_search_pipelines[best_model].best_params_

* Best model pipeline

In [None]:
best_pipeline = grid_search_pipelines[best_model].best_estimator_
best_pipeline


## Assess Feature Importance

LinearRegression has no feature importance so we can remove the df_feature_importance from this assessment

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.pipeline import Pipeline

sns.set_style('whitegrid')

# The 'feat_selection' mask describes the columns that actually reach that
# step, i.e. those left after the steps preceding 'feat_scaling' (the
# correlation filter may drop columns) - not X_train.columns. Pairing the mask
# with X_train.columns can silently mislabel features, so recover the real
# names by re-applying only those earlier, already fitted steps.
# NOTE(review): relies on those steps returning a DataFrame - confirm.
step_names = [name for name, _ in best_pipeline.steps]
n_prep_steps = step_names.index('feat_scaling')
columns_after_prep = Pipeline(best_pipeline.steps[:n_prep_steps]).transform(X_train).columns

# Keep only the columns retained by the model-based feature selection
selected_mask = best_pipeline['feat_selection'].get_support()
selected_features = columns_after_prep[selected_mask].to_list()


### Evaluate Model Performance

In [None]:
regression_performance(X_train, y_train, X_test, y_test,best_pipeline)
regression_evaluation_plots(X_train, y_train, X_test, y_test, best_pipeline)

**The R2 score on the test set is 0.64. It does not pass the R2 performance requirement of 0.75.**

# Refit the pipeline with best features

**We chose to fit RandomForestRegressor**

In [None]:
# Select the chosen estimator by name. A fixed row position in the
# score-sorted summary points at a different estimator whenever the scores
# (and therefore the ordering) change between runs.
best_model = "RandomForestRegressor"
best_model

In [None]:
grid_search_pipelines[best_model].best_params_


In [None]:
best_pipeline = grid_search_pipelines[best_model].best_estimator_
best_pipeline

## Assess Feature Importance

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.pipeline import Pipeline

sns.set_style('whitegrid')

# The 'feat_selection' mask describes the columns that actually reach that
# step, i.e. those left after the steps preceding 'feat_scaling' (the
# correlation filter may drop columns) - not X_train.columns. Pairing the mask
# with X_train.columns can silently mislabel features, so recover the real
# names by re-applying only those earlier, already fitted steps.
# NOTE(review): relies on those steps returning a DataFrame - confirm.
step_names = [name for name, _ in best_pipeline.steps]
n_prep_steps = step_names.index('feat_scaling')
columns_after_prep = Pipeline(best_pipeline.steps[:n_prep_steps]).transform(X_train).columns

# Keep only the columns retained by the model-based feature selection
selected_mask = best_pipeline['feat_selection'].get_support()
selected_features = columns_after_prep[selected_mask].to_list()

# Extract final selected features (used below to shrink the train/test sets)
best_features = selected_features

# Create DataFrame with selected feature importance
df_feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': best_pipeline['model'].feature_importances_
}).sort_values(by='Importance', ascending=False)

# Print the most important features
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

# Plot feature importance
df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()


**The RandomForestRegressor pipeline is only trained on 3 features out of the 23 so we updated the pipeline, training set and test set so that the model is smaller and can be more efficient when predicting sale price**

In [60]:
def PipelineOptimization(dic, vars_with_missing_data, model):
    """Build the reduced pipeline: standard scaling followed by ``model``.

    Parameters
    ----------
    dic : dict
        Unused here; kept so the signature matches the full pipeline builder
        and existing callers keep working.
    vars_with_missing_data : list
        Unused here; kept for the same reason.
    model : estimator
        Final step of the pipeline.

    Returns
    -------
    Pipeline
        The unfitted two-step pipeline.
    """
    # Use the estimator the caller passed in (including its random_state)
    # rather than a hard-coded one, so the search runs on the model it was
    # asked to tune and its results are reproducible.
    return Pipeline([
        ("feat_scaling", StandardScaler()),
        ("model", model),
    ])

## Update the Train Set and Test Set

In [None]:
X_train = X_train.filter(best_features)
X_test = X_test.filter(best_features)

print("* Train set:", X_train.shape, y_train.shape, "\n* Test set:",  X_test.shape, y_test.shape)
X_train.head(3)

## Grid Search CV - SciKit Learn

In [62]:
models_search = {
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
}

params_search = {
    "RandomForestRegressor": {
        'model__n_estimators': [200, 300],
        'model__max_depth': [10, 20, None],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 5],
        'model__max_leaf_nodes': [None, 100]
    }
}


### Train the new models with GridSearchCV

In [None]:
search = HyperparameterOptimizationSearch(dic=dic, vars_with_missing_data=vars_with_missing_data, models=models_search, params=params_search)
search.fit(X_train, y_train, scoring = 'r2', n_jobs=-1, cv=5)

### Show the Results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

In [None]:
best_model = grid_search_summary.iloc[0,0]
best_model

In [None]:
best_pipeline = grid_search_pipelines[best_model].best_estimator_
best_pipeline


In [None]:
regression_performance(X_train, y_train, X_test, y_test, best_pipeline)
regression_evaluation_plots(X_train, y_train, X_test, y_test, best_pipeline)

### Refit the model with 2 strongly correlated features

Import everything we'll need

In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, r2_score

* **First lets get the 2 most strongly correlated features**

In [None]:
corr_matrix = df.corr()

# Get top 2 features correlated with SalePrice (excluding SalePrice itself)
top_features = corr_matrix["SalePrice"].drop("SalePrice").abs().sort_values(ascending=False).head(2).index.tolist()

print("Top 2 features correlated with SalePrice:", top_features)


* As expected, we can see that they are the Overall Quality (OverallQual) and the Ground Living Area (GrLivArea, in square feet).

### Split the new train and test data

In [None]:
# Use only the top 2 correlated features
X = df[top_features]  
y = df["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train.shape, X_test.shape)

#### **Train the Models**

In [None]:
# Define models in a dictionary
models = {
    "Linear Regression": LinearRegression(),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=0),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=0),
    "Extra Trees": ExtraTreesRegressor(n_estimators=100, random_state=0),
}

# Train, predict, and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"{name} Performance with Top 2 Features:\nMAE: {mae:.2f}, R²: {r2:.4f}\n")


* **As we can see there are only 2 models here that are over the 0.75 that the client asked for which are the 'GradientBoostingRegressor' and 'RandomForestRegressor'. The 'GradientBoostingRegressor' performance is much better as it has a lower MAE and higher R2 score meaning it can more accurately predict 'SalePrice'**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5]
}

# Initialize the model
gb_model = GradientBoostingRegressor(random_state=0)

# Perform Grid Search
grid_search = GridSearchCV(gb_model, param_grid, cv=5, n_jobs=-1, scoring='r2', verbose=2)
grid_search.fit(X_train, y_train)

best_gb_model = grid_search.best_estimator_

# Print best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the tuned model
y_pred = grid_search.best_estimator_.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Tuned Gradient Boosting Model Performance:\nMAE: {mae:.2f}, R²: {r2:.4f}")


* After tuning the GradientBoostingRegressor, we see that it still has a low MAE and a high enough R2 score, so we can now use it to predict house prices.

In [None]:
# Make predictions on the test set with best_gb_model
y_pred = best_gb_model.predict(X_test)

# Put actual and predicted values side by side for comparison
predictions_df = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred})
print(predictions_df.head(20))  # Show the first 20 predictions


### Save all the files necessary from the notebook

In [None]:
import joblib
import os

version = 'v1'
file_path = f'outputs/predict_price/{version}'

# exist_ok=True makes re-running the cell harmless when the folder is already
# there, while real failures (e.g. missing permissions) are raised instead of
# being printed and ignored until the first save fails.
os.makedirs(file_path, exist_ok=True)

#### Save Train Set:

In [None]:
X_train.head()


In [None]:
X_train.to_csv(f"{file_path}/X_train.csv", index=False)

In [None]:
y_train.head()

In [74]:
y_train.to_csv(f"{file_path}/y_train.csv", index=False)

#### Save Test Set:

In [None]:
X_test.head()

In [76]:
X_test.to_csv(f"{file_path}/X_test.csv", index=False)

In [None]:
y_test.head()

In [78]:
y_test.to_csv(f"{file_path}/y_test.csv", index=False)

#### Modeling Pipeline

In [None]:
best_gb_model

In [None]:
# Persist the tuned Gradient Boosting model shown above. It is the model
# fitted on the two-feature train set that is saved alongside it, whereas
# `best_pipeline` was fitted on the earlier, different feature set. The file
# name is kept unchanged so existing consumers of the artefact still find it.
joblib.dump(value=best_gb_model, filename=f"{file_path}/best_pipeline.pkl")