# Regression
## Objectives
* Fit and evaluate a regression model for predicting property sale prices using our training and testing datasets.
## Inputs
* outputs\datasets\cleaned\TestSetCleaned.csv
* outputs\datasets\cleaned\TrainSetCleaned.csv
## Outputs
* TrainSet and TestSet
* Data cleaning and feature engineer pipeline 
* Modeling pipeline
* Feature importance analysis
---

# Import Packages

* Import packages using the 'import' statement followed by the name of the package. For example, 'import pandas' which is commonly used for data manipulation and analysis. This is  followed by and alias of your choice, preferably as pd although it is arbitrary.

In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Step 1: Load Data
### Change working directory
We need to change the working directory from its current folder to its parent folder

* We access the current directory with os.getcwd()

In [2]:
import os
current_dir = os.getcwd()
current_dir

'C:\\Users\\issam\\Housing-market-analysis.1\\jupyter_notebooks'

We want to make the parent of the current directory the new current directory.

    * os.path.dirname() gets the parent directory
    * os.chir() defines the new current directory

In [3]:
os.chdir(os.path.dirname(current_dir))
# New current directory set

Confirm the new current directory

In [4]:
current_dir = os.getcwd()
current_dir

'C:\\Users\\issam\\Housing-market-analysis.1'

In [5]:
import numpy as np
import pandas as pd
df = (pd.read_csv("outputs\datasets\cleaned\TrainSetCleaned.csv")
)

print(df.shape)
df.head()

(1160, 22)


Unnamed: 0,1stFlrSF,2ndFlrSF,BedroomAbvGr,BsmtExposure,BsmtFinSF1,BsmtFinType1,BsmtUnfSF,GarageArea,GarageFinish,GarageYrBlt,...,LotArea,LotFrontage,MasVnrArea,OpenPorchSF,OverallCond,OverallQual,TotalBsmtSF,YearBuilt,YearRemodAdd,SalePrice
0,2158,0.0,4.0,Av,477,ALQ,725,576,Unf,1950.0,...,12615,84.0,0.0,29,7,6,1202,1950,2001,243000
1,1614,0.0,3.0,Av,20,GLQ,1594,865,RFn,2005.0,...,11210,86.0,240.0,59,5,7,1614,2005,2006,221500
2,810,672.0,2.873045,No,156,BLQ,516,400,Unf,1934.0,...,12155,70.243187,0.0,0,8,6,672,1925,1950,140000
3,894,0.0,3.0,No,492,BLQ,402,450,Non,1968.0,...,8724,109.0,0.0,0,5,5,894,1968,1968,129000
4,864,0.0,3.0,No,0,Unf,864,280,Non,1972.0,...,9353,71.0,0.0,0,5,4,864,1970,1970,116050


## Transformations 
* Create engineered variables and integrate transformations

In [6]:
from feature_engine import transformation as vt
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import OrdinalEncoder as FE_OrdinalEncoder
import scipy.stats as stats
import matplotlib.pyplot as plt
from IPython.display import Image, display
sns.set(style="whitegrid")
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler

# Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Load the cleaned dataset 
# The dataset is already cleaned and ready for transformation
train_data = pd.read_csv('outputs\datasets\cleaned\TrainSetCleaned.csv') 
# Assuming you have the train set in a file called train_data.csv
test_data = pd.read_csv('outputs\datasets\cleaned\TestSetCleaned.csv') 
# Assuming you have the test set in a file called test_data.csv
# Separate the features and target variable in the training set
X_train = train_data.drop('SalePrice', axis=1) 
# Assuming 'SalePrice' is the target variable
y_train = train_data['SalePrice'] 
# Separate the features and target variable in the test set
X_test = test_data.drop('SalePrice', axis=1) 
# Assuming 'SalePrice' is the target variable
y_test = test_data['SalePrice'] 
# Define preprocessing steps for numerical and categorical variables
numeric_features = X_train.columns[X_train.dtypes!='object'].to_list()
transformed_num_features = ['GarageArea', 'GrLivArea']
no_transform_num_features = [item for item in numeric_features if item not in transformed_num_features]
# Replace with actual numerical feature names
# Use FunctionTransformer to apply different transformations to different features
numeric_transformer = Pipeline(steps=[
    ('power_tranform',
ColumnTransformer(
    transformers=[
        ('log_GarageArea', FunctionTransformer(np.log1p, validate=True), ['GarageArea']),
        ('log_GrLivArea', FunctionTransformer(np.log1p, validate=True), ['GrLivArea']), 
        ('no_transform', 'passthrough', no_transform_num_features) 
    ]))])
# For the categorical features, use OrdinalEncoder with encoding based on the mean of SalePrice
categorical_features = X_train.select_dtypes(include=['object']).columns.to_list()
# ['KitchenQual'] 
arbitrary_encoder_features = [feature for feature in categorical_features if feature != 'KitchenQual']
print(arbitrary_encoder_features)
# Create a new feature based on the mean SalePrice per category for KitchenQual
mean_sale_price_per_kitchen_qual = train_data.groupby('KitchenQual')['SalePrice'].mean().sort_values().index.to_list()


# Define a custom encoder using the mean of SalePrice for encoding KitchenQual
encoder_kitchen_qual = FE_OrdinalEncoder(encoding_method='ordered', variables=['KitchenQual'])
# Define an arbitrary encoder for other categorical features
encoder_others = FE_OrdinalEncoder(encoding_method='arbitrary', variables=arbitrary_encoder_features)

categorical_transformer = 
Pipeline(steps=[
    ('ordinal_encoding_kitchen_qual', encoder_kitchen_qual),
    ('ordinal_encoding_others', encoder_others) 
    # Replace with actual categorical feature names except KitchenQual
])

# Combine preprocessing steps for numerical and categorical variables using ColumnTransformer
preprocessor = ColumnTransformer( 
    transformers=[
        ('num', numeric_transformer, numeric_features), 
        ('cat', categorical_transformer, categorical_features) 
    ]
)
# Create the full pipeline including preprocessing, PCA, and model training 

pipeline = Pipeline(steps=[ 
    ('preprocessor', preprocessor),# Apply preprocessing steps 
    ('pca', PCA(n_components=10)),# Apply PCA for dimensionality reduction 
    ('classifier', LogisticRegression()) # Train logistic regression model 
]) 

# Fit the pipeline on training data and make predictions 
pipeline.fit(X_train, y_train) 
# Fit the pipeline on training data 
# and make predictions using the fitted pipeline
y_pred = pipeline.predict(X_test) 
# Evaluate the model
print(f"Accuracy: {pipeline.score(X_test, y_test)}")

SyntaxError: invalid syntax (4127359540.py, line 62)

## Step 2: ML Pipeline with all data
### ML pipeline for Data Cleaning and Feature Engineering

In [None]:

# 1 - create two encoders for categorical variables
# Encoder for KitchenQual
kitchen_qual_encoder = OrdinalEncoder(dict(KitchenQual=['Po','Fa','TA','Gd','Ex']), encoding_method='arbitrary',
variables = ['KitchenQual'])
categorical_variables.remove('KitchenQual')

# Create encoder for other categorical variables
encoder = OrdinalEncoder(encoding_method='arbitrary', variables = categorical_variables)

# 2 - fit_transform into TrainSet
print(TrainSet.head(10))
TrainSet['KitchenQualencoded'] = kitchen_qual_encoder.fit_transform(TrainSet['KitchenQual'])
TrainSet = encoder.fit_transform(TrainSet)
# print(TrainSet['categorical_variables'].head(10))
print(TrainSet.head(10))

# 3 - transform into TestSet 
TestSet['KitchenQual'] = kitchen_qual_encoder.transform(TestSet['KitchenQual'])

# 1. Create a transformer
scaler = StandardScaler()

# 2. Fit-Transform into TrainSet
TrainSet[numerical_variables] = scaler.fit_transform(TrainSet[numerical_variables])

# 3. Transform into TestSet
TestSet[numerical_variables] = scaler.transform(TestSet[numerical_variables])

df_numerical_variables = TrainSet.copy()
df_numerical_variables.head()

from feature_engine.selection import SmartCorrelatedSelection
corr_sel = SmartCorrelatedSelection(variables=None, method="spearman", threshold=0.6, selection_method="variance")

corr_sel.fit_transform(df_numerical_variables)
corr_sel.correlated_feature_sets_

corr_sel.features_to_drop_

#############################################################################################

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

from sklearn.feature_selection import SelectFromModel
from feature_engine.selection import SmartCorrelatedSelection


def PipelineDataCleaningAndFeatureEngineering():
    # Converts Objects into Ints
    df['BsmtExposure'] = df['BsmtExposure'].astype('category').cat.codes
    df['BsmtFinType1'] = df['BsmtFinType1'].astype('category').cat.codes
    df['GarageFinish'] = df['GarageFinish'].astype('category').cat.codes
    df['KitchenQual'] = df['KitchenQual'].astype('category').cat.codes

    
    # Combine preprocessing with feature selection
    pipeline_base = Pipeline([
        ("feature_selection", SelectFromModel(RandomForestRegressor())),
        ("SmartCorrelatedSelection", SmartCorrelatedSelection(variables=None,
                                                              method="pearson", threshold=0.9, selection_method="variance")),
    ])

    return pipeline_base

PipelineDataCleaningAndFeatureEngineering()

In [None]:
# All categorical features have been converted into numerical features
df.info()

### ML Pipeline for Modelling and Hyperparameter Optimisation

In [None]:
# Feat Scaling
from sklearn.preprocessing import StandardScaler

# Feat Selection
from sklearn.feature_selection import SelectFromModel

# ML algorithms
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
# from xgboost import XGBRegressor


def PipelineReg(model):
    pipeline_base = Pipeline([
        ("scaler", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ])

    return pipeline_base

### Custom Class for Hyperparameter Optimisation

In [None]:
from sklearn.model_selection import GridSearchCV


class HyperparameterOptimizationSearch:

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")

            model = PipelineReg(self.models[key])
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, )
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k in self.grid_searches:
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]
                scores.append(r.reshape(len(params), 1))

            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        columns = ['estimator', 'min_score',
                   'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        return df[columns], self.grid_searches

### Split Train and Test Set

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['SalePrice'], axis=1),
    df['SalePrice'],
    test_size=0.2,
    random_state=101,
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
pipeline_data_cleaning_feat_eng = PipelineDataCleaningAndFeatureEngineering()
X_train_original = X_train
print(X_train_original.shape)
X_train = pipeline_data_cleaning_feat_eng.fit_transform(X_train, y_train) 
X_test = pipeline_data_cleaning_feat_eng.transform(X_test)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

### Handle Target Imbalance 

Check Train Set Target distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set a random seed
np.random.seed(101)

sns.set_style("whitegrid")

#subsampled_counts = y_train.value_counts().sample(n=20)  # Adjust the sample size as needed
subsampled_counts = y_train.value_counts().sample(n=min(20, y_train.nunique()))
plt.figure(figsize=(10, 6))  # Adjust figure size as needed
ax = subsampled_counts.plot(kind='bar', title='Train Set Target Distribution')
ax.tick_params(axis='x', rotation=45)# Rotate x-axis labels
print(len(y_train))
plt.show()


Use algorithms that handle class imbalance

In [None]:
from imblearn.under_sampling import NearMiss

# Define the NearMiss undersampler
undersampler = NearMiss(version=1, n_neighbors=1)

# Apply NearMiss undersampling to the dataset
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)


Check Train Set Target distribution after resampling

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Set a random seed
np.random.seed(102)

sns.set_style("whitegrid")

#subsampled_counts = y_train.value_counts().sample(n=20)  # Adjust the sample size as needed
subsampled_counts = y_train.value_counts().sample(n=min(20, y_train.nunique()))
plt.figure(figsize=(10, 6))  # Adjust figure size as needed
ax = subsampled_counts.plot(kind='bar', title='Train Set Target Distribution')
ax.tick_params(axis='x', rotation=45)# Rotate x-axis labels
plt.show()

### Grid Search CV - Sklearn
#### Use standard hyperparameters to find most suitable algorithm

In [None]:
models_search = {
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=101),
    "RandomForestRegressor": RandomForestRegressor(random_state=101),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=101),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=101),# algorithm='SAMME'
}

params_search = {
    "DecisionTreeRegressor": {},
    "RandomForestRegressor": {},
    "ExtraTreesRegressor": {},
    "AdaBoostRegressor": {},
}


In [None]:
print(y_train.unique())


Quick GridSearch CV - Binary Classifier

In [None]:
from sklearn.metrics import make_scorer, r2_score

search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train,
           scoring =  make_scorer(r2_score),
           n_jobs=-1, cv=3)

Check results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary 

#### Do an extensive search on the most suitable algorithm to find the best hyperparameter configuration.
Define model and parameters, for Extensive Search

In [None]:
from sklearn.ensemble import RandomForestRegressor

models_search = {
    "RandomForestRegressor": RandomForestRegressor(random_state=101),
}

# Documentation to help on hyperparameter list:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

# We will not conduct an extensive search, since the focus
# is on how to combine all knowledge in an applied project.
# In a workplace project, you may spend more time in this step
params_search = {
    "RandomForestRegressor": {
        'model__n_estimators': [100,150,200],
        'model__max_depth': [None, 10, 20,],
    }
}

Extensive GridSearch CV - Binary Classifier

In [None]:
from sklearn.metrics import r2_score, make_scorer
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train, 
           scoring =  make_scorer(r2_score,),
           n_jobs=-1, cv=3)

Check results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary 

Get best model name programmatically

In [None]:
best_model = grid_search_summary.iloc[0,0]
best_model

Parameters for best model

In [None]:
best_parameters = grid_search_pipelines[best_model].best_params_
best_parameters

Define the best clf pipeline

In [None]:
pipeline_reg = grid_search_pipelines[best_model].best_estimator_
print(pipeline_reg['feat_selection'].get_support())
pipeline_reg

### Assess feature importance

In [None]:
import pandas as pd

# Convert X_train to a pandas DataFrame
X_train_df = pd.DataFrame(X_train)

# Now use the .tail() method
X_train_df.tail()


* With the current model, we can assess with .features_importances_

In [None]:
import pandas as pd

# Assuming X_train is your NumPy array ()
#X_train_df = pd.DataFrame(X_train_original)
print(X_train.shape)
print(X_train_df.shape)

# Now you can access the columns attribute 
columns = X_train_df.columns
print(pipeline_reg)
print(' next  ')
print(columns)

# Access feature importances from the feature selection step
feat_selector = pipeline_reg.named_steps['feat_selection'].estimator_
feat_selector_importances = feat_selector.feature_importances_

print("feat selector:",feat_selector)
print("Feature importances from SelectFromModel step:", feat_selector_importances)


# create DataFrame to display feature importance
df_feature_importance = (pd.DataFrame(data={
    # 'Feature': columns[pipeline_reg['feat_selection'].get_support()],
    'Feature': columns,
    # 'Importance': pipeline_reg['model'].feature_importances_
    'Importance':feat_selector_importances})
    .sort_values(by='Importance', ascending=False))
                                    

# re-assign best_features order
best_features = df_feature_importance['Feature'].to_list()

# Most important features statement and plot
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()

### Evaluate Pipeline on Train and Test Sets

In [None]:
from sklearn.metrics import mean_squared_error, r2_score


def reg_performance(X_train, y_train, X_test, y_test, pipeline):
    print("#### Train Set #### \n")
    y_train_pred = pipeline.predict(X_train)
    print("Mean Squared Error (Train):", mean_squared_error(y_train, y_train_pred))
    print("R2 Score (Train):", r2_score(y_train, y_train_pred))

    print("\n#### Test Set ####\n")
    y_test_pred = pipeline.predict(X_test)
    print("Mean Squared Error (Test):", mean_squared_error(y_test, y_test_pred))
    print("R2 Score (Test):", r2_score(y_test, y_test_pred))

    # Plot Actual vs. Predicted for training data
    plt.figure(figsize=(12, 6))
    
    plt.subplot(1, 2, 1)
    plt.scatter(y_train, y_train_pred, alpha=0.5)
    plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], '--', color='red')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title('Actual vs. Predicted (Training)')
    
    # Plot Actual vs. Predicted for test data
    plt.subplot(1, 2, 2)
    plt.scatter(y_test, y_test_pred, alpha=0.5)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--', color='red')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title('Actual vs. Predicted (Test)')
    
    plt.tight_layout()
    plt.show()

    # from sklearn.pipeline import Pipeline


---


In [None]:
import warnings

# Suppress undefined metric warnings
warnings.filterwarnings("ignore", category=UserWarning)


Evaluation: We cross check with metrics defined at ML business case


In [None]:
reg_performance(X_train=X_train, y_train=y_train,
                 X_test=X_test, y_test=y_test,
                 pipeline=pipeline_reg,
                )

print(y_train)

## Step 3: Refit pipeline with best features
### Refit ML Pipeline and Resampling
In theory, a pipeline fitted **using only the most important features** should give the same result as the one fitted with **all variables and feature selection**.

### Rewrite ML pipeline for Data Cleaning and Feature Engineering

In [None]:
best_features

New Pipeline for DataCleaning And FeatureEngineering

In [None]:
from feature_engine.encoding import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


def PipelineDataCleaningAndFeatureEngineering():
    pipeline_base = Pipeline([
    
        # Scale numerical features
        ("Scaler", StandardScaler()),  
        ("feature_selection", SelectFromModel(RandomForestRegressor())),
    ])

    return pipeline_base



### Rewrite ML Pipeline for Modelling
Function for Pipeline optmisation

In [None]:
# Pipeline Optimization: Model
def PipelineReg(model):
    pipeline_base = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model),
    ])

    return pipeline_base

### Split Train Test Set, considering only with best features

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into features (X) and target variable (y)
X = df.drop(['SalePrice'], axis=1)
y = df['SalePrice']

# Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split
X_train, X_test, y_train, y_test,  = train_test_split(
    X, y,
    test_size=0.2,
    random_state=101,
)

### Handle Target Imbalance

In [None]:
pipeline_data_cleaning_feat_eng = PipelineDataCleaningAndFeatureEngineering()

X_train = pipeline_data_cleaning_feat_eng.fit_transform(X_train, y_train)
X_test = pipeline_data_cleaning_feat_eng.transform(X_test)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Check Train Set Target distribution

In [None]:
import matplotlib.pyplot as plt

# Set figure size
plt.figure(figsize=(10, 6))

np.random.seed(101)

# Plot bar chart
counts = y_train.value_counts()
# Subsample the counts
subsampled_counts = counts.sample(n=min(20, len(counts)))  

subsampled_counts.plot(kind='bar', title='Train Set Target Distribution')

# Set title and labels
plt.title('Train Set Target Distribution')
plt.xlabel('Target Value')
plt.ylabel('Count')

# Rotate x-axis labels
plt.xticks(rotation=45)

# Adjust layout to prevent overlapping
plt.tight_layout()

# Show plot
plt.show()

Use NearMiss to balance Train Set target

In [None]:
from imblearn.under_sampling import NearMiss

# Define the NearMiss undersampler
undersampler = NearMiss(version=1, n_neighbors=1)

# Apply NearMiss undersampling to the dataset
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

Check Train Set Target distribution after NearMiss

In [None]:
import matplotlib.pyplot as plt

# Set figure size
plt.figure(figsize=(10, 6))

np.random.seed(101)

# Plot bar chart
counts = y_train.value_counts()
# Subsample the counts
subsampled_counts = counts.sample(n=min(20, len(counts)))  

subsampled_counts.plot(kind='bar', title='Train Set Target Distribution')

# Set title and labels
plt.title('Train Set Target Distribution')
plt.xlabel('Target Value')
plt.ylabel('Count')

# Rotate x-axis labels
plt.xticks(rotation=45)

# Adjust layout to prevent overlapping
plt.tight_layout()

# Show plot
plt.show()

### Grid Search CV: Sklearn
Using the most suitable model from the last section and its best hyperparameter configuration.

We are using the same model from the last GridCV search

In [None]:
models_search  

And the best parameters from the last GridCV search

In [None]:
best_parameters

You will need to type in manually since the hyperparameter values have to be a list. The previous dictionary is not in this format.

In [None]:
params_search = {
    'RandomForestRegressor': {
        'model__n_estimators': [100, 300, 500],
        'model__max_depth': [None, 10, 30],
    }
}

GridSearch CV

In [None]:
import pandas as pd 
from sklearn.metrics import make_scorer, r2_score

quick_search = HyperparameterOptimizationSearch(
    models=models_search, params=params_search)
quick_search.fit(X_train, y_train,
                 scoring=make_scorer(r2_score, ),
                 n_jobs=-1, cv=3)

print(X_train.shape)

Check results

In [None]:
grid_search_summary, grid_search_pipelines = quick_search.score_summary(sort_by='mean_score')
grid_search_summary 

Define the best clf pipeline

In [None]:
best_model = grid_search_summary.iloc[0, 0]
pipeline_reg = grid_search_pipelines[best_model].best_estimator_
pipeline_reg

### Assess feature importance

In [None]:
print("best_features:", best_features)
print("feature_importances:", pipeline_reg['model'].feature_importances_)
print("Length of best_features:", len(best_features))
print("Length of feature_importances:", len(pipeline_reg['model'].feature_importances_))

In [None]:
# create DataFrame to display feature importance
df_feature_importance = (pd.DataFrame(data={
    'Feature': columns.tolist(),
    'Importance': pipeline_reg['model'].feature_importances_})
    .sort_values(by='Importance', ascending=False)
)


# Most important features statement and plot
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()

### Evaluate Pipeline on Train and Test Sets
Evaluation: We cross-check with metrics defined in the ML business case.


In [None]:
reg_performance(X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test,
                pipeline=pipeline_reg,
                )

# Step 4: Push files to Repo
We will generate the following files

* Train set
* Test set
* Data cleaning and Feature Engineering pipeline
* Modeling pipeline
* features importance plot

In [None]:
import joblib
import os

version = 'v1'
file_path = f'outputs/ml_pipeline/predict_saleprice/{version}'

try:
    os.makedirs(name=file_path)
except Exception as e:
    print(e)

## Train Set

In [None]:
print(X_train.shape)
# X_train.head()

I'm not sure when or how X_train became a numpy array. but I used the pd.DataFrame() method to convert it back into a dataframe.

In [None]:
import pandas as pd

# Convert X_train to a DataFrame
X_train_df = pd.DataFrame(X_train)

# Save the DataFrame to a CSV file
X_train_df.to_csv(f"{file_path}/X_train.csv", index=False)

In [None]:
X_train_df.to_csv(f"{file_path}/X_train.csv", index=False)

In [None]:
y_train

In [None]:
y_train.to_csv(f"{file_path}/y_train.csv", index=False)

## Test Set
* note that the variables are transformed already in X_test

In [None]:
import pandas as pd

# Convert X_test to a DataFrame
X_test_df = pd.DataFrame(X_test)

# Now you can use DataFrame methods like head()
X_test_df.head()

In [None]:
print(X_test_df.shape)
X_test_df.head()

In [None]:
X_test_df.to_csv(f"{file_path}/X_test.csv", index=False)

In [None]:
y_test.head()

In [None]:
y_test.to_csv(f"{file_path}/y_test.csv", index=False)

## ML Pipelines: Data Cleaning and Feat Eng pipeline and Modelling Pipeline
We will save 2 pipelines:

* Both should be used in conjunction to predict Live Data.
* To predict on Train Set, Test Set we use only pipeline_reg, since the data is already processed.
Pipeline responsible for Data Cleaning and Feature Engineering.

In [None]:
pipeline_data_cleaning_feat_eng

In [None]:
joblib.dump(value=pipeline_data_cleaning_feat_eng ,
            filename=f"{file_path}/clf_pipeline_model.pkl")
joblib.dump(value=pipeline_reg ,
            filename=f"{file_path}/clf_pipeline_predict.pkl")



### Feature Importance plot

In [None]:
df_feature_importance.plot(kind='bar',x='Feature',y='Importance', figsize=(3, 6))
plt.show()

In [None]:
df_feature_importance.plot(kind='bar', x='Feature', y='Importance', figsize=(3,6))
plt.savefig(f'{file_path}/features_importance.png', bbox_inches='tight')