# Regression
## Objectives
* Fit and evaluate a regression model to predict the sale price of a house.
## Inputs
* outputs/datasets/collection/house-price-2021.csv
## Outputs
* Train set (features and target)
* Test set (features and target)
* Data cleaning and Feature Engineering pipeline
* Modeling pipeline
* Feature importance plot
---
## Change working directory
We need to change the working directory from its current folder to its parent folder

* We access the current directory with os.getcwd()

In [None]:
import os
current_dir = os.getcwd()
current_dir

We want to make the parent of the current directory the new current directory.

    * os.path.dirname() gets the parent directory
    * os.chdir() defines the new current directory

In [None]:
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
current_dir = os.getcwd()
current_dir

## Step 1: Load Data

In [None]:
import numpy as np
import pandas as pd
# Read the collected house-price dataset, then drop the WoodDeckSF and
# EnclosedPorch columns before any further processing.
df = pd.read_csv("outputs/datasets/collection/house-price-2021.csv")
df = df.drop(columns=['WoodDeckSF', 'EnclosedPorch'])

print(df.shape)
df.head()

## Step 2: ML Pipeline with all data
### ML pipeline for Data Cleaning and Feature Engineering

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

from sklearn.feature_selection import SelectFromModel
from feature_engine.selection import SmartCorrelatedSelection


def PipelineDataCleaningAndFeatureEngineering():
    """Encode the categorical columns of ``df`` and build the feature-selection pipeline.

    Side effect: the module-level ``df`` is modified in place. The columns
    ``BsmtExposure``, ``BsmtFinType1``, ``GarageFinish`` and ``KitchenQual``
    are replaced by their integer category codes. The encoding is skipped for
    a column that already holds integers, so calling this function more than
    once leaves previously encoded values untouched.

    Returns:
        An unfitted ``Pipeline`` with two steps: ``SelectFromModel`` wrapping a
        ``RandomForestRegressor`` (seeded for reproducible selection), followed
        by ``SmartCorrelatedSelection`` (Pearson, threshold 0.9, keeping the
        variable with the highest variance).
    """
    categorical_columns = ['BsmtExposure', 'BsmtFinType1',
                           'GarageFinish', 'KitchenQual']
    for column in categorical_columns:
        # ``cat.codes`` marks missing values as -1. Encoding an already
        # encoded column again would treat -1 as a real category and shift
        # every code, so only encode columns that are not integers yet.
        if not pd.api.types.is_integer_dtype(df[column]):
            df[column] = df[column].astype('category').cat.codes

    # Combine model-based feature selection with correlation-based selection
    pipeline_base = Pipeline([
        # Fixed seed, matching the random_state used elsewhere in this notebook,
        # so the selected features do not change between runs.
        ("feature_selection", SelectFromModel(
            RandomForestRegressor(random_state=101))),
        ("SmartCorrelatedSelection", SmartCorrelatedSelection(
            variables=None, method="pearson", threshold=0.9,
            selection_method="variance")),
    ])

    return pipeline_base

PipelineDataCleaningAndFeatureEngineering()

### ML Pipeline for Modelling and Hyperparameter Optimisation

In [None]:
# Feat Scaling
from sklearn.preprocessing import StandardScaler

# Feat Selection
from sklearn.feature_selection import SelectFromModel

# ML algorithms
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor


def PipelineReg(model):
    """Wrap ``model`` in a scaling -> feature selection -> estimator pipeline.

    Args:
        model: the regressor used both inside ``SelectFromModel`` and as the
            final ``"model"`` step.

    Returns:
        An unfitted ``Pipeline`` with the steps ``"scaler"``,
        ``"feat_selection"`` and ``"model"``.
    """
    steps = [
        ("scaler", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ]
    return Pipeline(steps)

### Custom Class for Hyperparameter Optimisation

In [None]:
from sklearn.model_selection import GridSearchCV


class HyperparameterOptimizationSearch:
    """Run one ``GridSearchCV`` per candidate model and tabulate the CV scores.

    Args:
        models: mapping of a display name to an estimator; each estimator is
            wrapped with ``PipelineReg`` before being searched.
        params: mapping of the same names to the parameter grid passed to
            ``GridSearchCV`` for that model.
    """

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """Fit a grid search for every model and store it in ``grid_searches``.

        Args:
            X: training features.
            y: training target.
            cv: cross-validation setting forwarded to ``GridSearchCV``.
            n_jobs: number of parallel jobs forwarded to ``GridSearchCV``.
            verbose: verbosity forwarded to ``GridSearchCV``.
            scoring: scoring forwarded to ``GridSearchCV``.
            refit: accepted for backward compatibility only. It is not
                forwarded, so ``GridSearchCV`` keeps its own default and the
                fitted searches expose ``best_estimator_``.

        Raises:
            KeyError: if ``params`` has no entry for one of the model names.
        """
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")

            model = PipelineReg(self.models[key])
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Summarise the per-split test scores of every fitted grid search.

        Args:
            sort_by: summary column used to sort the rows in descending order.

        Returns:
            A tuple ``(summary, grid_searches)``. ``summary`` is a DataFrame
            with one row per model / parameter combination, holding
            ``estimator``, ``min_score``, ``mean_score``, ``max_score`` and
            ``std_score`` followed by the parameter columns.
            ``grid_searches`` is the dict of fitted searches keyed by name.

        Raises:
            ValueError: if no grid search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError(
                "No grid search results available; call fit() first.")

        rows = []
        for name, search in self.grid_searches.items():
            results = search.cv_results_
            candidates = results['params']

            # Discover the splits from the result keys rather than from
            # ``search.cv``, which is only an int when an int was passed.
            split_scores = []
            split_index = 0
            while f"split{split_index}_test_score" in results:
                split_scores.append(results[f"split{split_index}_test_score"])
                split_index += 1

            # One row per parameter combination, one column per CV split.
            all_scores = np.column_stack(split_scores)
            for candidate, scores in zip(candidates, all_scores):
                rows.append(pd.Series({
                    **candidate,
                    'estimator': name,
                    'min_score': min(scores),
                    'max_score': max(scores),
                    'mean_score': np.mean(scores),
                    'std_score': np.std(scores),
                }))

        summary = pd.concat(rows, axis=1).T.sort_values(
            [sort_by], ascending=False)
        columns = ['estimator', 'min_score',
                   'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in summary.columns if c not in columns]
        return summary[columns], self.grid_searches

### Split Train and Test Set

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['SalePrice'], axis=1),
    df['SalePrice'],
    test_size=0.2,
    random_state=101,
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(X_train.shape)


In [None]:
pipeline_data_cleaning_feat_eng = PipelineDataCleaningAndFeatureEngineering()
X_train = pipeline_data_cleaning_feat_eng.fit_transform(X_train, y_train) 
X_test = pipeline_data_cleaning_feat_eng.transform(X_test)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

### Handle Target Imbalance 

Check Train Set Target distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set a random seed
np.random.seed(101)

sns.set_style("whitegrid")

#subsampled_counts = y_train.value_counts().sample(n=20)  # Adjust the sample size as needed
subsampled_counts = y_train.value_counts().sample(n=min(20, y_train.nunique()))
plt.figure(figsize=(10, 6))  # Adjust figure size as needed
ax = subsampled_counts.plot(kind='bar', title='Train Set Target Distribution')
ax.tick_params(axis='x', rotation=45)# Rotate x-axis labels
plt.show()


Use algorithms that handle class imbalance

In [None]:
from imblearn.under_sampling import NearMiss

# Define the NearMiss undersampler
# NOTE(review): the target here is SalePrice and the models searched below are
# regressors; the grid search cells fit on X_train / y_train, not on the
# resampled output of this cell — confirm whether resampling is intended.
undersampler = NearMiss(version=1, n_neighbors=1)

# Apply NearMiss undersampling to the dataset
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)


Check Train Set Target distribution after resampling

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Set a random seed so the sampled subset of target values is reproducible
np.random.seed(101)

sns.set_style("whitegrid")

# Plot the resampled target (y_resampled), not y_train, so this chart shows
# the distribution after resampling rather than repeating the earlier plot.
subsampled_counts = y_resampled.value_counts().sample(
    n=min(20, y_resampled.nunique()))
plt.figure(figsize=(10, 6))  # Adjust figure size as needed
ax = subsampled_counts.plot(kind='bar', title='Train Set Target Distribution')
ax.tick_params(axis='x', rotation=45)  # Rotate x-axis labels
plt.show()

### Grid Search CV - Sklearn
#### Use standard hyperparameters to find most suitable algorithm

In [None]:
models_search = {
    "LinearRegression": LinearRegression(),
    "XGBRegressor": XGBRegressor(random_state=101),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=101),
    "RandomForestRegressor": RandomForestRegressor(random_state=101),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=101),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=101),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=101),#, algorithm='SAMME'
}

params_search = {
    "LinearRegression": {},
    "XGBRegressor": {},
    "DecisionTreeRegressor": {},
    "RandomForestRegressor": {},
    "GradientBoostingRegressor": {},
    "ExtraTreesRegressor": {},
    "AdaBoostRegressor": {},
}


In [None]:
print(y_train.unique())


Quick GridSearch CV - Regressor

In [None]:
from sklearn.metrics import make_scorer, r2_score

search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train,#_mapped,
           scoring =  make_scorer(r2_score, ),
           n_jobs=-1, cv=3)

Check results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary 

#### Do an extensive search on the most suitable algorithm to find the best hyperparameter configuration.
Define model and parameters, for Extensive Search

In [None]:
models_search = {
    "XGBRegressor": XGBRegressor(random_state=101),
}

# Documentation to help on hyperparameter list:
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

# We will not conduct an extensive search, since the focus
# is on how to combine all knowledge in an applied project.
# In a workplace project, you may spend more time in this step
params_search = {
    "XGBRegressor": {
        'model__learning_rate': [0.1, 0.01, 0.001],
        'model__max_depth': [3, 5, 10],
    }
}


Extensive GridSearch CV - Regressor

In [None]:
from sklearn.metrics import r2_score, make_scorer
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train, #_mapped
           scoring =  make_scorer(r2_score,),
           n_jobs=-1, cv=3)

Check results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary 

Get best model name programmatically

In [None]:
best_model = grid_search_summary.iloc[0,0]
best_model

Parameters for best model

In [None]:
best_parameters = grid_search_pipelines[best_model].best_params_
best_parameters

Define the best regressor pipeline

In [None]:
pipeline_reg = grid_search_pipelines[best_model].best_estimator_
pipeline_reg

### Assess feature importance

In [None]:
import pandas as pd

# Convert X_train to a pandas DataFrame
X_train_df = pd.DataFrame(X_train)

# Now use the .tail() method
X_train_df.tail()


* With the current model, we can assess feature importance with `.feature_importances_`

In [None]:
import pandas as pd

# Assuming X_train is your NumPy array
X_train_df = pd.DataFrame(X_train)

# Now you can access the columns attribute
# NOTE(review): if X_train is a NumPy array at this point, these labels are
# positional integers (0..n-1) rather than the original feature names —
# confirm this is acceptable for the importance table and plot.
columns = X_train_df.columns

# create DataFrame to display feature importance
df_feature_importance = (pd.DataFrame(data={
    'Feature': columns[pipeline_reg['feat_selection'].get_support()],
    'Importance': pipeline_reg['model'].feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# re-assign best_features order
best_features = df_feature_importance['Feature'].to_list()

# Most important features statement and plot
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()

### Evaluate Pipeline on Train and Test Sets

In [None]:
from sklearn.metrics import mean_squared_error, r2_score


def reg_performance(X_train, y_train, X_test, y_test, pipeline):
    """Print the mean squared error and R2 score on the train and test sets.

    Args:
        X_train: features the pipeline predicts on for the train report.
        y_train: true target values for the train set.
        X_test: features the pipeline predicts on for the test report.
        y_test: true target values for the test set.
        pipeline: fitted estimator exposing ``predict``.

    Returns:
        None. The metrics are written to stdout.
    """
    print("#### Train Set #### \n")
    y_train_pred = pipeline.predict(X_train)
    print("Mean Squared Error (Train):", mean_squared_error(y_train, y_train_pred))
    print("R2 Score (Train):", r2_score(y_train, y_train_pred))

    print("\n#### Test Set ####\n")
    y_test_pred = pipeline.predict(X_test)
    print("Mean Squared Error (Test):", mean_squared_error(y_test, y_test_pred))
    print("R2 Score (Test):", r2_score(y_test, y_test_pred))


---


In [None]:
import warnings

# Suppress undefined metric warnings
warnings.filterwarnings("ignore", category=UserWarning)


Evaluation: We cross check with metrics defined at ML business case


In [None]:
reg_performance(X_train=X_train, y_train=y_train,
                 X_test=X_test, y_test=y_test,
                 pipeline=pipeline_reg,
                )

print(y_train)

## Step 3: Refit pipeline with best features
### Refit ML Pipeline and Resampling
In theory, a pipeline fitted **using only the most important features** should give the same result as the one fitted with **all variables and feature selection**.

### Rewrite ML pipeline for Data Cleaning and Feature Engineering

In [None]:
best_features

New Pipeline for DataCleaning And FeatureEngineering

In [None]:
from feature_engine.encoding import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


def PipelineDataCleaningAndFeatureEngineering():
    """Build the scaling + feature-selection pipeline used for the refit.

    Returns:
        An unfitted ``Pipeline`` with a ``StandardScaler`` step followed by
        ``SelectFromModel`` wrapping a ``RandomForestRegressor``.
    """
    pipeline_base = Pipeline([
        ("Scaler", StandardScaler()),  # Scale numerical features
        # Fixed seed, matching the random_state used elsewhere in this
        # notebook, so the selected features do not change between runs.
        ("feature_selection", SelectFromModel(
            RandomForestRegressor(random_state=101))),
    ])

    return pipeline_base



### Rewrite ML Pipeline for Modelling
Function for Pipeline optimisation

In [None]:
# Pipeline Optimization: Model
def PipelineReg(model):
    """Build the modelling pipeline: standard scaling followed by ``model``.

    Args:
        model: the regressor placed in the final ``"model"`` step.

    Returns:
        An unfitted ``Pipeline`` with the steps ``"scaler"`` and ``"model"``.
    """
    scaling_step = ("scaler", StandardScaler())
    model_step = ("model", model)
    return Pipeline([scaling_step, model_step])

In [None]:
df.dtypes

### Split Train Test Set, considering only the best features

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into features (X) and target variable (y)
X = df.drop(['SalePrice'], axis=1)
y = df['SalePrice']

# Split the data into training and testing sets.
# train_test_split returns (X_train, X_test, y_train, y_test) in that order;
# X_test must be rebound here so it matches the freshly split X_train.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=101,
)

### Handle Target Imbalance

In [None]:
pipeline_data_cleaning_feat_eng = PipelineDataCleaningAndFeatureEngineering()

X_train = pipeline_data_cleaning_feat_eng.fit_transform(X_train, y_train)
X_test = pipeline_data_cleaning_feat_eng.transform(X_test)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Check Train Set Target distribution

In [None]:
import matplotlib.pyplot as plt

# Set figure size
plt.figure(figsize=(10, 6))

np.random.seed(101)

# Plot bar chart
counts = y_train.value_counts()
subsampled_counts = counts.sample(n=min(20, len(counts)))  # Subsample the counts

subsampled_counts.plot(kind='bar', title='Train Set Target Distribution')

# Set title and labels
plt.title('Train Set Target Distribution')
plt.xlabel('Target Value')
plt.ylabel('Count')

# Rotate x-axis labels
plt.xticks(rotation=45)

# Adjust layout to prevent overlapping
plt.tight_layout()

# Show plot
plt.show()

Use NearMiss to balance Train Set target

In [None]:
from imblearn.under_sampling import NearMiss

# Define the NearMiss undersampler
# NOTE(review): the grid search cell below fits on X_train / y_train, not on
# the resampled output of this cell — confirm whether resampling is intended
# for this regression target.
undersampler = NearMiss(version=1, n_neighbors=1)

# Apply NearMiss undersampling to the dataset
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

Check Train Set Target distribution after NearMiss

In [None]:
import matplotlib.pyplot as plt

# Set figure size
plt.figure(figsize=(10, 6))

np.random.seed(101)

# Plot bar chart of the resampled target (y_resampled), so this chart shows
# the distribution after NearMiss rather than repeating the earlier plot
counts = y_resampled.value_counts()
subsampled_counts = counts.sample(n=min(20, len(counts)))  # Subsample the counts

subsampled_counts.plot(kind='bar', title='Train Set Target Distribution')

# Set title and labels
plt.title('Train Set Target Distribution')
plt.xlabel('Target Value')
plt.ylabel('Count')

# Rotate x-axis labels
plt.xticks(rotation=45)

# Adjust layout to prevent overlapping
plt.tight_layout()

# Show plot
plt.show()

### Grid Search CV: Sklearn
Using the most suitable model from the last section and its best hyperparameter configuration.

We are using the same model from the last GridCV search

In [None]:
models_search   # XGBRegressor

And the best parameters from the last GridCV search

In [None]:
best_parameters

You will need to type in manually since the hyperparameter values have to be a list. The previous dictionary is not in this format.

In [None]:
params_search = {
    'XGBRegressor': {
        'model__learning_rate': [0.01],   # the value should be in []
        'model__max_depth': [3],  # the value should be in []
    }
}


GridSearch CV

In [None]:
import pandas as pd 
from sklearn.metrics import make_scorer, r2_score

quick_search = HyperparameterOptimizationSearch(
    models=models_search, params=params_search)
quick_search.fit(X_train, y_train,
                 scoring=make_scorer(r2_score, ),
                 n_jobs=-1, cv=3)

print(X_train.shape)

Check results

In [None]:
grid_search_summary, grid_search_pipelines = quick_search.score_summary(sort_by='mean_score')
grid_search_summary 

Define the best regressor pipeline

In [None]:
best_model = grid_search_summary.iloc[0, 0]
pipeline_reg = grid_search_pipelines[best_model].best_estimator_
pipeline_reg

### Assess feature importance

In [None]:
print("best_features:", best_features)
print("feature_importances:", pipeline_reg['model'].feature_importances_)
print("Length of best_features:", len(best_features))
print("Length of feature_importances:", len(pipeline_reg['model'].feature_importances_))


In [None]:
# create DataFrame to display feature importance
df_feature_importance = (pd.DataFrame(data={
    'Feature': best_features,
    'Importance': pipeline_reg['model'].feature_importances_})
    .sort_values(by='Importance', ascending=False)
)


# Most important features statement and plot
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()

### Evaluate Pipeline on Train and Test Sets
Evaluation: We cross-check with metrics defined in the ML business case.


In [None]:
reg_performance(X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test,
                pipeline=pipeline_reg,
                )

# Step 4: Push files to Repo
We will generate the following files

* Train set
* Test set
* Data cleaning and Feature Engineering pipeline
* Modeling pipeline
* features importance plot

In [None]:
import joblib
import os

version = 'v1'
file_path = f'outputs/ml_pipeline/predict_saleprice/{version}'

# exist_ok=True makes re-running this cell a no-op when the folder already
# exists, while real failures (e.g. missing permissions) are still raised
# instead of being printed and ignored.
os.makedirs(name=file_path, exist_ok=True)

## Train Set

In [None]:
print(X_train.shape)
# X_train.head()

X_train became a NumPy array when it was passed through `pipeline_data_cleaning_feat_eng.fit_transform()` (scikit-learn transformers return arrays by default), so I used `pd.DataFrame()` to convert it back into a DataFrame.

In [None]:
import pandas as pd

# Convert X_train to a DataFrame
X_train_df = pd.DataFrame(X_train)

# Save the DataFrame to a CSV file
X_train_df.to_csv(f"{file_path}/X_train.csv", index=False)

In [None]:
X_train_df.to_csv(f"{file_path}/X_train.csv", index=False)

In [None]:
y_train

In [None]:
y_train.to_csv(f"{file_path}/y_train.csv", index=False)

## Test Set
* note that the variables are transformed already in X_test

In [None]:
import pandas as pd

# Convert X_test to a DataFrame
X_test_df = pd.DataFrame(X_test)

# Now you can use DataFrame methods like head()
X_test_df.head()

In [None]:
print(X_test_df.shape)
X_test_df.head()

In [None]:
X_test_df.to_csv(f"{file_path}/X_test.csv", index=False)

In [None]:
y_test

In [None]:
y_test.to_csv(f"{file_path}/y_test.csv", index=False)

## ML Pipelines: Data Cleaning and Feat Eng pipeline and Modelling Pipeline
We will save 2 pipelines:

* Both should be used in conjunction to predict Live Data.
* To predict on Train Set, Test Set we use only pipeline_reg, since the data is already processed.
Pipeline responsible for Data Cleaning and Feature Engineering.

In [None]:
pipeline_data_cleaning_feat_eng

In [None]:
# Persist both pipelines: the data cleaning / feature engineering pipeline and
# the modelling pipeline are used together to predict on live data.
joblib.dump(value=pipeline_data_cleaning_feat_eng,
            filename=f"{file_path}/clf_pipeline_data_cleaning_feat_eng.pkl")
joblib.dump(value=pipeline_reg,
            filename=f"{file_path}/clf_pipeline_model.pkl")

### Feature Importance plot

In [None]:
df_feature_importance.plot(kind='bar',x='Feature',y='Importance')
plt.show()

In [None]:
df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.savefig(f'{file_path}/features_importance.png', bbox_inches='tight')