In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import os
import sys

## Load models

In [36]:
def load_object(file_path):
    
    with open(file_path, "rb") as file_obj:
        return pickle.load(file_obj)

filepath = "./models/"

RF_pre_trained_model = load_object(filepath + f'RandomForestRegressorModel.pkl')
XGB_pre_trained_model = load_object(filepath + f'XGRegressorModel_v2.pkl')
preprocessor = load_object(filepath + f'preprocessor.pkl')

In [3]:
def save_object(file_path, obj):
    
    dir_path = os.path.dirname(file_path)

    os.makedirs(dir_path, exist_ok=True)

    with open(file_path, "wb") as file_obj:
        pickle.dump(obj, file_obj)


## Load and process data

In [4]:
def feature_engineering(df):

    # Calculate the beta and alpha values
    df['beta'] = df['depth'] / 100
    df['alpha'] = (1 - df['beta']) * (1 + (df['table'] / 100)**2)

    # Calculate the volume of the diamond
    df['volume'] = 0.5 * df['z'] * df['x'] * df['y'] * (df['alpha'] + df['beta'])

    # Calculate the density of the diamond
    df['density'] = df['carat'] / df['volume']

    # Drop the auxiliary columns
    df.drop(['beta', 'alpha'], axis=1, inplace=True)

    return df

In [5]:
def removing_outliers(df):
    
    # Define the conditions for removing outliers
    conditions = [
        (df['carat'] > 0) & (df['price'] < 100),
        (df['z'] > 2) & (df['price'] < 100),
        (df['z'] < 2),
        (df['y'] > 3) & (df['price'] < 100),
        (df['y'] < 2),
        (df['x'] > 2) & (df['price'] < 100),
        (df['x'] < 2),
        (df['table'] > 75),
        (df['depth'] < 50),
        (df['density'] < 0.008)

    ]

    # Create a mask for the rows to be removed
    mask = np.any(conditions, axis=0)

    # Drop the rows that meet the conditions
    df = df[~mask]
    return df

In [6]:
def drop_redundant_features(df,redundant_features = ['x', 'y', 'z', 'density']):
    
    df = df.drop(redundant_features, axis=1)
    
    return df  

In [7]:
def preprocess_data(df,preprocessor,
                    numeric_features = ['volume', 'carat', 'depth', 'table'],
                    categorical_features = ['color', 'cut', 'clarity'],
                    target = 'price'):

    # Adding Features
    df = feature_engineering(df)

    # Removing Outliers
    df = removing_outliers(df)

    # Drop redundant features
    df = drop_redundant_features(df) 

    # Preprocess the data
    X_new = df.drop(target, axis=1)
    y_new = df[target]
    X_new_preprocessed = preprocessor.transform(X_new)

    return X_new_preprocessed , y_new

In [32]:
numeric_features = ['volume', 'carat', 'depth', 'table']
categorical_features = ['color', 'cut', 'clarity']
target = 'price'


# Load the new data
filepath = '../datasets/diamonds/'
fresh_data = pd.read_csv(filepath + 'fresh_data.csv')

# Process data
X_new_preprocessed , y_new = preprocess_data(   df = fresh_data,
                                                preprocessor = preprocessor,
                                                numeric_features = numeric_features,
                                                categorical_features = categorical_features,
                                                target = target)



'\n#  Fine-tune the XGBoost model on the new data\ndnew = xgb.DMatrix(X_new_preprocessed, label=y_new)\n\n# Update the model by training on new data\nparams = XGB_pre_trained_model.get_xgb_params()\nupdate_model = xgb.train(  params, dnew,\n                                   num_boost_round=XGB_pre_trained_model.get_num_boosting_rounds(),\n                                     xgb_model=XGB_pre_trained_model)\n\n\n'

### Fine Tune the model on the fresh data

In this case, only the XGRegressor model is partially fit over the fresh data, as the RandomForest Regressor is not compatible with partial fitting.

NOTE: In case, it is really needed to incrementally train the RandomForest Regressor model on the new data, there exist the posibility of employing [alternative libraries]() which expand the posibilities of some regressor, allowing to train them incrementally.

In [37]:
import xgboost as xgb

# Fine-tune the XGBoost model on the new data
dnew = xgb.DMatrix(X_new_preprocessed, label=y_new)

# Update the model by training on new data
#params = XGB_pre_trained_model.get_xgb_params()
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'eta': 0.1,
}
# Set the number of boosting rounds to 0 to perform partial training
params['nrounds'] = 0

# Update the existing model
updated_model = xgb.train(  params, dnew, xgb_model=XGB_pre_trained_model, num_boost_round=0)

Parameters: { "nrounds" } are not used.



### Model Evaluation

In [38]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predictions on the test set
rf_predictions = RF_pre_trained_model.predict(X_new_preprocessed)
xgb_predictions = updated_model.predict(dnew)

# Evaluate Random Forest model
rf_mae = mean_absolute_error(y_new, rf_predictions)
rf_mse = mean_squared_error(y_new, rf_predictions)
rf_rmse = mean_squared_error(y_new, rf_predictions, squared=False)
rf_r2 = r2_score(y_new, rf_predictions)

print("Random Forest Metrics:")
print(f"Mean Absolute Error (MAE): {rf_mae:.2f}")
print(f"Mean Squared Error (MSE): {rf_mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rf_rmse:.2f}")
print(f"R-squared (R2): {rf_r2:.2f}")
print()

# Evaluate XGBoost model
xgb_mae = mean_absolute_error(y_new, xgb_predictions)
xgb_mse = mean_squared_error(y_new, xgb_predictions)
xgb_rmse = mean_squared_error(y_new, xgb_predictions, squared=False)
xgb_r2 = r2_score(y_new, xgb_predictions)

print("XGBoost Metrics:")
print(f"Mean Absolute Error (MAE): {xgb_mae:.2f}")
print(f"Mean Squared Error (MSE): {xgb_mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {xgb_rmse:.2f}")
print(f"R-squared (R2): {xgb_r2:.2f}")

Random Forest Metrics:
Mean Absolute Error (MAE): 674.95
Mean Squared Error (MSE): 2170689.76
Root Mean Squared Error (RMSE): 1473.33
R-squared (R2): 0.92

XGBoost Metrics:
Mean Absolute Error (MAE): 514.18
Mean Squared Error (MSE): 2214855.52
Root Mean Squared Error (RMSE): 1488.24
R-squared (R2): 0.92


### Save the updated model

In [35]:
# Save regressors
filepath = "./models/"
save_object(filepath + f'XGRegressorModel_v2.pkl', updated_model)

## Pipeline Base version

In [None]:
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
import os
import sys

# Define the model and preprocessor paths
model_path = "./models/"
preprocessor_path = "./models/preprocessor.pkl"

def train_pipeline():
    # Load the pretrained model and preprocessor
    XGB_pre_trained_model = load_object(model_path + "XGRegressorModel_v2.pkl")
    preprocessor = load_object(preprocessor_path)

    # Load fresh data
    fresh_data_path = "../datasets/diamonds/fresh_data.csv"
    fresh_data = pd.read_csv(fresh_data_path)

    # Process the fresh data
    X_new_preprocessed, y_new = preprocess_data(fresh_data, preprocessor)

    # Fine-tune the XGBoost model on the new data
    dnew = xgb.DMatrix(X_new_preprocessed, label=y_new)
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'eta': 0.1,
    }
    params['nrounds'] = 0
    updated_model = xgb.train(params, dnew, xgb_model=XGB_pre_trained_model, num_boost_round=0)

    # Overwrite the old model with the updated model
    save_object(model_path + f'XGRegressorModel_v2.pkl', updated_model)

    
    return None

## Additional aspect to consider

+ Log the pipeline execution to monitor performance and any potential errors.
+ Implement a validation step to evaluate the performance of the updated model.

#### Pipeline v2 

This version adds a log file and model evaluation.

In [39]:
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
import os
import sys
import logging
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Set up logging
logging.basicConfig(filename='train_pipeline.log', level=logging.INFO)

# Define the model and preprocessor paths
model_path = "./models/"
model_name = "XGRegressorModel_v2"
preprocessor_path = "./models/preprocessor.pkl"

fresh_data_path = "../datasets/diamonds/fresh_data.csv"

def train_pipeline( model_path = "./models/",
                    model_name = "XGRegressorModel_v2",
                    preprocessor_path = "./models/preprocessor.pkl",
                    fresh_data_path = "../datasets/diamonds/fresh_data.csv"):
    logging.info("Starting pipeline execution")

    # Load the pretrained model and preprocessor
    XGB_pre_trained_model = load_object(model_path + model_name + ".pkl")
    preprocessor = load_object(preprocessor_path)

    # Load fresh data
    fresh_data = pd.read_csv(fresh_data_path)

    # Process the fresh data
    X_new_preprocessed, y_new = preprocess_data(fresh_data, preprocessor)

    # Fine-tune the XGBoost model on the new data
    dnew = xgb.DMatrix(X_new_preprocessed, label=y_new)
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'eta': 0.1,
    }
    params['nrounds'] = 0
    updated_model = xgb.train(params, dnew, xgb_model=XGB_pre_trained_model, num_boost_round=0)

    # Evaluate the performance of the updated model
    logging.info("Evaluating model performance")
    xgb_predictions = updated_model.predict(dnew)
    xgb_rmse = mean_squared_error(y_new, xgb_predictions, squared=False)
    xgb_r2 = r2_score(y_new, xgb_predictions)

    logging.info("XGBoost Metrics:")
    logging.info(f"Root Mean Squared Error (RMSE): {xgb_rmse:.2f}")
    logging.info(f"R-squared (R2): {xgb_r2:.2f}")
    
    # Overwrite the old model with the updated model
    logging.info("Saving updated model")
    save_object(model_path + f'XGRegressorModel_v2.pkl', updated_model)

    logging.info("Pipeline execution complete")

    return None

In [40]:
train_pipeline()

Parameters: { "nrounds" } are not used.



#### Pipeline v3 

This version improves the format of the log file.

In [None]:
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
import os
import sys
import logging
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Set up the logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler(filename='pipeline.log', mode='a')
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

# Define the model and preprocessor paths
model_path = "./models/"
model_name = "XGRegressorModel_v2"
preprocessor_path = "./models/preprocessor.pkl"

fresh_data_path = "../datasets/diamonds/fresh_data.csv"

def train_pipeline( model_path = "./models/",
                    model_name = "XGRegressorModel_v2",
                    preprocessor_path = "./models/preprocessor.pkl",
                    fresh_data_path = "../datasets/diamonds/fresh_data.csv"):
    logger.info('Starting pipeline execution')

    # Load the pretrained model and preprocessor
    logger.info('Loading pretrained model and preprocessor')
    XGB_pre_trained_model = load_object(model_path + model_name + ".pkl")
    preprocessor = load_object(preprocessor_path)

    # Load fresh data
    fresh_data = pd.read_csv(fresh_data_path)

    # Process the fresh data
    logger.info('Preprocessing fresh data')
    X_new_preprocessed, y_new = preprocess_data(fresh_data, preprocessor)

    # Fine-tune the XGBoost model on the new data
    logger.info('Fine-tuning XGBoost model')
    dnew = xgb.DMatrix(X_new_preprocessed, label=y_new)
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'eta': 0.1,
    }
    params['nrounds'] = 0
    updated_model = xgb.train(params, dnew, xgb_model=XGB_pre_trained_model, num_boost_round=0)

    # Evaluate the performance of the updated model
    logger.info("Evaluating model performance")
    xgb_predictions = updated_model.predict(dnew)
    xgb_rmse = mean_squared_error(y_new, xgb_predictions, squared=False)
    xgb_r2 = r2_score(y_new, xgb_predictions)

    logger.info("XGBoost Metrics:")
    logger.info(f"Root Mean Squared Error (RMSE): {xgb_rmse:.2f}")
    logger.info(f"R-squared (R2): {xgb_r2:.2f}")
    
    # Overwrite the old model with the updated model
    logger.info('Overwriting old model with updated model')
    save_object(model_path + f'XGRegressorModel_v2.pkl', updated_model)

    logger.info('Pipeline execution completed')

    return None

In [None]:
train_pipeline()