In [6]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os
import sys

## Load models

In [7]:
def load_object(file_path):
    """Deserialize and return the pickled object stored at ``file_path``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the pickle file to read.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only point this at files that
    come from a trusted source.
    """
    with open(file_path, "rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

# Directory the pickled artefacts are read from (relative to the notebook's
# working directory).
filepath = "./models/"

# Plain string literals (the previous f-prefixes had no placeholders) joined
# with os.path.join instead of string concatenation.
# NOTE(review): load_object unpickles these files; only load pickles that come
# from a trusted source.
RF_pre_trained_model = load_object(os.path.join(filepath, 'RandomForestRegressorModel.pkl'))
XGB_pre_trained_model = load_object(os.path.join(filepath, 'XGRegressorModel.pkl'))
preprocessor = load_object(os.path.join(filepath, 'preprocessor.pkl'))

In [8]:
def save_object(file_path, obj):
    """Pickle ``obj`` to ``file_path``, creating parent directories if needed.

    Parameters
    ----------
    file_path : str or path-like
        Destination file. It may be a bare file name, in which case the file
        is written to the current working directory.
    obj : object
        Any picklable object.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be opened.
    """
    dir_path = os.path.dirname(file_path)

    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)

    with open(file_path, "wb") as file_obj:
        pickle.dump(obj, file_obj)


## Load and process data

In [9]:
def feature_engineering(df):
    """Return a copy of ``df`` with derived ``volume`` and ``density`` columns.

    The input frame is left untouched; the previous implementation added and
    dropped columns on the caller's frame in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``depth``, ``table``, ``x``, ``y``, ``z`` and
        ``carat``.

    Returns
    -------
    pandas.DataFrame
        A new frame with every column of ``df`` plus:

        * ``volume``  = ``0.5 * z * x * y * (alpha + beta)``
        * ``density`` = ``carat / volume``

        where ``beta = depth / 100`` and
        ``alpha = (1 - beta) * (1 + (table / 100) ** 2)``. The auxiliary
        ``alpha`` / ``beta`` values are not stored on the result.
    """
    # Auxiliary factors, kept as local Series rather than temporary columns.
    beta = df['depth'] / 100
    alpha = (1 - beta) * (1 + (df['table'] / 100) ** 2)

    # Volume estimate from the three dimensions and the two factors.
    volume = 0.5 * df['z'] * df['x'] * df['y'] * (alpha + beta)

    # NOTE(review): rows with a zero volume give inf/NaN here rather than an
    # error; nothing in this function filters them.
    density = df['carat'] / volume

    # assign() builds a new frame, so the caller's frame is not modified.
    return df.assign(volume=volume, density=density)

In [10]:
def removing_outliers(df):
    """Return the rows of ``df`` that are not flagged by any outlier rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``carat``, ``price``, ``x``, ``y``, ``z``, ``table``,
        ``depth`` and ``density``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` for which every rule below is False.
    """
    # Several rules share the same "price below 100" test.
    low_price = df['price'] < 100

    # One boolean Series per rule; True marks a row to discard.
    rules = [
        (df['carat'] > 0) & low_price,
        (df['z'] > 2) & low_price,
        df['z'] < 2,
        (df['y'] > 3) & low_price,
        df['y'] < 2,
        (df['x'] > 2) & low_price,
        df['x'] < 2,
        df['table'] > 75,
        df['depth'] < 50,
        df['density'] < 0.008,
    ]

    # A row is dropped as soon as any single rule flags it.
    flagged = np.any(rules, axis=0)
    return df[~flagged]

In [11]:
def drop_redundant_features(df, redundant_features=['x', 'y', 'z', 'density']):
    """Return a new frame without the ``redundant_features`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to remove columns from; it is not modified.
    redundant_features : list of str, optional
        Column labels to remove. The default list is only read, never
        mutated.

    Returns
    -------
    pandas.DataFrame
        ``df`` minus the listed columns.
    """
    return df.drop(columns=redundant_features)

In [12]:
def preprocess_data(df,preprocessor,
                    numeric_features = ['volume', 'carat', 'depth', 'table'],
                    categorical_features = ['color', 'cut', 'clarity'],
                    target = 'price'):
    """Run the cleaning steps on ``df`` and transform its features.

    Steps, in order: ``feature_engineering`` -> ``removing_outliers`` ->
    ``drop_redundant_features`` -> split off ``target`` ->
    ``preprocessor.transform``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data, including the ``target`` column.
    preprocessor : object
        Anything exposing ``transform(X)``. Only ``transform`` is called, so
        it is presumably already fitted -- confirm with the caller.
    numeric_features, categorical_features : list of str, optional
        Accepted for interface compatibility; not read by this function.
    target : str, optional
        Name of the label column.

    Returns
    -------
    tuple
        ``(X_new_preprocessed, y_new)``: the output of
        ``preprocessor.transform`` on the feature columns, and the target
        column of the cleaned frame.
    """
    # Derived features -> outlier filter -> drop helper columns.
    cleaned = (
        df.pipe(feature_engineering)
          .pipe(removing_outliers)
          .pipe(drop_redundant_features)
    )

    # Separate the label from the features before transforming.
    y_new = cleaned[target]
    features = cleaned.drop(columns=target)

    return preprocessor.transform(features), y_new

In [15]:
# Relevant features
# NOTE(review): the two feature lists are forwarded to preprocess_data, whose
# body does not read them; the columns actually used are decided by the
# loaded `preprocessor`.
numeric_features = ['volume', 'carat', 'depth', 'table']
categorical_features = ['color', 'cut', 'clarity']
target = 'price'


# Load the new data
# NOTE: this rebinds `filepath`, which the model-loading cell set to
# "./models/"; from here on it points at the dataset directory.
filepath = '../datasets/diamonds/'
fresh_data = pd.read_csv(filepath + 'fresh_data.csv')

# Process data: derive features, filter outliers, drop helper columns, then
# apply the loaded preprocessor's transform.
X_new_preprocessed , y_new = preprocess_data(   df = fresh_data,
                                                preprocessor = preprocessor,
                                                numeric_features = numeric_features,
                                                categorical_features = categorical_features,
                                                target = target)


# Refit both loaded models on the new data.
# NOTE(review): if these are scikit-learn-style estimators (as the names
# suggest), .fit() normally discards what was learned before and trains from
# scratch on the data passed in; it does not continue from the pre-trained
# state unless something like warm_start / a continuation argument is used.
# Confirm whether a from-scratch refit is the intended "fine-tuning".
RF_pre_trained_model.fit(X_new_preprocessed, y_new)
XGB_pre_trained_model.fit(X_new_preprocessed, y_new)

# Save the updated model (currently disabled; the first commented line points
# `filepath` back at ./models/ before the two saves)
#filepath = "./models/"
#save_object(filepath + f'RandomForestRegressorModel.pkl', RF_pre_trained_model)
#save_object(filepath + f'XGRegressorModel.pkl', XGB_pre_trained_model)


In [16]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def _regression_metrics(y_true, y_pred):
    """Return ``(mae, mse, rmse, r2)`` for one set of predictions."""
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    # RMSE is taken as sqrt(MSE) rather than via
    # mean_squared_error(..., squared=False): that keyword is deprecated in
    # scikit-learn 1.4 and removed in 1.6, so the old call fails on current
    # releases.
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    return mae, mse, rmse, r2


def _print_metrics(title, mae, mse, rmse, r2):
    """Print the four metrics under a ``"<title> Metrics:"`` heading."""
    print(f"{title} Metrics:")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"R-squared (R2): {r2:.2f}")


# Predictions on X_new_preprocessed.
# NOTE(review): this is the same data both models were fitted on in the
# previous cell, so the numbers below are in-sample (training) metrics, not
# test-set metrics. Evaluate on a held-out split to measure generalisation.
rf_predictions = RF_pre_trained_model.predict(X_new_preprocessed)
xgb_predictions = XGB_pre_trained_model.predict(X_new_preprocessed)

# Evaluate Random Forest model
rf_mae, rf_mse, rf_rmse, rf_r2 = _regression_metrics(y_new, rf_predictions)
_print_metrics("Random Forest", rf_mae, rf_mse, rf_rmse, rf_r2)
print()

# Evaluate XGBoost model
xgb_mae, xgb_mse, xgb_rmse, xgb_r2 = _regression_metrics(y_new, xgb_predictions)
_print_metrics("XGBoost", xgb_mae, xgb_mse, xgb_rmse, xgb_r2)

Random Forest Metrics:
Mean Absolute Error (MAE): 1305.19
Mean Squared Error (MSE): 2607106.21
Root Mean Squared Error (RMSE): 1614.65
R-squared (R2): 0.90

XGBoost Metrics:
Mean Absolute Error (MAE): 0.08
Mean Squared Error (MSE): 0.03
Root Mean Squared Error (RMSE): 0.16
R-squared (R2): 1.00
