In [43]:
import joblib
import os
import logging
import datetime
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import LinearSVR
from sklearn.linear_model import SGDRegressor

In [44]:
# Configure root logging for the notebook: INFO level, timestamped records.
# basicConfig() does nothing when the root logger already has handlers, which
# is what happens when this cell is re-run or when the kernel pre-installs a
# handler. force=True removes existing root handlers first so this
# configuration is always the one in effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

In [2]:
def load_data(file):
    """
    Read the diamonds CSV and return a cleaned, one-hot encoded frame.

    Rows are kept only when the product of ``x``, ``y`` and ``z`` is
    non-zero and ``price`` is strictly positive. The ``cut``, ``color``
    and ``clarity`` columns are then replaced by dummy columns, with the
    first level of each dropped.

    Parameters
    ----------
    file : path, URL or buffer
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the three categorical columns encoded.
    """
    raw = pd.read_csv(file)

    # Build the two row filters separately so each condition is readable.
    nonzero_dimensions = raw.x * raw.y * raw.z != 0
    positive_price = raw.price > 0
    kept = raw[nonzero_dimensions & positive_price]

    return pd.get_dummies(kept, columns=['cut', 'color', 'clarity'], drop_first=True)

In [3]:
# Location of the raw diamonds CSV; load_data hands it to pd.read_csv.
data_file = (
    "https://raw.githubusercontent.com/xtreamsrl/"
    "xtream-ai-assignment-engineer/main/datasets/diamonds/diamonds.csv"
)
data = load_data(data_file)
# Last expression of the cell: show the first rows of the prepared frame.
data.head()

Unnamed: 0,carat,depth,table,price,x,y,z,cut_Good,cut_Ideal,cut_Premium,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,1.1,62.0,55.0,4733,6.61,6.65,4.11,False,True,False,...,True,False,False,False,False,True,False,False,False,False
1,1.29,62.6,56.0,6424,6.96,6.93,4.35,False,True,False,...,True,False,False,False,True,False,False,False,False,False
2,1.2,61.1,58.0,5510,6.88,6.8,4.18,False,False,True,...,False,True,False,False,True,False,False,False,False,False
3,1.5,60.9,56.0,8770,7.43,7.36,4.5,False,True,False,...,False,False,False,False,True,False,False,False,False,False
4,0.9,61.7,57.0,4493,6.17,6.21,3.82,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [4]:
def split_data(data, target='price', test_size=0.2, random_state=42):
    """
    Split a frame into training and testing features and target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and the ``target`` column.
    target : str, default 'price'
        Name of the column to predict; it is removed from the features.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``; a fixed value makes the
        partition repeatable.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    features = data.drop(columns=target)
    labels = data[target]

    x_train, x_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )

    return x_train, x_test, y_train, y_test

In [5]:
# Hold out part of the rows for evaluation. split_data defaults to a 20% test
# share with a fixed random_state, so the partition is the same on every run.
x_train, x_test, y_train, y_test = split_data(data)

In [45]:
def train_model(model, x_train, y_train):
    """
    Fit ``model`` on the training data.

    Parameters
    ----------
    model : object
        Any object exposing ``fit(x, y)``.
    x_train, y_train
        Training features and target, passed to ``model.fit`` unchanged.

    Returns
    -------
    object or None
        The same ``model`` instance after a successful fit, or ``None``
        if ``fit`` raised. Failures are logged, not re-raised, so that a
        caller iterating over several models can carry on with the rest.
    """
    model_name = type(model).__name__

    try:
        model.fit(x_train, y_train)
    except Exception as e:
        # logging.exception logs at ERROR level and attaches the traceback;
        # logging only str(e) hides where inside fit() the failure occurred.
        logging.exception("Error training model %s: %s", model_name, e)
        return None

    logging.info("Training completed successfully for %s", model_name)
    return model


In [46]:
def evaluate_model(model, x_test, y_test):
    """
    Score ``model`` on the test data using R2 and mean absolute error.

    Parameters
    ----------
    model : object
        Any object exposing ``predict(x)``.
    x_test, y_test
        Test features and the true target values.

    Returns
    -------
    dict or None
        ``{"R2 Score": ..., "MAE": ...}`` on success, or ``None`` if
        prediction or scoring raised. Failures are logged, not re-raised.
    """
    model_name = type(model).__name__

    try:
        predictions = model.predict(x_test)
        metrics = {
            "R2 Score": r2_score(y_test, predictions),
            "MAE": mean_absolute_error(y_test, predictions),
        }
    except Exception as e:
        # logging.exception logs at ERROR level and attaches the traceback,
        # which shows whether predict() or one of the metrics failed.
        logging.exception("Error evaluating model %s: %s", model_name, e)
        return None

    logging.info("Evaluation metrics calculated successfully for %s", model_name)
    return metrics

In [34]:
def save_model(model, performance_metrics, model_name):
    """
    Persist a model and its metrics under a timestamped directory.

    The directory is ``models/<model_name>/<dd-mm-YYYY-HH-MM-SS>``, created
    if missing. The model is written there as ``model.joblib`` and the
    string form of ``performance_metrics`` as ``metrics.txt``.
    """
    run_stamp = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
    target_dir = f"models/{model_name}/{run_stamp}"
    model_path = f"{target_dir}/model.joblib"
    metrics_path = f"{target_dir}/metrics.txt"

    os.makedirs(target_dir, exist_ok=True)
    joblib.dump(model, model_path)

    with open(metrics_path, 'w') as metrics_file:
        metrics_file.write(str(performance_metrics))

In [39]:
# Candidate linear regressors, keyed by the name used for logging and for the
# output directory of each saved model.
# LinearSVR and SGDRegressor use randomness during fitting; seeding them with
# the same value used for the train/test split makes their metrics repeatable
# across runs. The other estimators are left at their defaults.
# NOTE(review): these two estimators are also sensitive to feature scale;
# consider fitting them behind a scaler if their scores look poor.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "LinearSVR": LinearSVR(random_state=42),
    "SGDRegressor": SGDRegressor(random_state=42),
}

In [47]:
# Train, evaluate and persist every candidate model. A failure in one model
# is logged and skipped so the remaining models still run.
for model_name, model in models.items():
    logging.info(f"Starting training for {model_name}")

    trained_model = train_model(model, x_train, y_train)
    # train_model signals failure with None. Compare against that sentinel
    # explicitly: an object's truthiness can depend on __len__/__bool__, so
    # `if trained_model:` could misreport a successfully fitted model.
    if trained_model is None:
        logging.error(f"Failed to train {model_name}")
        continue

    metrics = evaluate_model(trained_model, x_test, y_test)
    if metrics is None:
        logging.error(f"Failed to evaluate {model_name}")
        continue

    save_model(trained_model, metrics, model_name)
    logging.info(f"{model_name} - R2 Score: {metrics['R2 Score']}, MAE: {metrics['MAE']}")

2024-07-02 21:29:07,524 - INFO - Starting training for LinearRegression
2024-07-02 21:29:07,529 - INFO - Training completed successfully for LinearRegression
2024-07-02 21:29:07,530 - INFO - Evaluation metrics calculated successfully for LinearRegression
2024-07-02 21:29:07,532 - INFO - LinearRegression - R2 Score: 0.9090798105116309, MAE: 756.7317987073
2024-07-02 21:29:07,532 - INFO - Starting training for Ridge
2024-07-02 21:29:07,534 - INFO - Training completed successfully for Ridge
2024-07-02 21:29:07,536 - INFO - Evaluation metrics calculated successfully for Ridge
2024-07-02 21:29:07,537 - INFO - Ridge - R2 Score: 0.9089438362670546, MAE: 756.1527571184614
2024-07-02 21:29:07,537 - INFO - Starting training for Lasso
2024-07-02 21:29:07,629 - INFO - Training completed successfully for Lasso
2024-07-02 21:29:07,631 - INFO - Evaluation metrics calculated successfully for Lasso
2024-07-02 21:29:07,633 - INFO - Lasso - R2 Score: 0.9091178731666789, MAE: 750.4417062951571
2024-07-02 