In [15]:
import joblib
import os
import csv
import logging
import datetime
import pandas as pd
import numpy as np
import optuna

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import LinearSVR
from sklearn.linear_model import SGDRegressor
import xgboost

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [3]:
def load_data(file):
    """
    Load and preprocess data from a CSV file.
    
    Parameters:
        file (str): URL or local path to the CSV file containing data.
    
    Returns:
        pd.DataFrame: Preprocessed DataFrame with one-hot encoded categorical variables and filtered invalid entries.
    """
    diamonds = pd.read_csv(file)
    diamonds = diamonds[(diamonds.x * diamonds.y * diamonds.z != 0) & (diamonds.price > 0)]

    return diamonds

In [25]:
def preprocess_data(diamonds, model_type='linear'):
    """
    Preprocesses data according to the model type specified. This function handles categorical encoding differently based on whether 
    the data is intended for linear models or tree-based models.

    For linear models, it applies one-hot encoding and drops the first category. For tree-based models, it converts categorical variables 
    to a pandas Categorical datatype and applies one-hot encoding without dropping the first category.

    Parameters:
        diamonds (pd.DataFrame): The DataFrame containing diamond data with columns that include 'cut', 'color', and 'clarity'.
        model_type (str, optional): The type of model the data is being prepared for. Expected values are 'linear' or 'tree'. Defaults to 'linear'.

    Returns:
        pd.DataFrame: The DataFrame with categorical variables encoded according to the model type. If 'linear' is chosen, dummy variables are created 
        for categorical features and the first dummy is dropped to prevent collinearity. If 'tree' is chosen, dummy variables are created for all 
        categories of each categorical feature.
    """
    if model_type == 'linear':
        return pd.get_dummies(diamonds, columns=['cut', 'color', 'clarity'], drop_first=True)
    elif model_type == 'tree':
        for col in ['cut', 'color', 'clarity']:
            diamonds[col] = pd.Categorical(diamonds[col])
        return pd.get_dummies(diamonds, columns=['cut', 'color', 'clarity'], drop_first=False)
    return diamonds

In [5]:
data_file = "https://raw.githubusercontent.com/xtreamsrl/xtream-ai-assignment-engineer/main/datasets/diamonds/diamonds.csv"
data = load_data(data_file) 

data_linear = preprocess_data(data.copy(), 'linear')
data_tree = preprocess_data(data.copy(), 'tree')

data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.1,Ideal,H,SI2,62.0,55.0,4733,6.61,6.65,4.11
1,1.29,Ideal,H,SI1,62.6,56.0,6424,6.96,6.93,4.35
2,1.2,Premium,I,SI1,61.1,58.0,5510,6.88,6.8,4.18
3,1.5,Ideal,F,SI1,60.9,56.0,8770,7.43,7.36,4.5
4,0.9,Very Good,F,VS2,61.7,57.0,4493,6.17,6.21,3.82


In [6]:
def split_data(data):
    """
    Split the data into training and testing sets.
    
    Parameters:
        data (pd.DataFrame): The DataFrame to split.
    
    Returns:
        tuple: Contains training and testing datasets (x_train, x_test, y_train, y_test).
    """
    x = data.drop(columns='price')
    y = data.price
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    
    return x_train, x_test, y_train, y_test

In [12]:
x_train_linear, x_test_linear, y_train_linear, y_test_linear = split_data(data_linear)
x_train_tree, x_test_tree, y_train_tree, y_test_tree = split_data(data_tree)

In [8]:
def train_model(model, x_train, y_train):
    """
    Train a machine learning model.
    
    Parameters:
        model: The machine learning model to be trained.
        x_train (pd.DataFrame): Training data features.
        y_train (pd.Series): Training data labels.
    
    Returns:
        model: Trained model.
    """
    try:
        model.fit(x_train, y_train)
        logging.info(f"Training completed successfully for {type(model).__name__}")

        return model
    
    except Exception as e:
        logging.error(f"Error training model {type(model).__name__}: {str(e)}", exc_info=True)
        
        return None


In [9]:
def evaluate_model(model, x_test, y_test):
    """
    Evaluate a machine learning model using R2 and MAE.
    
    Parameters:
        model: The trained machine learning model to evaluate.
        x_test (pd.DataFrame): Testing data features.
        y_test (pd.Series): Testing data labels.
    
    Returns:
        dict: Dictionary containing evaluation metrics.
    """
    try:
        predictions = model.predict(x_test)
        r2 = r2_score(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        logging.info(f"Evaluation metrics calculated successfully for {type(model).__name__}")

        return {"R2 Score": r2, "MAE": mae}
    
    except Exception as e:
        logging.error(f"Error evaluating model {type(model).__name__}: {str(e)}", exc_info=True)

        return None

In [10]:
def save_model(model, performance_metrics, model_name):
    """
    Save the model and its performance metrics with a timestamp in a txt file.
    """
    timestamp = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
    model_directory = f"models/{model_name}/{timestamp}"

    os.makedirs(model_directory, exist_ok=True)
    joblib.dump(model, f"{model_directory}/model.joblib")

    with open(f"{model_directory}/metrics.txt", 'w') as f:
        f.write(str(performance_metrics))

In [11]:
def log_metrics(metrics, model_name, filename="model_performance.csv"):
    """
    Log model performance metrics to a CSV file.
    
    Parameters:
        metrics (dict): Dictionary containing the performance metrics.
        model_name (str): Name of the model.
        filename (str): Name of the CSV file to log the metrics.
    """
    file_exists = os.path.isfile(filename)
    with open(filename, 'a', newline='') as csvfile:
        fieldnames = ['model_name', 'timestamp', 'R2_score', 'MAE']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        if not file_exists:
            writer.writeheader()

        metrics_row = {
            'model_name': model_name,
            'timestamp': datetime.datetime.now().strftime("%d-%m-%Y %H:%M:%S"),
            'R2_score': metrics['R2 Score'],
            'MAE': metrics['MAE']
        }

        writer.writerow(metrics_row)

In [22]:
def optimize_hyperparameters(x_train, y_train, n_trials=100):
    """
    Perform hyperparameter optimization using Optuna.

    Parameters:
        x_train, y_train (DataFrame, Series): Training data.
        n_trials (int): Number of optimization trials.

    Returns:
        dict: Best model parameters found.
    """
    def objective(trial):
        param = {
            'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
            'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
            'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.7]),
            'subsample': trial.suggest_categorical('subsample', [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.2),
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'random_state': 42,
            'objective': 'reg:squarederror'
        }
        model = xgboost.XGBRegressor(**param)
        model.fit(x_train, y_train, eval_set=[(x_train, y_train)], verbose=False)
        preds = model.predict(x_train)
        mae = mean_absolute_error(y_train, preds)
        
        return mae

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    return study.best_params

In [17]:
models = {
    "LinearRegression": {"model": LinearRegression(), "type": "linear", "optimize": False},
    "Ridge": {"model": Ridge(), "type": "linear", "optimize": False},
    "Lasso": {"model": Lasso(), "type": "linear", "optimize": False},
    "ElasticNet": {"model": ElasticNet(), "type": "linear", "optimize": False},
    "LinearSVR": {"model": LinearSVR(), "type": "linear", "optimize": False},
    "SGDRegressor": {"model": SGDRegressor(), "type": "linear", "optimize": False},
    "XGBoost": {"model": xgboost.XGBRegressor(enable_categorical=True, random_state=42), "type": "tree", "optimize": True}
}

In [23]:
for model_name, config in models.items():
    model = config["model"]
    model_type = config["type"]
    should_optimize = config["optimize"]

    if model_type == "linear":
        x_train, x_test, y_train, y_test = x_train_linear, x_test_linear, y_train_linear, y_test_linear
    elif model_type == "tree":
        x_train, x_test, y_train, y_test = x_train_tree, x_test_tree, y_train_tree, y_test_tree

    logging.info(f"Starting training for {model_name}")
    
    if should_optimize:
        logging.info(f"Optimizing hyperparameters for {model_name}")
        best_params = optimize_hyperparameters(x_train, y_train)
        model.set_params(**best_params)

    trained_model = train_model(model, x_train, y_train)
    
    if trained_model:
        metrics = evaluate_model(trained_model, x_test, y_test)
        if metrics:
            save_model(trained_model, metrics, model_name)
            log_metrics(metrics, model_name)
            logging.info(f"{model_name} - R2 Score: {metrics['R2 Score']}, MAE: {metrics['MAE']}")
        else:
            logging.error(f"Failed to evaluate {model_name}", exc_info=True)
    else:
        logging.error(f"Failed to train {model_name}", exc_info=True)

2024-07-03 00:45:16,099 - INFO - Starting training for LinearRegression
2024-07-03 00:45:16,106 - INFO - Training completed successfully for LinearRegression
2024-07-03 00:45:16,108 - INFO - Evaluation metrics calculated successfully for LinearRegression
2024-07-03 00:45:16,113 - INFO - LinearRegression - R2 Score: 0.9090798105116309, MAE: 756.7317987073
2024-07-03 00:45:16,114 - INFO - Starting training for Ridge
2024-07-03 00:45:16,118 - INFO - Training completed successfully for Ridge
2024-07-03 00:45:16,119 - INFO - Evaluation metrics calculated successfully for Ridge
2024-07-03 00:45:16,120 - INFO - Ridge - R2 Score: 0.9089438362670546, MAE: 756.1527571184614
2024-07-03 00:45:16,120 - INFO - Starting training for Lasso
  model = cd_fast.enet_coordinate_descent(
2024-07-03 00:45:16,195 - INFO - Training completed successfully for Lasso
2024-07-03 00:45:16,197 - INFO - Evaluation metrics calculated successfully for Lasso
2024-07-03 00:45:16,199 - INFO - Lasso - R2 Score: 0.909117873

---

In [50]:
help(load_data)

Help on function load_data in module __main__:

load_data(file)
    Load and preprocess data from a CSV file.

    Parameters:
        file (str): URL or local path to the CSV file containing data.

    Returns:
        pd.DataFrame: Preprocessed DataFrame with one-hot encoded categorical variables and filtered invalid entries.



In [52]:
help(split_data)

Help on function split_data in module __main__:

split_data(data)
    Split the data into training and testing sets.

    Parameters:
        data (pd.DataFrame): The DataFrame to split.

    Returns:
        tuple: Contains training and testing datasets (x_train, x_test, y_train, y_test).



In [54]:
help(train_model)

Help on function train_model in module __main__:

train_model(model, x_train, y_train)
    Train a machine learning model.

    Parameters:
        model (estimator): The machine learning model to be trained.
        x_train (pd.DataFrame): Training data features.
        y_train (pd.Series): Training data labels.

    Returns:
        model: Trained model.



In [56]:
help(evaluate_model)

Help on function evaluate_model in module __main__:

evaluate_model(model, x_test, y_test)
    Evaluate a machine learning model using R2 and MAE.

    Parameters:
        model: The trained machine learning model to evaluate.
        x_test (pd.DataFrame): Testing data features.
        y_test (pd.Series): Testing data labels.

    Returns:
        dict: Dictionary containing evaluation metrics.



In [26]:
help(preprocess_data)

Help on function preprocess_data in module __main__:

preprocess_data(diamonds, model_type='linear')
    Preprocesses data according to the model type specified. This function handles categorical encoding differently based on whether
    the data is intended for linear models or tree-based models.

    For linear models, it applies one-hot encoding and drops the first category. For tree-based models, it converts categorical variables
    to a pandas Categorical datatype and applies one-hot encoding without dropping the first category.

    Parameters:
        diamonds (pd.DataFrame): The DataFrame containing diamond data with columns that include 'cut', 'color', and 'clarity'.
        model_type (str, optional): The type of model the data is being prepared for. Expected values are 'linear' or 'tree'. Defaults to 'linear'.

    Returns:
        pd.DataFrame: The DataFrame with categorical variables encoded according to the model type. If 'linear' is chosen, dummy variables are created
 

In [27]:
help(optimize_hyperparameters)

Help on function optimize_hyperparameters in module __main__:

optimize_hyperparameters(x_train, y_train, n_trials=100)
    Perform hyperparameter optimization using Optuna.

    Parameters:
        x_train, y_train (DataFrame, Series): Training data.
        n_trials (int): Number of optimization trials.

    Returns:
        dict: Best model parameters found.

