In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

In [2]:
filepath = '../datasets/diamonds/diamonds.csv'

## Data Preprocessing

Based on the findings of the EDA stage, I have determined that the following features are the more relevant:
Numeric features:
+ Volume
+ Carat
+ depth and table (in second order)

Categorical features:
+ Color
+ Cut
+ Clarity


Then, I need to cover the following steps:

+ Drop redundant features (x,y,z, density)
+ Preprocess numeric features (volume, carat, depth and table)
+ Preprocess (encode) categorical features (color, cut and Clarity)

In [3]:
# Load the dataset
diamonds = pd.read_csv(filepath)

In [4]:

# Calculate the beta and alpha values
diamonds['beta'] = diamonds['depth'] / 100
diamonds['alpha'] = (1 - diamonds['beta']) * (1 + (diamonds['table'] / 100)**2)

# Calculate the volume of the diamond
diamonds['volume'] = 0.5 * diamonds['z'] * diamonds['x'] * diamonds['y'] * (diamonds['alpha'] + diamonds['beta'])

# Calculate the density of the diamond
diamonds['density'] = diamonds['carat'] / diamonds['volume']

# Drop the auxiliary columns
diamonds.drop(['beta', 'alpha'], axis=1, inplace=True)


In [5]:
# Define the conditions for removing outliers
conditions = [
    (diamonds['carat'] > 0) & (diamonds['price'] < 100),
    (diamonds['z'] > 2) & (diamonds['price'] < 100),
    (diamonds['z'] < 2),
    (diamonds['y'] > 3) & (diamonds['price'] < 100),
    (diamonds['y'] < 2),
    (diamonds['x'] > 2) & (diamonds['price'] < 100),
    (diamonds['x'] < 2),
    (diamonds['table'] > 75),
    (diamonds['depth'] < 50),
    (diamonds['density'] < 0.008)

]

# Create a mask for the rows to be removed
mask = np.any(conditions, axis=0)

# Drop the rows that meet the conditions
diamonds = diamonds[~mask]

# Save the cleaned dataset
#diamonds.to_csv('diamonds_cleaned.csv', index=False)

diamonds.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4985 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    4985 non-null   float64
 1   cut      4985 non-null   object 
 2   color    4985 non-null   object 
 3   clarity  4985 non-null   object 
 4   depth    4985 non-null   float64
 5   table    4985 non-null   float64
 6   price    4985 non-null   int64  
 7   x        4985 non-null   float64
 8   y        4985 non-null   float64
 9   z        4985 non-null   float64
 10  volume   4985 non-null   float64
 11  density  4985 non-null   float64
dtypes: float64(8), int64(1), object(3)
memory usage: 506.3+ KB


In [None]:

# Drop redundant features
redundant_features = ['x', 'y', 'z', 'density']
diamonds = diamonds.drop(redundant_features, axis=1)


### Preprocessor

In [7]:
# Extract relevant features
numeric_features = ['volume', 'carat', 'depth', 'table']
categorical_features = ['color', 'cut', 'clarity']
target = 'price'

# Preprocess numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocess categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


## Training the Models

The following are regression models to consider:

1. **Random Forest Regressor:** This model is an ensemble of decision trees and is robust to noisy data. It can handle non-linear relationships well.

2. **Gradient Boosting Regressor:** This is another ensemble method that builds trees sequentially, with each tree correcting the errors of the previous one. It generally performs well on a wide range of datasets.

3. **XGBoost Regressor:** An optimized implementation of gradient boosting that often yields high performance. It's efficient and can handle missing data well.

4. **LASSO Regression (L1 Regularization):** If there are redundant features or multicollinearity, LASSO regression can help by penalizing less important features and shrinking their coefficients to zero.

5. **Elastic Net Regression:** This model combines L1 and L2 regularization, providing a balance between feature selection (like LASSO) and handling correlated features (like Ridge).

At early stages, it's a good idea to try a few different models and compare their performance using cross-validation or a holdout validation set.

In [8]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    diamonds.drop(target, axis=1), diamonds[target], test_size=0.2, random_state=42)


In [9]:
# Preprocessing of training data, train model
preprocessor.fit(X_train, y_train)

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
import numpy as np

# Preprocess the test data using the preprocessor
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)
# Define models
models = {
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'XGBoost': XGBRegressor(),
    'LASSO Regression':  Lasso(),
    'Elastic Net Regression':  ElasticNet(),
    'Decision Tree': DecisionTreeRegressor()
}

# Evaluate models
for name, model in models.items():
    cv_scores = cross_val_score(model, X_train_processed, y_train, cv=5, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(-cv_scores)
    print(f"{name} RMSE: {np.mean(rmse_scores):.4f} (std: {np.std(rmse_scores):.4f})")
    model.fit(X_train_processed, y_train)

# Choose the best model based on cross-validation performance
best_model_name = min(models, key=lambda k: np.mean(np.sqrt(-cross_val_score(models[k], X_train_processed, y_train, cv=5, scoring='neg_mean_squared_error'))))
best_model = models[best_model_name]

# Train the best model on the full training set
best_model.fit(X_train_processed, y_train)

# Evaluate the best model on the test set
test_predictions = best_model.predict(X_test_processed)
test_rmse = np.sqrt(np.mean((test_predictions - y_test)**2))
print(f"\nBest Model ({best_model_name}) Test RMSE: {test_rmse:.4f}")


ModuleNotFoundError: No module named 'xgboost'

#### Feature Relevance Interpretation

In [None]:
# Considering the XGBoost Regressor model
# Get feature names from the preprocessor
feature_names = preprocessor.get_feature_names_out()

# Get feature importances from the XGBoost model
feature_importances = best_model.feature_importances_

# Create a dictionary of feature names and importance scores
feature_importance_dict = dict(zip(feature_names, feature_importances))

# Sort the dictionary by importance scores in descending order
sorted_feature_importance_dict = dict(sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True))

# Print the feature names and their importance scores
for feature, importance in sorted_feature_importance_dict.items():
    print(f"{feature}: {importance}")

In [None]:
# Considering the Decision Tree model
model = models['Decision Tree']
# Get feature importances from the Decision Tree model
feature_importances = model.feature_importances_

# Create a dictionary to store feature names and importances
feature_dict = {feature: importance for feature, importance in zip(feature_names, feature_importances)}

# Sort the dictionary in descending order of importance
sorted_features = sorted(feature_dict.items(), key=lambda x: x[1], reverse=True)

# Print the sorted features and their importances
for feature, importance in sorted_features:
    print(f"{feature}: {importance}")

In [None]:
# Considering the Random Forest model
model = models['Random Forest']
#  Get feature importances from the Decision Tree model
feature_importances = model.feature_importances_

# Create a dictionary to store feature names and importances
feature_dict = {feature: importance for feature, importance in zip(feature_names, feature_importances)}

# Sort the dictionary in descending order of importance
sorted_features = sorted(feature_dict.items(), key=lambda x: x[1], reverse=True)

# Print the sorted features and their importances
for feature, importance in sorted_features:
    print(f"{feature}: {importance}")