In [None]:
# Import libraries
import numpy as np
import pandas as pd
# Display all columns
pd.set_option('display.max_columns', None)

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Data statistics
import scipy.stats as stats

# Disable python warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the gemstone dataset from the relative path data/gemstone.csv
df = pd.read_csv('data/gemstone.csv')
# Display the frame (pandas renders a long frame truncated to its first and last rows)
df

In [None]:
# Drop the 'id' column, which is not used as a feature.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run after the column has already been removed.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Numerical feature columns: every numeric column except the target 'price'.
# The target is excluded by name rather than by popping the last element, so
# the list stays correct even if the column order of the data changes.
num = [col for col in df.select_dtypes(include='number').columns if col != 'price']
print(f"List of numerical columns:\n {num}")
print(f'Length of numerical columns: {len(num)}')

In [None]:
# Categorical columns: the object-typed columns of the frame, kept as an Index
object_typed = df.select_dtypes(include=['object'])
cat = object_typed.columns
print(f"List of categorical columns:\n {cat}")
print(f'Length of categorical columns: {len(cat)}')

# The work is organised into three stages.

### 1. Data Preprocessing
### 2. Model Building, Cross-Validation, and Evaluation
### 3. Model Testing

## 1. Data Preprocessing

In [None]:
# Convert categorical data into numerical form using ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

# Apply power transform featurewise to make data more Gaussian-like.
from sklearn.preprocessing import PowerTransformer

# Applies transformers to columns of an array or pandas DataFrame.
from sklearn.compose import ColumnTransformer

In [None]:
# Categorical ColumnTransformer
# Applies an OrdinalEncoder (int64 output) to the columns listed in `cat`;
# any other column handed to it is passed through unchanged. Output is a
# pandas DataFrame whose column names are not prefixed with the step name.
# NOTE(review): no explicit `categories=` is given, so the integer codes follow
# the encoder's default ordering rather than a domain-specific ranking of the
# categories — confirm this is intended.
cat_trf = ColumnTransformer(
    transformers= [
        ('ordinal_encoder', OrdinalEncoder(dtype='int64'), cat),
    ],
    verbose_feature_names_out=False,
    remainder='passthrough'
).set_output(transform='pandas')

In [None]:
# Fit the encoder on the categorical columns and encode them as integer codes
cat_frame = df[cat]
trf = cat_trf.fit(cat_frame).transform(cat_frame)

In [None]:
# Remove the original (string) categorical columns; their encoded versions
# live in `trf` and are attached in the next step
df = df.drop(columns=cat)

In [None]:
# Attach the encoded categorical columns to the right of the remaining columns
df = pd.concat(objs=[df, trf], axis='columns')

In [None]:
# Inspect the frame after the categorical columns were replaced by their encoded versions
df

In [None]:
# Numerical columns: distribution and normal Q-Q plot, without any transformation.
# sns.distplot is deprecated; histplot(kde=True, stat='density') draws the same
# histogram-plus-KDE view. Explicit axes replace the plt.subplot state machine.
for col in num:
    fig, (ax_dist, ax_qq) = plt.subplots(1, 2, figsize=(14, 4))

    sns.histplot(df[col], kde=True, stat='density', ax=ax_dist)
    ax_dist.set_title(col)

    stats.probplot(df[col], dist="norm", plot=ax_qq)
    # probplot writes its own title; replace it with the column name
    ax_qq.set_title(col)

    plt.show()

In [None]:
# Categorical columns (already integer-encoded at this point): distribution and
# normal Q-Q plot, without any transformation.
# sns.distplot is deprecated; histplot(kde=True, stat='density') draws the same
# histogram-plus-KDE view. Explicit axes replace the plt.subplot state machine.
for col in cat:
    fig, (ax_dist, ax_qq) = plt.subplots(1, 2, figsize=(14, 4))

    sns.histplot(df[col], kde=True, stat='density', ax=ax_dist)
    ax_dist.set_title(col)

    stats.probplot(df[col], dist="norm", plot=ax_qq)
    # probplot writes its own title; replace it with the column name
    ax_qq.set_title(col)

    plt.show()

In [None]:
# Split the data into features X and target y for feature selection.
# y is deliberately kept as a single-column DataFrame (double brackets).
target_cols = ['price']
y = df[target_cols]
X = df.drop(columns=target_cols)

In [None]:
# (number of rows, number of feature columns) of the feature matrix
X.shape

In [None]:
# # Apply power transform featurewise to make data more Gaussian-like.
# # Transform features using quantiles information.
# from sklearn.preprocessing import PowerTransformer
# qwt = PowerTransformer(method='box-cox')
# qwt.set_output(transform='pandas')

In [None]:
# # Transform independent features
# x_qwt = qwt.fit_transform(X+0.00000001)

In [None]:
# # Before and after comparision
# for col in x_qwt.columns:
#     plt.figure(figsize=(14,4))
#     plt.subplot(121)
#     sns.distplot(X[col])
#     plt.title(col)

#     plt.subplot(122)
#     sns.distplot(x_qwt[col])
#     plt.title(col)

#     plt.show()

In [None]:
# x_qwt.head()

In [None]:
# For Feature selection (FS) using varianceThreshold & mutual_info_reg technique
from sklearn.feature_selection import VarianceThreshold, mutual_info_regression

In [None]:
# Flag low-variance features of X: anything whose variance does not exceed
# 0.95 * (1 - 0.95) is dropped by the selector
low_variance_cutoff = 0.95 * (1-0.95)
vt = VarianceThreshold(threshold=low_variance_cutoff)
vt.fit_transform(X)

In [None]:
# Column names the selector was fitted on (the input features)
vt.feature_names_in_

In [None]:
# Column names the selector keeps (the output features)
vt.get_feature_names_out()

In [None]:
# Variance of each input feature, as computed during fit
vt.variances_

In [None]:
# Boolean mask over the input columns: True where the feature is kept
vt.get_support()

In [None]:
# Mutual information between every feature and the target.
# y is a single-column DataFrame, so it is flattened to the 1-D target the
# function expects; random_state pins the estimator's internal randomness so
# the feature ranking is reproducible between runs.
mi = mutual_info_regression(X, y.to_numpy().ravel(), random_state=42)

In [None]:
# Label each mutual-information score with its feature name
imp = pd.Series(data=mi, index=X.columns)

In [None]:
# Horizontal bar chart of the mutual-information score per feature
imp.plot.barh()
plt.show()

In [None]:
# Correlation of every column with price, in percent, largest first
price_corr_pct = df.corr()['price'] * 100
price_corr_pct.sort_values(ascending=False)

In [None]:
# Annotated heatmap of the pairwise correlation matrix
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(15,8))
sns.heatmap(
    corr_matrix,
    annot=True,
    annot_kws={'size': 12},
    cmap='coolwarm',
    linewidths= .7,
)
plt.show()

In [None]:
# Covariance of every column with price, rounded to 4 decimals
price_cov = df.cov()['price']
cov = np.round(price_cov, 4)
cov

## 2. Model Building, Cross-Validation, and Evaluation

In [None]:
# Split arrays or matrices into random train and test subsets.
# Evaluate a score by cross-validation.
# Exhaustive search over specified parameter values for an estimator.
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV

# Regressor model
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Model Evaluation
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Instantiate the candidate regression models.
# The tree-based and ensemble estimators draw random feature subsets
# (max_features='sqrt') and, for the forest, bootstrap samples; random_state
# pins that randomness so scores are reproducible between runs.
elastinet = ElasticNet(alpha=0.3, l1_ratio=0.8, max_iter=800)
decisiontreeregressor = DecisionTreeRegressor(criterion='absolute_error', max_depth=12, max_features='sqrt',
                                              min_samples_leaf=8, min_samples_split=8, random_state=42)
randomforestregressor = RandomForestRegressor(n_estimators= 32, criterion= 'poisson', max_features= 'sqrt',
                                              max_depth= 6, min_samples_leaf= 3, min_samples_split= 3,
                                              random_state=42)
gradientboostingregressor = GradientBoostingRegressor(n_estimators= 32, criterion= 'squared_error',
                                                      max_features= 'sqrt', loss= 'squared_error',
                                                      random_state=42)

In [None]:
# Collect the regression models in evaluation order
models = [
    elastinet,
    decisiontreeregressor,
    randomforestregressor,
    gradientboostingregressor,
]

In [None]:
# Check the cross_val_score on all the models
def cross_src(model,X_trn,y_trn):
    """Score `model` with 5-fold cross-validation on the given training data.

    Parameters
    ----------
    model : estimator passed to `cross_val_score`.
    X_trn : training features.
    y_trn : training target.

    Returns
    -------
    src : array of the five fold scores, each rounded to 2 decimals.
    msrc : mean fold score rounded to 2 decimals. The mean is taken over the
        unrounded fold scores so rounding error is applied only once.
    """
    raw_scores = cross_val_score(model, X_trn, y_trn, cv=5)
    src = np.round(raw_scores, 2)
    msrc = np.round(np.mean(raw_scores), 2)
    return src, msrc

In [None]:
# Cross-validate every model, keeping the per-fold scores and their mean
result = []
mean_src = []

for model in models:
    fold_scores, mean_score = cross_src(model, X_train, y_train)

    # Report this model's scores
    print('Model Name: ',model)
    print('\n')
    print('Result :', fold_scores)
    print('Mean_Score :', mean_score)
    print('\n')

    result += [fold_scores]
    mean_src += [mean_score]

In [None]:
# Perform RandomizedSearchCV with cross-validation to find the best hyperparameter
# Search spaces, keyed by the lower-cased estimator class name (the key the
# training loop looks up for each model). Every candidate list below is
# currently commented out, so each model maps to an empty dict.
# NOTE(review): the commented candidates include 'auto' for max_features —
# presumably no longer accepted by recent scikit-learn releases; verify before
# re-enabling them.
params = {
    'elasticnet': {
        # 'alpha': [0.3, 0.6, 0.8, 1.0],
        # 'l1_ratio': [0.3, 0.6, 0.8, 1.0],
        # 'max_iter': [250, 500, 800, 1000],
        # 'selection': ['cyclic', 'random'],
    },
    'decisiontreeregressor': {
        # 'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        # 'splitter':['best','random'],
        # 'max_features':['sqrt','log2', 'auto'],
        # 'max_depth': [4, 6, 8],
        # 'min_samples_split': [2, 3, 4],
        # 'min_samples_leaf': [1, 2, 3, 4],
    },
    'randomforestregressor': {
        # 'n_estimators': [8, 16, 32],
        # 'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        # 'max_depth': [4, 6, 8],
        # 'min_samples_split': [2, 3, 4],
        # 'min_samples_leaf': [1, 2, 3, 4],
    },
    'gradientboostingregressor': {
        # 'loss':['squared_error', 'huber', 'absolute_error', 'quantile'],
        # 'learning_rate':[.1,.01,.05,.001],
        # 'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
        # 'criterion':['squared_error', 'friedman_mse'],
        # 'max_features':['auto','sqrt','log2'],
        # 'n_estimators': [8, 16, 32]
    }
}

In [None]:
# (number of rows, number of feature columns) of the held-out test features
X_test.shape

In [None]:
# Check the train or test all loss/metrics the models
def evaluate_model(true, prediction, n_features=9):
    """Compute MAE, MAPE, R2 and adjusted R2 for one set of predictions.

    Parameters
    ----------
    true : observed target values.
    prediction : predicted target values, aligned with `true`.
    n_features : number of predictors used by the model. Defaults to 9,
        matching the 9-value feature rows this notebook predicts on.

    Returns
    -------
    (mae, mape, r2, ad_r2), each rounded to 2 decimals. `ad_r2` is NaN when
    there are too few samples for the adjustment (n - p - 1 <= 0).
    """
    mae = np.round(mean_absolute_error(true, prediction), 2)
    mape = np.round(mean_absolute_percentage_error(true, prediction), 2)

    raw_r2 = r2_score(true, prediction)
    r2 = np.round(raw_r2, 2)

    # Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1).
    # n comes from the data actually being scored (train and test differ in
    # size), and the unrounded R2 is used so rounding is applied only once.
    n_samples = len(true)
    dof = n_samples - n_features - 1
    if dof > 0:
        ad_r2 = np.round(1 - (1 - raw_r2) * (n_samples - 1) / dof, 2)
    else:
        ad_r2 = np.nan

    return mae, mape, r2, ad_r2

In [None]:
# Tune, fit and evaluate every model; the test-set metrics are collected for
# the comparison plots and the summary table that follow.
mae_src= []
mape_src= []
r2_src= []
adjt_r2 = []
train_src = []
test_src = []
results = []

for model in models:
    # The search spaces in `params` are keyed by the lower-cased class name
    param = params[model.__class__.__name__.lower()]

    # random_state makes the sampled hyperparameter candidates repeatable
    gs = RandomizedSearchCV(model, param_distributions=param, cv=3, verbose=1, random_state=42)
    gs.fit(X_train, y_train)

    # Apply the best sampled settings to the shared model object and refit it
    # on the whole training split, so later cells can reuse `models` directly
    model.set_params(**gs.best_params_)
    model.fit(X_train, y_train)

    # Make prediction
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate model on both the training and the test dataset
    trn_mae, trn_mape, trn_r2, trn_adj_r2 = evaluate_model(y_train, y_train_pred)
    cur_mae, cur_mape, cur_r2, cur_adj_r2 = evaluate_model(y_test, y_test_pred)

    print('Model performance on Test dataset')
    print('Model Name: ',model)
    print('\n')
    print('Mean_Absolute_Error :',cur_mae)
    print('Mean_Absolute_Precentage_Error :',cur_mape)
    print('R2_Score :',cur_r2)
    print('Adjusted_R2_Score :',cur_adj_r2)
    print('-'*30)
    print('\n')

    print('Model performance on Training dataset')
    print(f"Mean Absolute Error: {trn_mae}")
    print(f'Mean Absolute Precentage Error : {trn_mape}')
    print(f"R2 Score: {trn_r2}")
    print(f'Adjusted R2 Score : {trn_adj_r2}')
    print('-'*35)
    print('\n')

    mae_src.append(cur_mae)
    mape_src.append(cur_mape)
    r2_src.append(cur_r2)
    # Store the adjusted score here (the plain R2 already lives in r2_src)
    adjt_r2.append(cur_adj_r2)

    results.append({
        'model': model.__class__.__name__,
        'y_actual': y_test,
        'y_pred': y_test_pred,
        'mae': cur_mae,
        'mape': cur_mape,
        'r2': cur_r2,
        'adj_r2': cur_adj_r2
    })

    # The f prefix must sit outside the quotes for {model} to be interpolated
    print(f'*** Model {model} Detail ***')
    train = np.round(model.score(X_train, y_train), 2)
    test = np.round(model.score(X_test, y_test), 2)
    print(f" Training Model score :\n {train}")
    print(f" Testing Model score :\n {test}")
    print('='*30)
    train_src.append(train)
    test_src.append(test)
    print('\n')

In [None]:
# Plotting the metrics graph of MAE & MAPE: one point per model
model_names = [type(m).__name__ for m in models]
epochs = range(len(models))
fig, ax = plt.subplots(figsize=(15,6))
ax.plot(epochs, mae_src, '-o', color='blue', label='MAE')
ax.plot(epochs, mape_src, '-o', color='green', label='MAPE')

# Pin one tick per model before labelling, so every label lines up with its point
ax.set_xticks(list(epochs))
ax.set_xticklabels(model_names, rotation = 90)
ax.set_xlabel('Model')
ax.set_ylabel('Metric value')
ax.legend()  # without this the label= arguments above are never shown
ax.grid(visible=True)
plt.show()

In [None]:
# Plotting the metrics graph of R2 & Adjusted R2: one point per model
model_names = [type(m).__name__ for m in models]
epochs = range(len(models))
fig, ax = plt.subplots(figsize=(15,6))
ax.plot(epochs, r2_src, '-o', color='red', label='R2')
ax.plot(epochs, adjt_r2, '-o', color='black', label='Adjusted R2')

# Pin one tick per model before labelling, so every label lines up with its point
ax.set_xticks(list(epochs))
ax.set_xticklabels(model_names, rotation = 90)
ax.set_xlabel('Model')
ax.set_ylabel('Metric value')
ax.legend()  # without this the label= arguments above are never shown
ax.grid(visible=True)
plt.show()

In [None]:
# Plotting the metrics graph: all four test-set metrics, one point per model
model_names = [type(m).__name__ for m in models]
epochs = range(1, len(models) +1 )
plt.figure(figsize=(10, 6))

plt.plot(epochs, mae_src, marker='o', linestyle='-', color='blue', label='MAE')
plt.plot(epochs, mape_src, marker='o', linestyle='-', color='green', label='MAPE')
plt.plot(epochs, r2_src, marker='o', linestyle='-', color='yellow', label='R2')
plt.plot(epochs, adjt_r2, marker='o', linestyle='-', color='purple', label='Adjusted R2')

# The x positions index models, not training epochs, so label them as such
plt.xticks(list(epochs), model_names)
plt.title('Comparison of Metrics across Models')
plt.xlabel('Model')
plt.ylabel('Metric Values')
plt.grid(visible=True)
plt.legend()
plt.show()

In [None]:
# Gather every model's test metrics and train/test scores into one DataFrame
perform_columns = {
    'Algorithm': models,
    'MAE': mae_src,
    'MAPE': mape_src,
    'R2': r2_src,
    'Adjusted R2': adjt_r2,
    'Model Train Score': train_src,
    'Model Test Score': test_src,
}
perform_df = pd.DataFrame(perform_columns)

In [None]:
# Side-by-side comparison of all models
perform_df

In [None]:
# Plotting actual vs. predicted values for each model
plt.figure(figsize=(12, 8))

# A distinct loop name keeps the earlier `result` list of CV scores intact
for entry in results:
    plt.scatter(entry['y_actual'], entry['y_pred'], label=entry['model'], alpha=0.7)

# y_test is a DataFrame: Python's min()/max() would iterate its column labels,
# so take the extremes from the underlying values instead
actual_values = np.asarray(y_test).ravel()
lowest, highest = actual_values.min(), actual_values.max()
plt.plot([lowest, highest], [lowest, highest], linestyle='--', color='red', label='Perfect Prediction')

plt.title('Actual vs. Predicted Values for Different Models')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.legend()
plt.grid(True)

plt.show()

## 3. Model Testing

In [None]:
# Five randomly drawn feature rows (no seed is set, so the rows differ between runs)
X.sample(5)

In [None]:
# Feature values of the row labelled 48344
# NOTE(review): presumably the row the manual input below was copied from — verify
X.loc[48344]

In [None]:
# Target value of the row labelled 48344
y.loc[48344]

In [None]:
# Every column of the row labelled 48344
df.loc[48344]

In [None]:
# Users Input: one gemstone, given in the same column order as X.
# A one-row DataFrame carrying X's column names (instead of a bare ndarray)
# lets scikit-learn check the input against the feature names the models were
# fitted with, and fails early if the number of values does not match X.
user_input = pd.DataFrame(
    [[0.55, 60.60, 59.00, 5.28, 5.31, 3.21, 3.00, 2.00, 2.00]],
    columns=X.columns,
)
user_input

In [None]:
# Predict the price of the user-supplied gemstone with every fitted model
for model in models:
    # trf = qwt.transform(user_input)
    predictions = model.predict(user_input)
    pred = predictions[0]

    print('Model Name: ',model)
    print('Predict price :', np.round(pred,2))
    print('\n')