In [2]:
import pandas as pd

# Model Training

In [3]:
# Load the gemstone dataset from the project-relative data folder and preview the first rows.
df = pd.read_csv(filepath_or_buffer="data/gemstone.csv")
df.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
## Let's drop the id column (in the preview above it simply mirrors the row index).
# errors="ignore" keeps this cell re-runnable: df is reassigned here, so a second
# execution would otherwise raise KeyError because "id" has already been removed.
df = df.drop(columns=["id"], errors="ignore")
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
## Separate the target column (y) from the remaining feature columns (X)
y = df["price"]
X = df.drop(columns=["price"])

In [6]:
# Display the feature matrix (every column of df except "price").
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [7]:
# Partition the feature names by dtype: object-typed columns vs. numeric columns.
# These two indexes are what the ColumnTransformer later uses to route columns.
categorical_colummns = X.select_dtypes(include="object").columns
numerical_columns = X.select_dtypes(include="number").columns

In [19]:
# Show which feature columns were detected as object-typed (categorical).
categorical_colummns

Index(['cut', 'color', 'clarity'], dtype='object')

In [8]:
# Explicit ranking for each ordinal feature. A value's position in its list is the
# integer code it receives once the list is handed to OrdinalEncoder(categories=...).
cut_categories = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
color_categories = list("DEFGHIJ")
clarity_categories = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]

In [9]:
# Sanity check: list the distinct color values present in the data so they can be
# compared against the hand-written color_categories ranking.
df['color'].unique()

array(['F', 'J', 'G', 'E', 'D', 'H', 'I'], dtype=object)

In [10]:
from sklearn.impute import SimpleImputer  ##Handling Missing Values
from sklearn.preprocessing import StandardScaler #handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder #Ordinal Encoding

##pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer #to combine two different pipeline


In [None]:
## Numerical pipeline: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

## Categorical pipeline: fill missing values with the most frequent category, map each
## category to its position in the hand-written rankings, then standardize the codes.
## The rankings are listed in the same order as the categorical columns (cut, color, clarity).
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "ordinalencoder",
        OrdinalEncoder(
            categories=[
                cut_categories,
                color_categories,
                clarity_categories,
            ]
        ),
    ),
    ("scaler", StandardScaler()),
])

## Route each group of columns through its own pipeline. The transformer names below
## become the prefixes of the output feature names (e.g. "num_pipeline__carat").
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, numerical_columns),
        ("cat_pipeline", cat_pipeline, categorical_colummns),
    ]
)

In [12]:
## Train/test split: hold out 30% of the rows for evaluation (fixed seed for repeatability)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)


In [13]:
# Fit the preprocessor on the training split only, then apply the already-fitted
# transformer to the test split so the test rows do not influence imputation/scaling.
# NOTE(review): both frames are rebuilt without an explicit index, so they presumably get a
# fresh 0..n-1 index and no longer share row labels with y_train / y_test. Confirm before
# doing any index-aligned pandas operation between them.
X_train = pd.DataFrame(
    data=preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
)
X_test = pd.DataFrame(
    data=preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
)

In [14]:
# Preview the transformed training features (columns are prefixed with the pipeline name).
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,2.758681,0.631711,-0.119267,2.172812,2.118887,2.220999,-0.135503,1.525503,-0.647427
1,0.903141,0.169421,-0.640178,0.98427,1.040573,1.026856,-1.143448,0.293897,-1.313938
2,1.550422,-0.570243,1.443466,1.515512,1.448339,1.390924,-0.135503,0.293897,-0.647427
3,0.5795,-1.032533,-1.161089,0.768171,0.814036,0.677351,0.872443,-1.553512,-0.647427
4,-0.973975,-0.015495,-0.119267,-1.095679,-1.097932,-1.084738,0.872443,1.525503,-0.647427


In [15]:
#Model training
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [16]:
def evaluate_model(true, prediction):
    """Compute regression error metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        prediction: Predicted values, aligned element-wise with ``true``.

    Returns:
        tuple: ``(mae, r2, rmse)`` -- mean absolute error, R2 score and root
        mean squared error, in that order.
    """
    mae = mean_absolute_error(true, prediction)
    r2 = r2_score(true, prediction)
    # RMSE is the square root of the MSE; the MSE itself is not returned, so it is
    # computed only once here instead of being stored in an unused local as well.
    rmse = np.sqrt(mean_squared_error(true, prediction))
    return mae, r2, rmse
    
                   

In [17]:
# Candidate regressors, keyed by the label used in the printed report and results table.
models_dict = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(
        n_estimators=100,
        random_state=42,
    ),
    'XGBoost': XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        random_state=42,
    ),
}

# Parallel accumulators: fitted estimators, their labels, and their R2 scores.
trained_model_list, model_list, r2_list = [], [], []

for model_name in models_dict:
    model = models_dict[model_name]

    # Fit on the training split, then predict and score on the held-out test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae, r2, rmse = evaluate_model(y_test, y_pred)

    # Bookkeeping for the comparison table built in the next cell.
    model_list.append(model_name)
    r2_list.append(r2)
    trained_model_list.append(model)

    # NOTE: the header printed below says "Training", but every metric reported here
    # is computed from X_test / y_test (i.e. it is test-set performance).
    print(model_name)
    print("Model Training Performance")
    print("RMSE", rmse)
    print("MAE", mae)
    print("R2 score", r2 * 100)  # R2 reported as a percentage
    print("=" * 35)
    print("\n")


LinearRegression
Model Training Performance
RMSE 1012.1792803995619
MAE 672.9617720149348
R2 score 93.66157287500738


Lasso
Model Training Performance
RMSE 1010.4024369400458
MAE 673.8172864700617
R2 score 93.68380709307652


Ridge
Model Training Performance
RMSE 1012.2023151019246
MAE 672.9936437915943
R2 score 93.66128437780844


ElasticNet
Model Training Performance
RMSE 1534.65666395778
MAE 1063.3673862260225
R2 score 85.42900951017094


DecisionTree
Model Training Performance
RMSE 833.8823896055193
MAE 423.9905117784819
R2 score 95.69794240704923


RandomForest
Model Training Performance
RMSE 602.9497256287285
MAE 310.14556619100074
R2 score 97.75079614421178


XGBoost
Model Training Performance
RMSE 575.1505303065823
MAE 296.47825191217703
R2 score 97.95341557497372




In [18]:
# Side-by-side comparison of the R2 score collected for each fitted model.
pd.DataFrame(
    data={
        "Models": model_list,
        "R2 Score": r2_list,
    }
)

Unnamed: 0,Models,R2 Score
0,LinearRegression,0.936616
1,Lasso,0.936838
2,Ridge,0.936613
3,ElasticNet,0.85429
4,DecisionTree,0.956979
5,RandomForest,0.977508
6,XGBoost,0.979534
