# Model training

In [1]:
import pandas as pd

In [4]:
# Load the gemstone dataset; the path is relative to the notebook's working directory.
df=pd.read_csv("Data/gemstone.csv")

In [6]:
# Drop the "id" column (presumably a row identifier with no predictive value -- confirm).
# Rebinding `df` instead of using inplace=True, together with errors="ignore",
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because "id" has already been removed.
df = df.drop(columns=["id"], errors="ignore")

In [10]:
# Separate the independent variables (features) from the dependent variable (target).
X = df.drop(columns=["price"])  # every column except the target
Y = df.loc[:, ["price"]]        # target kept as a single-column DataFrame (2-D)

In [40]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(exclude=["object"]).columns

In [39]:
# Custom rank order for each ordinal variable: the position of a label in its
# list is the integer code the OrdinalEncoder will assign to it.
# NOTE(review): cut and clarity look like they run lowest -> highest grade while
# color runs D -> J; confirm the intended direction for color.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D',
    'E',
    'F',
    'G',
    'H',
    'I',
    'J',
]
clarity_categories = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]

In [14]:
from sklearn.impute import SimpleImputer #For handling missing values
from sklearn.preprocessing import StandardScaler # For Feature scaling (Must - if you're using Linear regression)
from sklearn.preprocessing import OrdinalEncoder#(This will automatically give ranks)Feature Engineering- Ordinal encoding
## Pipeline chains the preprocessing steps together (SimpleImputer -> OrdinalEncoder -> StandardScaler)
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer

In [41]:
## Numerical pipeline: fill missing values with the column median, then standardize
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

## Categorical pipeline: fill missing values with the most frequent label,
## map each label to its rank, then standardize the resulting codes
# The `categories` lists are matched to the columns by position, so this
# assumes categorical_cols is ordered (cut, color, clarity) -- TODO confirm.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories]),
    ),
    ('scaler', StandardScaler()),
])

## Route each group of columns through its own pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)


Here we've created two separate pipelines (one numerical, one categorical); they are joined into a single preprocessor using `ColumnTransformer`.


In [34]:
# Combine the numerical and categorical pipelines into one preprocessor.
# This cell previously referenced names that are never defined in the notebook
# (numerical_pipeline, Numerical_columns, categorical_pipeline,
# categorical_columns) and therefore failed on a fresh kernel. It now uses the
# objects defined above and the same transformer names as the cell that builds
# the pipelines, so the generated feature names stay consistent.
preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)])

# Model Training

In [42]:
## Train/test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=30,
)

In [44]:
# Display the composed preprocessor (ColumnTransformer) for inspection.
preprocessor

In [43]:
# Preview the transformed training features.
# Note: fit_transform also fits the preprocessor on X_train as a side effect.
preprocessor.fit_transform(X_train)

array([[-0.97543926, -0.84960654, -0.12153081, ...,  0.87407553,
         1.52872212,  1.35273128],
       [ 0.2351953 ,  1.83363716, -0.12153081, ..., -2.14455824,
        -0.93507064, -0.64678628],
       [ 0.49461699,  0.81585507,  0.39980029, ..., -0.13213573,
         0.29682574,  0.68622543],
       ...,
       [ 0.45138004,  1.55606023, -0.6428619 , ..., -2.14455824,
         0.29682574, -0.64678628],
       [ 0.66756478, -1.77486298,  1.44246248, ...,  0.87407553,
         0.29682574,  0.68622543],
       [ 0.25681377,  0.81585507, -0.12153081, ...,  0.87407553,
         0.29682574, -0.64678628]])

# Now all features are encoded as numeric values and standardized to a common scale

In [47]:
# Fit the preprocessor on the training split only and wrap the transformed
# array in a DataFrame labelled with the transformer-generated feature names.
# The original row index is carried over so X_train stays aligned with y_train
# (which keeps the shuffled index from the split) in any index-based operation.
# NOTE: this rebinds X_train to the transformed frame, so re-running this cell
# requires re-running the train/test split cell first.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,
)

In [49]:
# Apply the already-fitted preprocessor to the test split (transform only, no
# refit) and wrap the result in a DataFrame with the generated feature names.
# The original row index is carried over so X_test stays aligned with y_test
# (which keeps the shuffled index from the split) in any index-based operation.
# NOTE: this rebinds X_test to the transformed frame, so re-running this cell
# requires re-running the train/test split cell first.
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index,
)

## Model Training

In [50]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [51]:
# Baseline model: fit a plain linear regression on the preprocessed training features.
regression=LinearRegression()
regression.fit(X_train,y_train)

In [52]:
# Inspect the fitted coefficients (one value per preprocessed feature column).
regression.coef_

array([[ 6433.66003594,  -132.75843566,   -70.42922179, -1720.30971463,
         -499.29302619,   -63.39317848,    72.44537247,  -460.41604642,
          650.76431652]])

In [53]:
# Inspect the fitted intercept term.
regression.intercept_

array([3970.76628955])

# Model Performance

In [54]:
import numpy as np
def evaluate_model(true, predicted):
    """Compute regression metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions corresponding to ``true``.

    Returns:
        A tuple ``(mae, rmse, r2_square)``: the mean absolute error, the root
        mean squared error, and the R^2 score.
    """
    mae = mean_absolute_error(true, predicted)
    # RMSE is the square root of the MSE; the MSE is computed exactly once.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

## Training multiple models

In [57]:
# Candidate regressors, all with default hyperparameters.
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet(),
}
# Parallel lists: model_list[k] is the name of the model whose test R^2 is r2_list[k].
model_list=[]
r2_list=[]

# Iterate over (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every iteration.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    ## Make predictions on the held-out test split
    y_pred = model.predict(X_test)
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)
    print(model_name)
    model_list.append(model_name)

    # NOTE: the header text is kept unchanged for output compatibility, but
    # the metrics printed below are computed on the test split.
    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score", r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

    
    

LinearRegression
Model Training Performance
RMSE: 1013.9047094344004
MAE: 674.0255115796832
R2 score 93.68908248567512


Lasso
Model Training Performance
RMSE: 1013.8784226767013
MAE: 675.071692336216
R2 score 93.68940971841704


Ridge
Model Training Performance
RMSE: 1013.9059272771649
MAE: 674.0555800798218
R2 score 93.68906732505937


ElasticNet
Model Training Performance
RMSE: 1533.4162456064048
MAE: 1060.7368759154729
R2 score 85.56494831165182




TODO: calculate the adjusted R² score.

In [70]:
# Name of the model with the highest test-set R^2 score (the first one wins on ties)
max(zip(models, r2_list), key=lambda pair: pair[1])[0]

'Lasso'

In [71]:
# Name of the model with the lowest test-set R^2 score (the first one wins on ties)
min(zip(models, r2_list), key=lambda pair: pair[1])[0]

'ElasticNet'