# `MODEL TRAINING`

## Automating EDA process

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("data/gemstone.csv")

In [3]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
df = df.drop(columns="id",axis=1)

In [5]:
## Independent and dependent features
X = df.drop(columns="price",axis=1)
Y = df[["price"]]

In [6]:
## Define which column should be ordinal encoded and which should be scaled
## Spliting X data
categorical_col = X.select_dtypes(include="object").columns
numerical_col = X.select_dtypes(exclude="object").columns

In [7]:
print(categorical_col)
print(numerical_col)

Index(['cut', 'color', 'clarity'], dtype='object')
Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')


In [8]:
## Define the custom ranking for each ordinal variable
cut_categories = ["Fair","Good","Very Good","Premium","Ideal"]
color_categories = ["D","E","F","G","H","I","J"]
clarity_categories = ["I1","SI2","SI1","VS2","VS1","VVS2","VVS1","IF"]

In [9]:
## using this library to automate the handling of missing value it will fill all missing value with (mean,median and mode)
from sklearn.impute import SimpleImputer

## using this library for feature scaling 
from sklearn.preprocessing import StandardScaler

## using this library for converting catgorical features into numerical features
## whenever our catorical features have rank we use OrdinalEncoder, else we use OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

## using library for pipeline, we need to perform simpleImputer then pass the result to StandartScaler then pass the result to Encoder
## pipeline just combines the step
## pipline is just connecting
from sklearn.pipeline import Pipeline

## now the connection is done we need to group this 
## for that we use library 
from sklearn.compose import ColumnTransformer

In [17]:
# Creating pipeline

# Numerical pipeline

num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

# Catagorical pipeline

cat_pipeline = Pipeline(
    steps= [
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('OrdinalEncoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ('scaler',StandardScaler())
    ]
)

# Now we have two pipelines lets combine it 
preprocessor = ColumnTransformer(
    [
    ('numPipeline',num_pipeline,numerical_col),
    ('catPipeline',cat_pipeline,categorical_col)
    ]
)

In [19]:
preprocessor

In [112]:
# Train test split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.30,random_state=30)

In [113]:
print(X_train,y_train,X_test,y_test)

        carat      cut color clarity  depth  table     x     y     z
168192   0.34    Ideal     I    VVS2   60.9   57.0  4.56  4.53  2.76
35202    0.90     Good     E     SI1   63.8   57.0  6.07  6.03  3.87
41091    1.02  Premium     G     VS1   62.7   58.0  6.35  6.39  4.00
31239    0.32  Premium     G     VS2   62.1   59.0  4.37  4.35  2.71
45722    0.35    Ideal     J    VVS2   61.1   56.0  4.53  4.57  2.78
...       ...      ...   ...     ...    ...    ...   ...   ...   ...
66455    0.31    Ideal     E     SI1   61.8   56.0  4.31  4.35  2.68
46220    1.25    Ideal     G     SI2   62.0   56.0  6.88  6.95  4.28
98804    1.00     Good     G     SI1   63.5   56.0  6.29  6.37  4.02
48045    1.10    Ideal     G     VS1   59.9   60.0  6.68  6.77  4.01
169765   0.91    Ideal     G     SI1   62.7   57.0  6.14  6.18  3.86

[135501 rows x 9 columns]         price
168192    765
35202    4763
41091    6139
31239     720
45722     774
...       ...
66455     544
46220    5694
98804    4563
48045

In [114]:
# Our preprocessor has scaler transform, so doing scaler transform of X_train and X_test
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [115]:
## Model training
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [116]:
regression = LinearRegression()
regression.fit(X_train,y_train)

In [117]:
regression.coef_

array([[ 6433.66003594,  -132.75843566,   -70.42922179, -1720.30971463,
         -499.29302619,   -63.39317848,    72.44537247,  -460.41604642,
          650.76431652]])

In [118]:
regression.intercept_

array([3970.76628955])

In [119]:
y_prediction = regression.predict(X_test)

In [120]:
import numpy as np

def evalute_model(true,prediction)->dict:
    mae = mean_absolute_error(true,prediction)
    mse = mean_squared_error(true,prediction)
    rmse = np.sqrt(mean_squared_error(true,prediction))
    r2_square = r2_score(true,prediction)
    accuracy = {"mae":mae,"mse":mse,"rmse":rmse,"r2_score":r2_square}
    return accuracy

In [121]:
evalute_model(y_test,y_prediction)

{'mae': 674.0255115796832,
 'mse': 1028002.7598132559,
 'rmse': 1013.9047094344004,
 'r2_score': 0.9368908248567511}

Lets train multiple models once

In [122]:
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso':Lasso(),
    'ElasticNet':ElasticNet()
}


model_list = []
r2_square = []

for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train,y_train)
    
    y_pred = model.predict(X_test)

    model_list.append(model)

    acc = evalute_model(y_test,y_pred)

    r2_square.append(list(acc.values())[3] *100)

    
    print(list(models.keys())[i])
    print(acc)
    print("="*50)


LinearRegression
{'mae': 674.0255115796832, 'mse': 1028002.7598132559, 'rmse': 1013.9047094344004, 'r2_score': 0.9368908248567511}
Ridge
{'mae': 674.0555800798212, 'mse': 1028005.2293677665, 'rmse': 1013.9059272771643, 'r2_score': 0.9368906732505938}


Lasso
{'mae': 675.0716923362158, 'mse': 1027949.4559693958, 'rmse': 1013.8784226767013, 'r2_score': 0.9368940971841704}
ElasticNet
{'mae': 1060.7368759154729, 'mse': 2351365.382289642, 'rmse': 1533.4162456064048, 'r2_score': 0.8556494831165181}


In [123]:
model_list

[LinearRegression(), Ridge(), Lasso(), ElasticNet()]

In [124]:
r2_square

[93.68908248567512, 93.68906732505938, 93.68940971841704, 85.56494831165182]