In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('./data/gemstone.csv')
df.head(5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193573 entries, 0 to 193572
Data columns (total 11 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       193573 non-null  int64  
 1   carat    193573 non-null  float64
 2   cut      193573 non-null  object 
 3   color    193573 non-null  object 
 4   clarity  193573 non-null  object 
 5   depth    193573 non-null  float64
 6   table    193573 non-null  float64
 7   x        193573 non-null  float64
 8   y        193573 non-null  float64
 9   z        193573 non-null  float64
 10  price    193573 non-null  int64  
dtypes: float64(6), int64(2), object(3)
memory usage: 16.2+ MB


In [4]:
df.drop(['id'],axis = 1,inplace = True)

In [5]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
...,...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681


In [6]:
X = df.drop(['price'],axis = 1)
y = df['price']

In [7]:
categorical_data = [i for i in X.columns if X[i].dtype == 'O']
numerical_data = [i for i in X.columns if X[i].dtype != 'O']

In [8]:
print(f"All Categorical Columns is {categorical_data}")
print(f"All Categorical Columns is {numerical_data}")

All Categorical Columns is ['cut', 'color', 'clarity']
All Categorical Columns is ['carat', 'depth', 'table', 'x', 'y', 'z']


In [9]:
categorical_dict = {}
for i in categorical_data:
    categorical_dict[i+"_categorical"] = list(X[i].unique())

In [10]:
categorical_dict

{'cut_categorical': ['Premium', 'Very Good', 'Ideal', 'Good', 'Fair'],
 'color_categorical': ['F', 'J', 'G', 'E', 'D', 'H', 'I'],
 'clarity_categorical': ['VS2',
  'SI2',
  'VS1',
  'SI1',
  'IF',
  'VVS2',
  'VVS1',
  'I1']}

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

## Pipeline Libraries
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [12]:
numerical_pipeline = Pipeline(
    steps = [
    ('imputer',SimpleImputer(strategy='median')),
    ('scaling',StandardScaler())
    ])

categorical_pipeline = Pipeline(
    steps = [
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('encoding',OrdinalEncoder(categories=[categorical_dict['cut_categorical'],categorical_dict['color_categorical'],categorical_dict['clarity_categorical']])),
        ('scaling',StandardScaler())
    ])

preprocessor = ColumnTransformer([
    ('num_pipeline',numerical_pipeline,numerical_data),
    ('cat_pipeline',categorical_pipeline,categorical_data)
]
)

### Train-Test-Split

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test , y_train, y_test = train_test_split(X,y,random_state=30,test_size=0.3)

In [14]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns = preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test),columns = preprocessor.get_feature_names_out())

In [15]:
X_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.975439,-0.849607,-0.121531,-1.042757,-1.080970,-1.123150,0.652673,1.655662,1.604964
1,0.235195,1.833637,-0.121531,0.318447,0.279859,0.485354,1.687566,0.056195,0.481725
2,0.494617,0.815855,0.399800,0.570855,0.606458,0.673737,-1.417114,-0.476960,-0.079894
3,-1.018676,0.260701,0.921131,-1.214034,-1.244270,-1.195605,-1.417114,-0.476960,-1.203134
4,-0.953821,-0.664555,-0.642862,-1.069801,-1.044681,-1.094168,0.652673,-1.010116,1.604964
...,...,...,...,...,...,...,...,...,...
135496,-1.040295,-0.016876,-0.642862,-1.268122,-1.244270,-1.239078,0.652673,0.056195,0.481725
135497,0.991842,0.168176,-0.642862,1.048629,1.114501,1.079486,0.652673,-0.476960,-0.641514
135498,0.451380,1.556060,-0.642862,0.516768,0.588314,0.702719,1.687566,-0.476960,0.481725
135499,0.667565,-1.774863,1.442462,0.868337,0.951202,0.688228,0.652673,-0.476960,-0.079894


In [16]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [17]:
def evaluate_matrix(y_actual,y_pred):
    mse = mean_squared_error(y_actual,y_pred)
    mae = mean_absolute_error(y_actual,y_pred)
    rmse = np.sqrt(mae)
    r2_square = r2_score(y_actual,y_pred)

    return mse,rmse,r2_square

In [18]:
def make_model(X_train,X_test,y_train,y_test):
    trained_model_list=[]
    r2_list=[]


    algo_dict = {'LinearRegression' : LinearRegression(),'Lasso' : Lasso(),'Ridge' : Ridge(),'ElasticNet' : ElasticNet()}

    for i in algo_dict:
        model = algo_dict[i]
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)

        trained_model_list.append(model)

        mse, rmse, r2_square = evaluate_matrix(y_test,y_pred)

        r2_list.append(r2_square)

        print(i)
        print('Model Training Performance')
        print(f'MSE - {mse}')
        print(f'RMSE - {rmse}')
        print(f'r2_score {r2_square * 100}')
        print('--------------------------------------')





In [19]:
make_model(X_train,X_test,y_train,y_test)

LinearRegression
Model Training Performance
MSE - 1437191.717265036
RMSE - 26.94536877120538
r2_score 91.17706806392405
--------------------------------------
Lasso
Model Training Performance
MSE - 1436909.7347908183
RMSE - 26.971166890697553
r2_score 91.17879915668455
--------------------------------------
Ridge
Model Training Performance
MSE - 1437194.5722050248
RMSE - 26.94590393482368
r2_score 91.17705053742367
--------------------------------------
ElasticNet
Model Training Performance
MSE - 2660552.7587316004
RMSE - 33.680236725422
r2_score 83.66684442956674
--------------------------------------


In [20]:
df.drop(['id'],axis = 1,inplace = True)

KeyError: "['id'] not found in axis"

In [4]:
X = df.drop('price',axis = 1)
y = df[['price']]

In [5]:
categorical_columns = X.select_dtypes('object').columns
numerical_columns = X.select_dtypes(exclude = "object").columns

print(f'Categorical_Columns :- {categorical_columns}')
print(f'Numerical_Columns :- {numerical_columns}')

Categorical_Columns :- Index(['cut', 'color', 'clarity'], dtype='object')
Numerical_Columns :- Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')


In [6]:
# Define the custom ranking for each ordinal variable
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [7]:
from sklearn.impute import SimpleImputer    # Handling Missing Values
from sklearn.preprocessing import StandardScaler  # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder  # Encoding of data

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
categorical_columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [9]:
num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ('scaler',StandardScaler())
    ]
)

preprossor = ColumnTransformer(
    [
        ('num_pipeline',num_pipeline,numerical_columns),
        ('cat_pipeline',cat_pipeline,categorical_columns)
    ]
)

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=30)   

In [12]:
X_train = pd.DataFrame(preprossor.fit_transform(X_train),columns = X_train.columns)
X_test = pd.DataFrame(preprossor.transform(X_test),columns=X_test.columns)

In [13]:
X_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,-0.975439,-0.849607,-0.121531,-1.042757,-1.080970,-1.123150,0.874076,1.528722,1.352731
1,0.235195,1.833637,-0.121531,0.318447,0.279859,0.485354,-2.144558,-0.935071,-0.646786
2,0.494617,0.815855,0.399800,0.570855,0.606458,0.673737,-0.132136,0.296826,0.686225
3,-1.018676,0.260701,0.921131,-1.214034,-1.244270,-1.195605,-0.132136,0.296826,0.019720
4,-0.953821,-0.664555,-0.642862,-1.069801,-1.044681,-1.094168,0.874076,2.144670,1.352731
...,...,...,...,...,...,...,...,...,...
135496,-1.040295,-0.016876,-0.642862,-1.268122,-1.244270,-1.239078,0.874076,-0.935071,-0.646786
135497,0.991842,0.168176,-0.642862,1.048629,1.114501,1.079486,0.874076,0.296826,-1.313292
135498,0.451380,1.556060,-0.642862,0.516768,0.588314,0.702719,-2.144558,0.296826,-0.646786
135499,0.667565,-1.774863,1.442462,0.868337,0.951202,0.688228,0.874076,0.296826,0.686225


In [14]:
X_test

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,-0.564688,-0.942132,-0.642862,-0.429765,-0.464061,-0.500036,-0.132136,-0.935071,0.019720
1,-0.175556,1.000906,-0.121531,-0.042137,-0.028595,0.036132,-1.138347,0.912774,-0.646786
2,-1.061913,0.260701,-0.121531,-1.304180,-1.298703,-1.268060,0.874076,0.912774,2.685743
3,0.970223,-0.201927,1.963794,1.048629,0.996563,0.978049,-0.132136,0.296826,0.019720
4,-0.932202,-1.312235,0.399800,-1.006699,-0.990248,-1.065186,-0.132136,-0.935071,0.686225
...,...,...,...,...,...,...,...,...,...
58067,1.013460,1.185958,-0.642862,1.003556,1.041924,1.151941,-1.138347,0.912774,0.019720
58068,-0.997058,0.260701,-1.164193,-1.141917,-1.126331,-1.108659,0.874076,-0.319122,2.019237
58069,-0.197174,-3.347799,1.442462,0.102096,0.071199,-0.224706,-0.132136,2.144670,0.019720
58070,-0.824110,-0.201927,-0.121531,-0.853450,-0.881382,-0.876803,0.874076,0.296826,-0.646786


In [15]:
# Model Training


from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [16]:
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}

In [17]:
def evaluate_model(true_value,predict_value):
    r2_square = r2_score(true_value,predict_value)
    mae = mean_absolute_error(true_value,predict_value)
    mse = mean_squared_error(true_value,predict_value)

    return r2_square,mae,mse

In [18]:
# Define function for creating model

def create_model(X_train,X_test,y_train,y_test,models):

    for i in range(len(models)):    
        model = list(models.values())[i]

        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)

        r2_square,mae,mse = evaluate_model(y_test,y_pred)

        print(list(models.keys())[i])
        print("Model Training Performance")
        print('R2_Score :- ',r2_square)
        print('MSE :- ',mse)
        print("MAE :- ",mae)

        
        print('='*35)
        print('\n')


In [19]:
create_model(X_train,X_test,y_train,y_test,models)

LinearRegression
Model Training Performance
R2_Score :-  0.9368908248567511
MSE :-  1028002.7598132559
MAE :-  674.0255115796832


Lasso
Model Training Performance
R2_Score :-  0.9368940971841704
MSE :-  1027949.4559693959
MAE :-  675.0716923362161


Ridge
Model Training Performance
R2_Score :-  0.9368906732505937
MSE :-  1028005.2293677672
MAE :-  674.0555800798206


Elasticnet
Model Training Performance
R2_Score :-  0.8556494831165181
MSE :-  2351365.382289642
MAE :-  1060.7368759154729


