In [24]:
import pandas as pd

df= pd.read_csv('data/gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [25]:
df.drop(columns='id',inplace=True)

In [26]:
X = df.drop(columns='price',axis=1)
y = df['price']

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [28]:
categorical_columns = X.select_dtypes(include='object').columns
numerical_columns = X.select_dtypes(exclude='object').columns
print(categorical_columns)
print(numerical_columns)


Index(['cut', 'color', 'clarity'], dtype='object')
Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')


In [29]:
cut_catagory = list(df['cut'].unique())
color_catagory = list(df['color'].unique())
clarity_catagory = list(df['clarity'].unique())

In [30]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(135501, 9)
(58072, 9)
(135501,)
(58072,)


In [31]:
num_pipeline = Pipeline(steps=
                        [
                                ('imputer',SimpleImputer(strategy='median')),
                                ('scaler',StandardScaler())
                                ]
                        )
cat_pipeline = Pipeline(steps=
                        [
                            ('imputer',SimpleImputer(strategy='most_frequent')),
                            ('ordinal_encoder',OrdinalEncoder(categories=[cut_catagory,color_catagory,clarity_catagory]))
                        ])
preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_columns),
    ('cat_pipeline',cat_pipeline,categorical_columns)
]
    )

In [32]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [33]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,2.0,3.0,5.0
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,1.0,5.0,2.0
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,0.0,5.0,0.0
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,2.0,0.0,6.0
4,-0.995648,0.25823,0.400636,-1.176382,-1.152082,-1.136403,1.0,6.0,3.0


In [34]:
# model training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [35]:
import numpy as np

def model_evaluate(true,predicted):
    mae = mean_squared_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2 = r2_score(true,predicted)

    return mae,mse,rmse,r2



In [36]:
models = {
    'Linear Regression' : LinearRegression(),
    'Lasso' : Lasso(),
    'Ridge' : Ridge(),
    'ElasticNet' : ElasticNet()
}

trained_model_list = []
model_list = []
r2_list = []




In [52]:
models.values()

dict_values([LinearRegression(), Lasso(), Ridge(), ElasticNet()])

In [55]:
for model in models.values():
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)

    mae,mse,rmse,r2 = model_evaluate(y_test,y_pred)
    print('Model Name : ',model)
    print('='*20)
    print('Model Performance')
    print('Mean Absolute error',mae)
    print('Mean squared error',mse)
    print('Root mean squared error',rmse)
    print('R2 Score',r2)

Model Name :  LinearRegression()
Model Performance
Mean Absolute error 1432389.7972102694
Mean squared error 1432389.7972102694
Root mean squared error 1196.824881597249
R2 Score 0.911356057138532
Model Name :  Lasso()
Model Performance
Mean Absolute error 1432323.3691459396
Mean squared error 1432323.3691459396
Root mean squared error 1196.7971294860042
R2 Score 0.911360168062493
Model Name :  Ridge()
Model Performance
Mean Absolute error 1432403.9485246255
Mean squared error 1432403.9485246255
Root mean squared error 1196.8307936064418
R2 Score 0.9113551813795007
Model Name :  ElasticNet()
Model Performance
Mean Absolute error 2660372.0536648366
Mean squared error 2660372.0536648366
Root mean squared error 1631.0646994110432
R2 Score 0.8353619463259178
