In [51]:
import pandas as pd
import numpy as np

In [52]:
# Load the gemstone dataset from the project's data/ folder.
# A forward-slash relative path is used because it resolves on Windows, macOS
# and Linux alike; the backslash form is treated as part of the file name on
# POSIX systems and the read fails there.
df = pd.read_csv("data/gemstone.csv")

In [53]:
# Show two randomly drawn rows as a quick sanity check of the loaded frame.
df.sample(n=2)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
143134,143134,1.02,Premium,G,VS2,59.3,59.0,6.5,6.56,3.88,6486
98928,98928,1.01,Good,D,SI2,64.3,58.0,6.27,6.32,4.03,4060


In [54]:
# Remove the "id" column before modelling.
# `columns=` states the intent directly (the previous `axis=True` only worked
# because True == 1), and errors="ignore" keeps this self-overwriting cell
# safe to re-run once "id" has already been dropped from `df`.
df = df.drop(columns="id", errors="ignore")

In [55]:
# Separate the feature matrix from the regression target ("price").
x = df.drop(columns=["price"])
y = df.loc[:, "price"]

# Group feature names by dtype: object-typed columns are handled by the
# categorical pipeline, all remaining columns by the numerical pipeline.
categorical_columns = x.select_dtypes(include=["object"]).columns
numerical_columns = x.select_dtypes(exclude=["object"]).columns

In [56]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [57]:
# Numerical features: fill missing values with the column mean (the
# SimpleImputer default, spelled out here), then standardize each column.
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

In [58]:
# Explicit category orderings passed to OrdinalEncoder: each label is encoded
# as its position in the corresponding list (0, 1, 2, ...).
# NOTE(review): cut and clarity look ordered from lowest to highest grade,
# while color D..J is commonly graded best-to-worst - confirm the intended
# direction of each scale.
cut_categories = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
color_categories = [
    "D", "E", "F", "G", "H", "I", "J",
]
clarity_categories = [
    "I1",
    "SI2", "SI1",
    "VS2", "VS1",
    "VVS2", "VVS1",
    "IF",
]

In [59]:
# Categorical features: fill missing values with the most frequent label,
# then map each label to its rank in the hand-written ordering.
# The `categories` list is positional: its entries must line up with the
# order of the categorical columns fed to this pipeline (cut, color, clarity).
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "ordinalencoder",
        OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories]
        ),
    ),
])
    


In [60]:
# Route each column group through its own pipeline and concatenate the
# results: numerical columns first, then the ordinal-encoded categoricals.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical_pipeline", numerical_pipeline, numerical_columns),
        ("categorical_pipeline", categorical_pipeline, categorical_columns),
    ]
)

In [61]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation.
# random_state is fixed so the split (and every metric computed from it) is
# identical on each Restart & Run All instead of changing with every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=42
)

In [62]:
# Exploratory check: fit the preprocessor on the training features and show
# the resulting array (scaled numeric columns followed by the ordinal codes).
preprocessor.fit_transform(X=x_train)


array([[-1.06035577, -0.66402845, -0.63981151, ...,  4.        ,
         1.        ,  5.        ],
       [-0.93061192,  1.36605897,  0.92462733, ...,  2.        ,
         1.        ,  1.        ],
       [ 0.23708271,  1.55061237,  0.92462733, ...,  1.        ,
         2.        ,  2.        ],
       ...,
       [-1.06035577,  1.27378227, -0.63981151, ...,  2.        ,
         4.        ,  5.        ],
       [-1.01710782,  1.18150556, -0.63981151, ...,  1.        ,
         2.        ,  4.        ],
       [ 1.66426503, -2.97094598,  0.92462733, ...,  3.        ,
         3.        ,  3.        ]])

In [66]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transform to the test split (no refit on test data).
# The original row index is carried over so the transformed frames stay
# label-aligned with y_train / y_test; without it they would get a fresh
# 0..n-1 index that no longer matches the shuffled target index.
# NOTE: this cell overwrites x_train / x_test, so run it exactly once after
# the split cell - a second run would feed already-transformed frames back in.
x_train = pd.DataFrame(
    preprocessor.fit_transform(x_train),
    columns=preprocessor.get_feature_names_out(),
    index=x_train.index,
)
x_test = pd.DataFrame(
    preprocessor.transform(x_test),
    columns=preprocessor.get_feature_names_out(),
    index=x_test.index,
)

In [63]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [67]:
# Candidate linear regressors keyed by display name; every estimator is
# created with its default hyper-parameters.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet(),
}