In [1]:
import pandas as pd


In [2]:
df=pd.read_csv("Diamonds Prices2022.csv")
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df.drop(labels=["id"],axis=1,inplace=True)

In [4]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [5]:
#independent features and dependent
x=df.drop(labels=['price'],axis=1)
y=df[['price']]

In [6]:
x

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
53938,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74
53939,0.75,Ideal,D,SI2,62.2,55.0,5.83,5.87,3.64
53940,0.71,Premium,E,SI1,60.5,55.0,5.79,5.74,3.49
53941,0.71,Premium,F,SI1,59.8,62.0,5.74,5.73,3.43


In [7]:
y

Unnamed: 0,price
0,326
1,326
2,327
3,334
4,335
...,...
53938,2757
53939,2757
53940,2756
53941,2756


In [8]:
#segregating numerical and object 
categorical_cols=x.select_dtypes(include="object").columns
numerical_cols=x.select_dtypes(exclude="object").columns

In [9]:
categorical_cols,numerical_cols

(Index(['cut', 'color', 'clarity'], dtype='object'),
 Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))

In [10]:
cut_ord=["Fair" ,"Good" ,"Very Good" ,"Premium" ,"Ideal"]
color_ord=["J" ,"I" ,"H" ,"G" ,"F" ,"E" ,"D"]
clarity_ord=["I1" ,"SI2" ,"SI1" ,"VS2" ,"VS1" ,"VVS2" ,"VVS1" ,"IF"]

In [11]:
from sklearn.impute import SimpleImputer    #handling missing values
from sklearn.preprocessing import StandardScaler  #handling feature scaling
from sklearn.preprocessing import OrdinalEncoder  #handling categorical values
from sklearn.pipeline import Pipeline             #pipeline for automatic flow
from sklearn.compose import ColumnTransformer


In [12]:
##data transformation
##numerical pipeline
numerical_pipeline=Pipeline(
    steps=[
        ("scaler",StandardScaler()),
        ("imputer",SimpleImputer(strategy="median"))
        ]
                             )

In [13]:
##categorical pipeline
categorical_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy='most_frequent')),
        ("OrdinalEncoder",OrdinalEncoder(categories=[cut_ord,color_ord,clarity_ord])),
        ("scaler",StandardScaler())
    ]
)

In [14]:
preprocessor=ColumnTransformer(
    [
    ("num_pipeline",numerical_pipeline,numerical_cols),
    ("cat_pipeline",categorical_pipeline,categorical_cols)
],
)

In [15]:
#train and test split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.33,random_state=30)
x_train.shape,x_test.shape

((36141, 9), (17802, 9))

In [16]:
x_train=pd.DataFrame(preprocessor.fit_transform(x_train),columns=preprocessor.get_feature_names_out())
x_test=pd.DataFrame(preprocessor.transform(x_test),columns=preprocessor.get_feature_names_out())

In [17]:
x_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.430938,-1.158160,2.043036,0.689843,0.639334,0.523271,0.087478,-1.415225,0.572477
1,0.261545,-0.180531,-1.999918,0.475424,0.492029,0.465686,0.981326,0.352245,-1.249191
2,2.082522,0.517776,-0.203049,1.815543,1.791775,1.905293,-0.806370,1.530559,-1.249191
3,0.452112,0.797099,0.246168,0.564765,0.578679,0.681627,-0.806370,-1.415225,-0.641968
4,0.494460,-0.739176,1.144602,0.689843,0.708653,0.609647,0.981326,-1.415225,-0.641968
...,...,...,...,...,...,...,...,...,...
36136,-1.051253,0.587607,-1.550701,-1.275666,-1.214970,-1.204258,0.981326,-0.236911,2.394145
36137,-0.585422,-0.390023,-0.652267,-0.480529,-0.313813,-0.484454,0.981326,0.352245,-0.034745
36138,-0.606596,0.098792,-1.101484,-0.525199,-0.487112,-0.498850,0.981326,-0.236911,1.179700
36139,0.430938,1.565237,0.695385,0.502226,0.440039,0.681627,-1.700217,1.530559,-1.249191


In [18]:
x_test

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,1.489646,-3.183251,2.043036,1.708334,1.627140,1.185490,-1.700217,-1.415225,0.572477
1,1.553168,0.098792,-0.203049,1.449244,1.436511,1.473411,0.981326,-0.826068,1.179700
2,0.219196,1.285914,-0.203049,0.332477,0.266740,0.465686,-1.700217,-0.236911,-1.249191
3,0.748550,-0.040869,0.695385,0.850657,0.864623,0.868776,0.981326,-1.415225,-0.034745
4,-0.627770,1.076422,-1.999918,-0.596672,-0.556432,-0.470058,-0.806370,-0.236911,1.786922
...,...,...,...,...,...,...,...,...,...
17797,-0.458377,-0.110700,-0.203049,-0.346517,-0.313813,-0.340493,0.981326,-0.826068,2.394145
17798,-0.839512,0.378115,-1.101484,-0.900433,-0.851041,-0.844356,0.981326,-1.415225,-0.641968
17799,0.219196,0.308284,2.492253,0.368214,0.431374,0.451290,-1.700217,1.530559,-1.249191
17800,-0.648944,-0.669346,1.144602,-0.543068,-0.599757,-0.642811,0.087478,0.352245,-1.249191


## model training

In [19]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

regresion=LinearRegression()
regresion.fit(x_train,y_train)

In [20]:
y_pred=regresion.predict(x_test)

In [21]:
regresion.coef_

array([[ 5.28105426e+03, -1.26311069e+02, -5.94055935e+01,
        -1.18022809e+03,  3.06483217e+01, -3.36832996e+00,
         1.46245917e+02,  5.48702475e+02,  8.12338072e+02]])

In [22]:
r2_score(y_test,y_pred)

0.9019674227689983

In [23]:
def evaluate_model(actual,pred):
    r2=r2_score(actual,pred)
    mae=mean_absolute_error(actual,pred)
    mse=mean_squared_error(actual,pred)
    return r2,mae,mse


In [24]:
evaluate_model(y_test,y_pred)

(0.9019674227689983, 803.5930239714152, 1573179.8713194362)

## train multiple models

In [34]:
models={
        "LinearRegression":LinearRegression(),
        "Lasso":Lasso(),
        "Ridge":Ridge(),
        "ElasticNet":ElasticNet()
        }

trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(x_train,y_train)

    #makes prediction
    y_pred=model.predict(x_test)
    r2,mae,mse=evaluate_model(y_test,y_pred)
    model_list.append(list(models.keys())[i])

    print(f"model performance {list(models.keys())[i]}")
    print("R-squared-->",r2*100)
    print("MAE      -->",mae)
    print("MSE      -->",mse)
    print("\n*******************************\n")



model performance LinearRegression
R-squared--> 90.19674227689983
MAE      --> 803.5930239714152
MSE      --> 1573179.8713194362

*******************************

model performance Lasso
R-squared--> 90.21257566573665
MAE      --> 804.7183903638418
MSE      --> 1570639.004873158

*******************************

model performance Ridge
R-squared--> 90.19829964462294
MAE      --> 803.6968828835135
MSE      --> 1572929.9524023354

*******************************

model performance ElasticNet
R-squared--> 83.27440512793112
MAE      --> 1075.1652679303222
MSE      --> 2684043.399836414

*******************************

