In [8]:
import pandas as pd

In [9]:
# Data ingestion: load the raw diamond dataset and preview its first rows
csv_path = 'data/Diamond Price Prediction.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Carat(Weight of Daimond),Cut(Quality),Color,Clarity,Depth,Table,Price(in US dollars),X(length),Y(width),Z(Depth)
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [10]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

146

In [11]:
# Keep only the first occurrence of each duplicated row (rebinds `df`)
df = df.drop_duplicates()

In [12]:
# Sanity check: no duplicated rows should remain in `df`
df.duplicated().sum()

0

In [13]:
# Separate the independent features (X) from the dependent feature (Y)
target_col = 'Price(in US dollars)'
Y = df[target_col]
X = df.drop(columns=[target_col])

In [14]:
# Display the feature frame (the target column has been removed)
X

Unnamed: 0,Carat(Weight of Daimond),Cut(Quality),Color,Clarity,Depth,Table,X(length),Y(width),Z(Depth)
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74


In [16]:
# Segregating categorical and numerical features by dtype:
# object-typed columns are treated as categorical, everything else as numerical
categorical_frame = X.select_dtypes(include='object')
numerical_frame = X.select_dtypes(exclude='object')

cat_feat = categorical_frame.columns
num_feat = numerical_frame.columns

print(cat_feat)
print(num_feat)

Index(['Cut(Quality)', 'Color', 'Clarity'], dtype='object')
Index(['Carat(Weight of Daimond)', 'Depth', 'Table', 'X(length)', 'Y(width)',
       'Z(Depth)'],
      dtype='object')


In [17]:
# Custom ranking for each ordinal variable: a value's position in its list is its rank
cut_feat = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_feat = list('DEFGHIJ')
clarity_feat = 'I1 SI2 SI1 VS2 VS1 VVS2 VVS1 IF'.split()

In [18]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [19]:
# Numerical features: impute missing values with the median, then standardise
num_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

# Categorical features: impute with the most frequent value, encode each
# category as its position in the supplied ranking, then standardise
ordinal_encoder = OrdinalEncoder(categories=[cut_feat, color_feat, clarity_feat])
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder', ordinal_encoder),
    ('scalar', StandardScaler()),
]
cat_pipeline = Pipeline(steps=cat_steps)

# Route each group of columns through its own pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_feat),
        ('cat_pipeline', cat_pipeline, cat_feat),
    ]
)

In [20]:
# Train Test Split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=30,
)

In [22]:
# Fit the preprocessor on the training split only, then apply it to both splits
train_array = preprocessor.fit_transform(X_train)
test_array = preprocessor.transform(X_test)

# Both frames share the column names produced by the fitted transformer
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(train_array, columns=feature_names)
X_test = pd.DataFrame(test_array, columns=feature_names)

In [24]:
# Preview the transformed training features (columns are prefixed with their pipeline name)
X_train.head()

Unnamed: 0,num_pipeline__Carat(Weight of Daimond),num_pipeline__Depth,num_pipeline__Table,num_pipeline__X(length),num_pipeline__Y(width),num_pipeline__Z(Depth),cat_pipeline__Cut(Quality),cat_pipeline__Color,cat_pipeline__Clarity
0,-1.051584,-0.033284,0.243924,-1.284193,-1.278314,-1.277823,0.980476,-0.937929,-0.639483
1,0.429992,1.435995,0.691446,0.465214,0.525171,0.681416,-0.810878,-0.937929,-0.029483
2,0.916795,0.036682,-0.651121,1.036449,0.989434,1.012758,0.980476,-0.349957,1.190516
3,-1.030418,0.87627,-0.203599,-1.284193,-1.260458,-1.191386,-0.810878,-0.937929,-0.639483
4,0.218338,1.016201,-0.651121,0.375959,0.346608,0.494136,-0.810878,1.413958,-0.639483


In [25]:
## Model Training
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [27]:
# Fit a plain linear regression on the preprocessed training split
reg = LinearRegression()
reg.fit(X_train,y_train)

In [28]:
# Inspect the fitted coefficients (one per column of X_train)
reg.coef_

array([ 5160.67602012,  -121.55709618,   -63.07364595, -1168.73269142,
         145.14252022,    -6.80040089,   136.53534809,  -546.34323226,
         815.72120944])

In [33]:
import numpy as np
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted target values, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it, rather than
    # evaluating mean_squared_error a second time.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)

    return mae, rmse, r2_square

In [37]:
# Train multiple models and compare them on the held-out test split
models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet(),
}

train_model_list = []  # never populated in this cell; kept in case later cells reference it
model_list = []        # model names, in evaluation order
r2_list = []           # R^2 score of each model, aligned with model_list

# Iterate over (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every iteration.
for model_name, model in models.items():
    model.fit(X_train,y_train)

    # Make predictions on the test split
    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test,y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE(review): the label says "Training", but the metrics below are
    # computed from y_test / X_test, i.e. they are test-set scores.
    print('Model Training Performance')
    print('RMSE:', rmse)
    print('MAE:', mae)
    print('R2 Square:', r2_square)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 1228.3374233355175
MAE: 805.925377319053
R2 Square: 0.9051115842202946


Lasso
Model Training Performance
RMSE: 1227.1733488586272
MAE: 806.8005526586712
R2 Square: 0.9052913472750835


Ridge
Model Training Performance
RMSE: 1228.28666852301
MAE: 806.0188660785935
R2 Square: 0.9051194256230903


ElasticNet
Model Training Performance
RMSE: 1642.540239949717
MAE: 1078.1266417701347
R2 Square: 0.8303281228188145


