<a href="https://www.kaggle.com/code/msreevarshini/notebook1f4b3fa7d2?scriptVersionId=134781205" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# Importing pandas for data analysis
import pandas as pd

# Importing numpy for n-dimensional arrays 
import numpy as np

# Importing matplotlib.pyplot for data visualisation 
import matplotlib.pyplot as plt

# Importing LabelEncoder from scikit-learn for encoding categorial features into numerical values
from sklearn.preprocessing import LabelEncoder

# Importing MinMaxScaler from scikit-learn for scaling numerical features to a specific range
from sklearn.preprocessing import MinMaxScaler

# Importing train-test-split from scikit-learn to split the dataset into trainset and testset
from sklearn.model_selection import train_test_split

# Importing blank model using LinearRegression
from sklearn.linear_model import LinearRegression

# Importing blank model using Ridge
from sklearn.linear_model import Ridge

# Importing blank model using Lasso
from sklearn.linear_model import Lasso

# Importing blank model using ElasticNet
from sklearn.linear_model import ElasticNet

# Importing blank model using DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor

# Importing blank model using RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

# Importing blank model using SupportVectorRegressor
from sklearn.svm import SVR

# Importing blank model using GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Importing accuracy_score to evaluate the ml models
from sklearn.metrics import accuracy_score

In [2]:
# Loading the dataset
dataframe = pd.read_csv("/kaggle/input/diamonds/diamonds.csv")

In [3]:
dataframe

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [4]:
# Removing the index column since its not required
dataframe.drop(dataframe.columns[0], axis=1, inplace=True)

In [5]:
# Extracting the price from the dataset
Y = dataframe['price']
X = dataframe.drop('price', axis=1)

In [6]:
Y

0         326
1         326
2         327
3         334
4         335
         ... 
53935    2757
53936    2757
53937    2757
53938    2757
53939    2757
Name: price, Length: 53940, dtype: int64

In [7]:
# Finding unique elements
print(f"Count of unique CUTS : {len(X['cut'].unique())}")
print(f"Count of unique COLORS : {len(X['color'].unique())}")
print(f"Count of unique CLARITIES : {len(X['clarity'].unique())}")

Count of unique CUTS : 5
Count of unique COLORS : 7
Count of unique CLARITIES : 8


In [8]:
# Performing Label_Encoding
label_encoder = LabelEncoder()

# Encoding CUT, COLOR, CLARITY columns
# Mapping the encoded numerical values with their original categorical values with the help of a dictionary 
X['cut'] = label_encoder.fit_transform(X['cut'])
cut_map = {index: label for index, label in enumerate(label_encoder.classes_)}

X['color'] = label_encoder.fit_transform(X['color'])
color_map = {index: label for index, label in enumerate(label_encoder.classes_)}

X['clarity'] = label_encoder.fit_transform(X['clarity'])
clarity_map = {index: label for index, label in enumerate(label_encoder.classes_)}

In [9]:
print(cut_map)
print(color_map)
print(clarity_map)

{0: 'Fair', 1: 'Good', 2: 'Ideal', 3: 'Premium', 4: 'Very Good'}
{0: 'D', 1: 'E', 2: 'F', 3: 'G', 4: 'H', 5: 'I', 6: 'J'}
{0: 'I1', 1: 'IF', 2: 'SI1', 3: 'SI2', 4: 'VS1', 5: 'VS2', 6: 'VVS1', 7: 'VVS2'}


In [10]:
pd.DataFrame(X)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,2,1,3,61.5,55.0,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,4.20,4.23,2.63
4,0.31,1,6,3,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
53935,0.72,2,0,2,60.8,57.0,5.75,5.76,3.50
53936,0.72,1,0,2,63.1,55.0,5.69,5.75,3.61
53937,0.70,4,0,2,62.8,60.0,5.66,5.68,3.56
53938,0.86,3,4,3,61.0,58.0,6.15,6.12,3.74


In [11]:
# Preprocessing the data by feature scaling
scaling = MinMaxScaler()
X = scaling.fit_transform(X)

In [12]:
# Splitting the data into train_set and test_set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [13]:
# Applying Linear Regression 
standard_model = LinearRegression()
LR_lasso_model = Lasso(alpha=1)
LR_ridge_model = Ridge(alpha=1)
elastic_model = ElasticNet(alpha=1.0, l1_ratio=0.5)

standard_model.fit(X_train, Y_train)
LR_lasso_model.fit(X_train, Y_train)
LR_ridge_model.fit(X_train, Y_train)
elastic_model.fit(X_train, Y_train)

In [14]:
# Applying Random Forest Regression
ran_for = RandomForestRegressor()

ran_for.fit(X_train, Y_train)

In [15]:
# Applying Decision Tree Regression
decision = DecisionTreeRegressor()

decision.fit(X_train, Y_train)

In [16]:
# Applying Support Vector Regression 
support = SVR(kernel="rbf")

support.fit(X_train, Y_train)

In [17]:
# Applying Gradient Boosting Regression
g_boost = GradientBoostingRegressor()

g_boost.fit(X_train, Y_train)

In [18]:
# Analysing the accuracy with and without regularisation(-adding a penalty term) and other regression algorithms
print(f"Without Regularisation: {100*(standard_model.score(X_test, Y_test))}")
print(f"Lasso Regularisation: {100*(LR_lasso_model.score(X_test, Y_test))}")
print(f"Ridge Regularisation: {100*(LR_ridge_model.score(X_test, Y_test))}")
print(f"Elastic Net Regression: {100*(elastic_model.score(X_test, Y_test))}")
print(f"Random Forest Regression: {100*(ran_for.score(X_test, Y_test))}")
print(f"Decision Tree Regression: {100*(decision.score(X_test, Y_test))}")
print(f"Support Vector Regression: {100*(support.score(X_test, Y_test))}")
print(f"Gradient Boosting Regression: {100*(g_boost.score(X_test, Y_test))}")


Without Regularisation: 85.53420955062482
Lasso Regularisation: 88.61498208660544
Ridge Regularisation: 88.64021322188307
Elastic Net Regression: 7.371731630352752
Random Forest Regression: 98.02415649953721
Decision Tree Regression: 96.29899163584396
Support Vector Regression: 32.40197237091438
Gradient Boosting Regression: 96.99038480417622
