In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
import os

In [2]:
for _, dirnames, filenames in os.walk('C:\\Users\\ajaym\\Desktop\\DiamondPricePrediction\\JupyterNotebook\\Data'):
    for filename in filenames:
        print(filename)

archive.zip
diamonds.csv


In [3]:
mydf = pd.read_csv('C:\\Users\\ajaym\\Desktop\\DiamondPricePrediction\\JupyterNotebook\\Data\\diamonds.csv')

mydf.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
copiedDf = mydf.copy()

In [5]:
## separating dependent_variables and independent_variables from the dataset

independent_variable = copiedDf.drop(columns=['price'], axis = 1)

In [6]:
independent_variable.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


In [7]:
dependent_variable = copiedDf[['price']]
dependent_variable.head()

Unnamed: 0,price
0,326
1,326
2,327
3,334
4,335


In [8]:
## extracting categorical features and numerical features from the dataset

numerical_features = [feature for feature in independent_variable.columns if independent_variable[feature].dtype != 'O']

categorical_features = [feature for feature in independent_variable.columns if independent_variable[feature].dtype == 'O']

In [9]:
print('The number of numerical_features are {} and they are {}'.format(len(numerical_features), numerical_features))
print('The number of categorical_features are {} and they are {}'.format(len(categorical_features), categorical_features))

The number of numerical_features are 6 and they are ['carat', 'depth', 'table', 'x', 'y', 'z']
The number of categorical_features are 3 and they are ['cut', 'color', 'clarity']


In [10]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [11]:
preprocessor = ColumnTransformer(
transformers = [('ordinalencoder', OrdinalEncoder(), categorical_features),
               ('standardscaler', StandardScaler(), numerical_features)]

)

In [12]:
preprocessor

In [13]:
independent_variable.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


In [14]:
dependent_variable.head()

Unnamed: 0,price
0,326
1,326
2,327
3,334
4,335


In [15]:
X = independent_variable.copy()

In [16]:
processed_data = preprocessor.fit_transform(X)

In [17]:
final_df = pd.DataFrame(processed_data)

In [18]:
final_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,2.0,1.0,3.0,-1.198405,-0.176795,-1.101144,-1.588868,-1.534572,-1.57138
1,3.0,1.0,2.0,-1.240495,-1.364714,1.586944,-1.642293,-1.656782,-1.741097
2,1.0,1.0,4.0,-1.198405,-3.391165,3.379003,-1.499826,-1.456009,-1.741097
3,3.0,5.0,5.0,-1.072134,0.452104,0.2429,-1.366263,-1.31634,-1.288517
4,1.0,6.0,3.0,-1.030043,1.081003,0.2429,-1.241604,-1.211588,-1.1188


In [19]:
### splitting the data into training and testing

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(final_df, dependent_variable, test_size = 0.2, random_state = 42)

In [20]:
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((40000, 9), (10000, 9), (40000, 1), (10000, 1))

In [21]:
## creating function to check the performance of the model using different evaluation metrics

def evaluate_model(actual, predicted):
    mse = mean_squared_error(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    r2score = r2_score(actual, predicted)
    root_mse = np.sqrt(mean_squared_error(actual, predicted))
    return mse, mae, r2score, root_mse

In [22]:
models = {
    "linear_regression" : LinearRegression(),
    "lasso" : Lasso(),
    "ridge" : Ridge(),
    'k_nearest_regressor' : KNeighborsRegressor(),
    'catboost' : CatBoostRegressor(verbose = False),
    'xgboost' : XGBRegressor(),
    'decision_tree' : DecisionTreeRegressor(),
    'random_forest' : RandomForestRegressor(),
    'adaboost' : AdaBoostRegressor(),
    'support_vector_reg' : SVR()
    
}

In [23]:
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    
    ## training the model
    model.fit(X_train, y_train)
    
    ## predicitions
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    ## checking the evaluation metrics
    _,_,model_train_r2score,_ = evaluate_model(y_train, y_train_pred)
    _,_,model_test_r2score,_ = evaluate_model(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    
    print('r2_score of the model in the training dataset')
    print('R2_score : ' ,format(model_train_r2score))
    
    print('-------------------------------------------------------')
    
    print('r2_score of the model in the testing dataset')
    print('R2_score : ',format(model_test_r2score))
    
    r2_list.append(model_test_r2score)
    
    print('='*40)
    print('\n')

linear_regression
r2_score of the model in the training dataset
R2_score :  0.8847556046842044
-------------------------------------------------------
r2_score of the model in the testing dataset
R2_score :  0.8872658964238789


lasso
r2_score of the model in the training dataset
R2_score :  0.8847360430732293
-------------------------------------------------------
r2_score of the model in the testing dataset
R2_score :  0.8878609394078585


ridge
r2_score of the model in the training dataset
R2_score :  0.8847555585831294
-------------------------------------------------------
r2_score of the model in the testing dataset
R2_score :  0.8872780973022866


k_nearest_regressor
r2_score of the model in the training dataset
R2_score :  0.977006728271639
-------------------------------------------------------
r2_score of the model in the testing dataset
R2_score :  0.9642696798439947


catboost
r2_score of the model in the training dataset
R2_score :  0.9880282048337329
---------------------

In [35]:
final_report = pd.DataFrame(r2_list, model_list)

In [36]:
final_report

Unnamed: 0,0
linear_regression,0.887266
lasso,0.887861
ridge,0.887278
k_nearest_regressor,0.96427
catboost,0.983892
xgboost,0.982223
decision_tree,0.965265
random_forest,0.98148
adaboost,0.904155
support_vector_reg,0.56063


In [39]:
final_report[0].max()

0.9838919953640837