In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# train test split the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [9]:
# Load the dataset
df = sns.load_dataset('diamonds').sample(n=20000, random_state=42)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1388,0.24,Ideal,G,VVS1,62.1,56.0,559,3.97,4.0,2.47
50052,0.58,Very Good,F,VVS2,60.0,57.0,2201,5.44,5.42,3.26
41645,0.4,Ideal,E,VVS2,62.1,55.0,1238,4.76,4.74,2.95
42377,0.43,Premium,E,VVS2,60.8,57.0,1304,4.92,4.89,2.98
17244,1.55,Ideal,E,SI2,62.3,55.0,6901,7.44,7.37,4.61


In [10]:
# Informatoion about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 1388 to 42680
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    20000 non-null  float64 
 1   cut      20000 non-null  category
 2   color    20000 non-null  category
 3   clarity  20000 non-null  category
 4   depth    20000 non-null  float64 
 5   table    20000 non-null  float64 
 6   price    20000 non-null  int64   
 7   x        20000 non-null  float64 
 8   y        20000 non-null  float64 
 9   z        20000 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 1.3 MB


In [11]:
# Check missing values
df.isna().sum().sort_values(ascending=False)

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [12]:
# Summary 
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,0.796141,61.752835,57.443465,3908.5017,5.727281,5.730017,3.536041
std,0.471719,1.431536,2.260135,3958.733294,1.122634,1.129743,0.694116
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.1,56.0,942.0,4.7075,4.71,2.91
50%,0.7,61.9,57.0,2409.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5290.25,6.54,6.54,4.03
max,4.01,79.0,95.0,18823.0,10.14,31.8,6.31


In [13]:
# Check columns
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

# Regression Tasks

In [15]:
# Select features and variables
X = df.drop('price', axis=1)
y = df['price']

# label encode categorical variables
le = LabelEncoder()
X['cut'] = le.fit_transform(X['cut'])
X['color'] = le.fit_transform(X['color'])
X['clarity'] = le.fit_transform(X['clarity'])

In [16]:
%%time
# split the data into train and test data with 80% training dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionaries of list of models to evaluate performance
models = { 
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor()          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = model.predict(X_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))
    
    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')
# selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('Mean Absolute error for', f"{model[0]} is {model[1]: .2f}") 

Mean Absolute error for XGBRegressor is  296.37
Mean Absolute error for RandomForestRegressor is  297.55
Mean Absolute error for GradientBoostingRegressor is  372.60
Mean Absolute error for DecisionTreeRegressor is  407.51
Mean Absolute error for KNeighborsRegressor is  548.41
Mean Absolute error for LinearRegression is  871.90
Mean Absolute error for SVR is  2792.48
CPU times: total: 1min 14s
Wall time: 1min 27s


In [17]:
%%time
# split the data into train and test data with 80% training dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionaries of list of models to evaluate performance
models = { 
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor()          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = model.predict(X_test)
    metric = r2_score(y_test, y_pred)
    model_scores.append((name, metric))
    
    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')
# selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=True)
for model in sorted_models:
    print('R_squared Score', f"{model[0]} is {model[1]: .2f}")

R_squared Score XGBRegressor is  0.98
R_squared Score RandomForestRegressor is  0.98
R_squared Score GradientBoostingRegressor is  0.97
R_squared Score DecisionTreeRegressor is  0.96
R_squared Score KNeighborsRegressor is  0.94
R_squared Score LinearRegression is  0.88
R_squared Score SVR is -0.14
CPU times: total: 1min 14s
Wall time: 1min 25s


In [18]:
%%time
# split the data into train and test data with 80% training dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a dictionaries of list of models to evaluate performance
models = { 
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor()          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = model.predict(X_test)
    metric = mean_squared_error(y_test, y_pred)
    model_scores.append((name, metric))
    
    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')
# selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('Mean Squared error for', f"{model[0]} is {model[1]: .2f}")

Mean Squared error for XGBRegressor is  333000.19
Mean Squared error for RandomForestRegressor is  383723.81
Mean Squared error for GradientBoostingRegressor is  426418.22
Mean Squared error for DecisionTreeRegressor is  683956.11
Mean Squared error for KNeighborsRegressor is  1010628.54
Mean Squared error for LinearRegression is  1930982.14
Mean Squared error for SVR is  18174211.26
CPU times: total: 1min 14s
Wall time: 1min 24s


# Hyperparameter tuning:

In [19]:
%%time
# Create a dictionaries of list of models to evaluate performance with hyperparameters
models = { 
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100]}),          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

for name, (model, params) in models.items():
    # create a pipline
    pipeline = GridSearchCV(model, params, cv=5)
    
    # fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = pipeline.predict(X_test)
    
      
    # print the performing metric
    print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    print(name, 'R2: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')

LinearRegression MSE:  1930982.1376738544
LinearRegression R2:  0.8791747374834002
LinearRegression MAE:  871.8983991020539


SVR MSE:  17829319.623301506
SVR R2:  -0.11561478583786955
SVR MAE:  2750.0496730271234


DecisionTreeRegressor MSE:  494750.4880003549
DecisionTreeRegressor R2:  0.9690425113590795
DecisionTreeRegressor MAE:  368.5889869029934


RandomForestRegressor MSE:  386456.64805295045
RandomForestRegressor R2:  0.9758186649988735
RandomForestRegressor MAE:  296.64838968055557


KNeighborsRegressor MSE:  972756.5133367346
KNeighborsRegressor R2:  0.9391327559196226
KNeighborsRegressor MAE:  535.442


GradientBoostingRegressor MSE:  424542.10596448864
GradientBoostingRegressor R2:  0.973435584720477
GradientBoostingRegressor MAE:  372.49690982308215


XGBRegressor MSE:  333000.192452789
XGBRegressor R2:  0.9791635381362693
XGBRegressor MAE:  296.3738429412842


CPU times: total: 11min 5s
Wall time: 12min 43s


In [20]:
%%time
# Create a dictionaries of list of models to evaluate performance with hyperparameters
models = { 
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01], 'epsilon': [0.1, 0.01, 0.001]}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10], 'splitter': ['best', 'random']}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100, 1000], 'max_depth': [None, 5, 10]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2), 'weights': ['uniform', 'distance']}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'loss': ['ls', 'lad', 'huber', 'quantile'], 'n_estimators': [10, 100, 1000]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100, 1000], 'learning_rate': [0.1, 0.01, 0.001]}),          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

for name, (model, params) in models.items():
    # create a pipline
    pipeline = GridSearchCV(model, params, cv=5)
    
    # fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = pipeline.predict(X_test)
    
      
    # print the performing metric
    print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    print(name, 'R2: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')

LinearRegression MSE:  1930982.1376738544
LinearRegression R2:  0.8791747374834002
LinearRegression MAE:  871.8983991020539




# Isy abi Check kar ke thek karna he Phr finalize hogi