In [68]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder,MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline,Pipeline
from xgboost import XGBRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,median_absolute_error,mean_squared_log_error,r2_score,explained_variance_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [30]:
# Load the diamonds dataset from a CSV file in the current working directory.
data = pd.read_csv('diamonds.csv')
# Keep an independent copy of the frame exactly as loaded; later cells reassign
# and modify `data` (column drop, ordinal encoding).
data1 = data.copy()
# Preview the first five rows.
data.head()

Unnamed: 0,rownames,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [31]:
# Print column dtypes and non-null counts (quick check for missing values).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   rownames  53940 non-null  int64  
 1   carat     53940 non-null  float64
 2   cut       53940 non-null  object 
 3   color     53940 non-null  object 
 4   clarity   53940 non-null  object 
 5   depth     53940 non-null  float64
 6   table     53940 non-null  float64
 7   price     53940 non-null  int64  
 8   x         53940 non-null  float64
 9   y         53940 non-null  float64
 10  z         53940 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 4.5+ MB


In [32]:
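# NOTE: disabled helper, kept for reference only. For every pair of columns whose
# absolute correlation exceeds `threshold`, it collects the later-positioned column.
# The sample call at the bottom uses `numerical_features`, which is only defined in
# a later cell, so this cell cannot simply be un-commented where it stands.
# def R_column_extraction(data,threshold):
#     column = set()
#     cor_metrics = data.corr()
#     for i in range(len(data.columns)):
#         for j in range(i):
#             if abs(cor_metrics.iloc[i,j]) > threshold:
#                 result = data.columns[i]
#                 column.add(result)
#     return column
# R_column_extraction(data[numerical_features],0.8)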

In [34]:
# Remove the `rownames` column (it appears to be just a running row counter) so it
# is not picked up as a model feature.
# errors='ignore' makes the cell idempotent: running it again after the column is
# already gone is a no-op instead of raising KeyError.
data = data.drop(columns=['rownames'], errors='ignore')

In [35]:
# Columns that `describe()` summarises by default, i.e. the numeric ones.
numerical_features = data.describe().columns

# Object-dtype columns, found by checking each column's dtype.
categorical_features = [
    name for name in data.columns
    if data[name].dtype == 'O'
]

# The same selection expressed as "every column that is not numeric".
categorical_features1 = [
    name for name in data.columns
    if name not in numerical_features
]

In [38]:
# Treat every categorical column as ordinal. This binds a second name to the
# same list object, so mutating either list also changes the other.
ordinal_features = categorical_features

In [36]:
# Integer codes for the three graded attributes; within each map a larger code
# means a higher rank in the ordering chosen here.
cut_map = {"Fair":0, "Good":1, "Very Good":2, "Premium":3, "Ideal":4}
color_map = {"D":6, "E":5, "F":4, "G":3, "H":2, "I":1, "J":0}
clarity_map = {"IF":7, "VVS1":6, "VVS2":5, "VS1":4, "VS2":3, "SI1":2, "SI2":1, "I1":0}

# Encode each graded column of `data`.
# A bare `Series.map(dict)` converts every value that is not a dictionary key to
# NaN. That makes the cell destructive when it is run twice (the integer codes are
# not keys, so the whole column becomes NaN) and hides unexpected labels.
# Both situations are handled explicitly below.
for _column, _mapping in (('cut', cut_map), ('color', color_map), ('clarity', clarity_map)):
    _present = data[_column].dropna()

    # Every non-null value is already one of the integer codes: an earlier run of
    # this cell has encoded the column, so leave it alone.
    if _present.isin(list(_mapping.values())).all():
        continue

    # Refuse to encode labels the map does not know instead of turning them into NaN.
    _unknown = sorted(set(_present.unique()) - set(_mapping), key=str)
    if _unknown:
        raise ValueError(f"Unexpected labels in column {_column!r}: {_unknown}")

    data[_column] = data[_column].map(_mapping)

In [41]:
# The target is the price; every remaining column is a predictor.
y = data['price']
x = data.drop(columns=['price'])

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=142,
)

In [50]:
# Every predictor column is routed through the scaler.
num = x.columns

# Min-max scaling, wrapped in a one-step pipeline.
scalar = make_pipeline(MinMaxScaler())

# Scale the columns listed in `num`; any column not listed would pass through unchanged.
preprocessors = make_column_transformer(
    (scalar, num),
    remainder='passthrough',
)

# Linear model: preprocessing followed by LinearRegression.
lr = make_pipeline(
    preprocessors,
    LinearRegression(n_jobs=-1),
)

# Gradient-boosted regressor with library defaults (not wrapped in the preprocessing pipeline).
xgb = XGBRegressor()

In [62]:
# LinearRegression: train the pipeline on the training split.
# `fit` returns the fitted pipeline itself, so leaving the call as the cell's last
# expression displays the same pipeline repr.
lr.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline',
                                                  Pipeline(steps=[('minmaxscaler',
                                                                   MinMaxScaler())]),
                                                  Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))])),
                ('linearregression', LinearRegression(n_jobs=-1))])

In [63]:
# Predict prices for the held-out test split with the fitted linear pipeline.
lr_pred = lr.predict(x_test)

In [64]:
# MSE squares the residuals, so it is sensitive to outliers; whether it is the
# right headline metric here should be checked against domain knowledge.
lr_mse = mean_squared_error(y_test, lr_pred)
# Display the value just computed. The cell used to echo `mse`, a name that is not
# assigned in any visible cell and raises NameError on a fresh kernel.
lr_mse

1527757.338535055

In [75]:
# Remaining test-set metrics for the linear-regression pipeline
# (`lr_mse` is computed in the previous cell).
lr_mae = mean_absolute_error(y_test, lr_pred)
lr_r2 = r2_score(y_test, lr_pred)
lr_medae = median_absolute_error(y_test, lr_pred)
lr_evs = explained_variance_score(y_test, lr_pred)
# Mean squared log error is left disabled, presumably because a linear model can
# predict negative prices, which mean_squared_log_error rejects (confirm before enabling).
# lr_mlse = mean_squared_log_error(y_test, lr_pred)
# print(f'Mean Squared Log Error: {round(lr_mlse,4)}')

# Report one metric per line. The previous triple-quoted string mixed literal
# newlines with "\n" escapes, which produced irregular blank lines in the output.
print(
    f"Mean Squared Error: {round(lr_mse, 2)}\n"
    f"Mean Absolute Error: {round(lr_mae, 2)}\n"
    f"R2 Score: {round(lr_r2, 2)}\n"
    f"Median Absolute Error: {round(lr_medae, 2)}\n"
    f"Explained Variance Score: {round(lr_evs, 2)}"
)


Mean Squared Error:1527757.34
Mean Absolute Error: 804.22

R2 Score: 0.91
Median Absolute Error: 571.11

Explained Variance Score: 0.91



In [65]:

# Train the XGBoost regressor on the training split and, since `fit` returns the
# fitted estimator, predict the held-out test split in the same expression.
xgb_pred = xgb.fit(x_train, y_train).predict(x_test)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [71]:
# Test-set metrics for the XGBoost regressor.
x_mse = mean_squared_error(y_test, xgb_pred)
x_mae = mean_absolute_error(y_test, xgb_pred)
x_r2 = r2_score(y_test, xgb_pred)
x_medae = median_absolute_error(y_test, xgb_pred)
x_evs = explained_variance_score(y_test, xgb_pred)
x_mlse = mean_squared_log_error(y_test, xgb_pred)

# Report one metric per line. The previous triple-quoted string mixed literal
# newlines with "\n" escapes, which produced irregular blank lines in the output.
print(
    f"Mean Squared Error: {round(x_mse, 2)}\n"
    f"Mean Absolute Error: {round(x_mae, 2)}\n"
    f"R2 Score: {round(x_r2, 2)}\n"
    f"Median Absolute Error: {round(x_medae, 2)}\n"
    f"Explained Variance Score: {round(x_evs, 2)}\n"
    f"Mean Squared Log Error: {round(x_mlse, 4)}"
)

Mean Squared Error:295734.23

Mean Absolute Error: 273.22
R2 Score: 0.98

Median Absolute Error: 106.73
Explained Variance Score: 0.98

Mean Squared Log Error: 0.0089
