In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
diamonds_data = sns.load_dataset("diamonds")
diamonds_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [5]:
diamonds_data.isnull().mean()

carat      0.0
cut        0.0
color      0.0
clarity    0.0
depth      0.0
table      0.0
price      0.0
x          0.0
y          0.0
z          0.0
dtype: float64

In [10]:
cate = ["cut","color","clarity"]
numerical = diamonds_data.drop(cate,axis=1)
categorical = diamonds_data.filter(cate,axis=1)
numerical.head(), categorical.head()

(   carat  depth  table  price     x     y     z
 0   0.23   61.5   55.0    326  3.95  3.98  2.43
 1   0.21   59.8   61.0    326  3.89  3.84  2.31
 2   0.23   56.9   65.0    327  4.05  4.07  2.31
 3   0.29   62.4   58.0    334  4.20  4.23  2.63
 4   0.31   63.3   58.0    335  4.34  4.35  2.75,
        cut color clarity
 0    Ideal     E     SI2
 1  Premium     E     SI1
 2     Good     E     VS1
 3  Premium     I     VS2
 4     Good     J     SI2)

In [11]:
cuts = pd.get_dummies(categorical["cut"])
cuts.head()

Unnamed: 0,Ideal,Premium,Very Good,Good,Fair
0,1,0,0,0,0
1,0,1,0,0,0
2,0,0,0,1,0
3,0,1,0,0,0
4,0,0,0,1,0


In [12]:
colors = pd.get_dummies(categorical["color"])
clarities = pd.get_dummies(categorical["clarity"])

num_cate_data = pd.concat([cuts,colors,clarities],axis=1)
num_cate_data.head()

Unnamed: 0,Ideal,Premium,Very Good,Good,Fair,D,E,F,G,H,I,J,IF,VVS1,VVS2,VS1,VS2,SI1,SI2,I1
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0


In [13]:
processed_data = pd.concat([numerical,num_cate_data],axis=1)
processed_data

Unnamed: 0,carat,depth,table,price,x,y,z,Ideal,Premium,Very Good,...,I,J,IF,VVS1,VVS2,VS1,VS2,SI1,SI2,I1
0,0.23,61.5,55.0,326,3.95,3.98,2.43,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0.21,59.8,61.0,326,3.89,3.84,2.31,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,0.23,56.9,65.0,327,4.05,4.07,2.31,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0.29,62.4,58.0,334,4.20,4.23,2.63,0,1,0,...,1,0,0,0,0,0,1,0,0,0
4,0.31,63.3,58.0,335,4.34,4.35,2.75,0,0,0,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,2757,5.75,5.76,3.50,1,0,0,...,0,0,0,0,0,0,0,1,0,0
53936,0.72,63.1,55.0,2757,5.69,5.75,3.61,0,0,0,...,0,0,0,0,0,0,0,1,0,0
53937,0.70,62.8,60.0,2757,5.66,5.68,3.56,0,0,1,...,0,0,0,0,0,0,0,1,0,0
53938,0.86,61.0,58.0,2757,6.15,6.12,3.74,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [30]:
processed_data.var()

carat        2.246867e-01
depth        2.052404e+00
table        4.992948e+00
price        1.591563e+07
x            1.258347e+00
y            1.304472e+00
z            4.980109e-01
Ideal        2.399115e-01
Premium      1.903078e-01
Very Good    1.738215e-01
Good         8.268201e-02
Fair         2.895761e-02
D            1.098286e-01
E            1.486419e-01
F            1.456093e-01
G            1.655220e-01
H            1.302510e-01
I            9.041668e-02
J            4.934874e-02
IF           3.208437e-02
VVS1         6.317016e-02
VVS2         8.509994e-02
VS1          1.285384e-01
VS2          1.756121e-01
SI1          1.835496e-01
SI2          1.413985e-01
I1           1.354902e-02
dtype: float64

In [14]:
X = processed_data.drop(["price"],axis=1)
y = processed_data["price"]
X.head(),y.head()

(   carat  depth  table     x     y     z  Ideal  Premium  Very Good  Good  \
 0   0.23   61.5   55.0  3.95  3.98  2.43      1        0          0     0   
 1   0.21   59.8   61.0  3.89  3.84  2.31      0        1          0     0   
 2   0.23   56.9   65.0  4.05  4.07  2.31      0        0          0     1   
 3   0.29   62.4   58.0  4.20  4.23  2.63      0        1          0     0   
 4   0.31   63.3   58.0  4.34  4.35  2.75      0        0          0     1   
 
    ...  I  J  IF  VVS1  VVS2  VS1  VS2  SI1  SI2  I1  
 0  ...  0  0   0     0     0    0    0    0    1   0  
 1  ...  0  0   0     0     0    0    0    1    0   0  
 2  ...  0  0   0     0     0    1    0    0    0   0  
 3  ...  1  0   0     0     0    0    1    0    0   0  
 4  ...  0  1   0     0     0    0    0    0    1   0  
 
 [5 rows x 26 columns],
 0    326
 1    326
 2    327
 3    334
 4    335
 Name: price, dtype: int64)

In [17]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((43152, 26), (10788, 26), (43152,), (10788,))

In [25]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [28]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

rfr_reg = RandomForestRegressor(random_state=42,n_estimators=1000)

regressor = rfr_reg.fit(X_train,y_train)

y_pred = regressor.predict(X_test)

print("MAE: ",metrics.mean_absolute_error(y_test,y_pred))
print("MSE: ",metrics.mean_squared_error(y_test,y_pred))
print("RMSE: ",np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

MAE:  265.6812693107347
MSE:  295657.8641195591
RMSE:  543.7443003099519


In [29]:
single_record = ss.transform(X_test[50].reshape(1,-1))
predicted_val = regressor.predict(single_record)
predicted_val,y_test[50]



(array([590.611]), 404)