In [41]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
import xgboost as xgb

warnings.filterwarnings('ignore')

In [42]:
data = pd.read_csv('diamonds_train.csv')
data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.78,Premium,F,VS1,61.5,58.0,5.93,5.98,3.66,3446
1,1,0.31,Ideal,D,SI1,60.8,56.0,4.37,4.32,2.64,732
2,2,0.3,Ideal,F,SI1,62.3,54.0,4.3,4.34,2.69,475
3,3,1.04,Ideal,E,VVS2,62.0,58.0,6.54,6.46,4.03,9552
4,4,0.65,Ideal,J,SI1,61.4,55.0,5.58,5.62,3.44,1276


In [43]:
# Nos quitamos el id
data = data.iloc[:,1:]
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.78,Premium,F,VS1,61.5,58.0,5.93,5.98,3.66,3446
1,0.31,Ideal,D,SI1,60.8,56.0,4.37,4.32,2.64,732
2,0.3,Ideal,F,SI1,62.3,54.0,4.3,4.34,2.69,475
3,1.04,Ideal,E,VVS2,62.0,58.0,6.54,6.46,4.03,9552
4,0.65,Ideal,J,SI1,61.4,55.0,5.58,5.62,3.44,1276


In [44]:
# Sacamos lista ordenada del tipo de corte del peor al mejor
cut_list = list(data.cut.value_counts().keys())
cut_list.reverse()
# Aplicamos el valor numérico para que lo valore en función de la importancia del corte
data['cut'] = data['cut'].apply(lambda x: cut_list.index(x))

In [45]:
# Sacamos lista ordenada del color del peor al mejor
color_list = sorted(list(data.color.value_counts().keys()),reverse=True)
# Aplicamos el valor numérico para que lo valore en función de la importancia del color
data['color'] = data['color'].apply(lambda x: color_list.index(x))

In [46]:
# Sacamos lista ordenada de la claridad de peor a mejor
clarity_list = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
# Aplicamos el valor numérico para que lo valore en función de la importancia de la claridad
data['clarity'] = data['clarity'].apply(lambda x: clarity_list.index(x))

In [47]:
data.describe()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
count,40345.0,40345.0,40345.0,40345.0,40345.0,40345.0,40345.0,40345.0,40345.0,40345.0
mean,0.797699,2.905936,3.406519,3.055794,61.749721,57.449421,5.730824,5.734808,3.537958,3931.902243
std,0.474309,1.115415,1.700667,1.651013,1.427728,2.234087,1.121767,1.151536,0.695039,3994.318832
min,0.2,0.0,0.0,0.0,43.0,43.0,0.0,0.0,0.0,326.0
25%,0.4,2.0,2.0,2.0,61.0,56.0,4.71,4.72,2.91,952.0
50%,0.7,3.0,3.0,3.0,61.8,57.0,5.69,5.71,3.52,2397.0
75%,1.04,4.0,5.0,4.0,62.5,59.0,6.54,6.54,4.03,5317.0
max,5.01,4.0,6.0,7.0,79.0,95.0,10.74,58.9,8.06,18823.0


In [48]:
data = data.loc[(data['x'] != 0) & (data['y'] != 0) & (data['z'] != 0)] 

In [49]:
data = data.loc[(data['y'] < 15)]

In [50]:
data.corr()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
carat,1.0,-0.132682,-0.291301,-0.352029,0.023301,0.181782,0.977937,0.977088,0.976466,0.922324
cut,-0.132682,1.0,0.018839,0.187315,-0.213116,-0.431824,-0.124958,-0.124542,-0.150496,-0.053028
color,-0.291301,0.018839,1.0,-0.031485,-0.044324,-0.026522,-0.271204,-0.27091,-0.275009,-0.17378
clarity,-0.352029,0.187315,-0.031485,1.0,-0.063883,-0.159905,-0.372053,-0.366932,-0.375473,-0.148939
depth,0.023301,-0.213116,-0.044324,-0.063883,1.0,-0.299474,-0.028792,-0.0318,0.092602,-0.013254
table,0.181782,-0.431824,-0.026522,-0.159905,-0.299474,1.0,0.196468,0.190443,0.156245,0.126738
x,0.977937,-0.124958,-0.271204,-0.372053,-0.028792,0.196468,1.0,0.998781,0.991142,0.887715
y,0.977088,-0.124542,-0.27091,-0.366932,-0.0318,0.190443,0.998781,1.0,0.990813,0.889288
z,0.976466,-0.150496,-0.275009,-0.375473,0.092602,0.156245,0.991142,0.990813,1.0,0.882774
price,0.922324,-0.053028,-0.17378,-0.148939,-0.013254,0.126738,0.887715,0.889288,0.882774,1.0


In [51]:
# Asignamos las features a la X
X = data.iloc[:,[0,1,2,3,4,5,6,7,8]]
# Asignamos el precio a predecir a la y
y = data.price

In [52]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.78,3,4,4,61.5,58.0,5.93,5.98,3.66
1,0.31,4,6,2,60.8,56.0,4.37,4.32,2.64
2,0.3,4,4,2,62.3,54.0,4.3,4.34,2.69
3,1.04,4,5,5,62.0,58.0,6.54,6.46,4.03
4,0.65,4,0,2,61.4,55.0,5.58,5.62,3.44


In [60]:
# Dividimos los datos en entrenamiento y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [73]:
# Entrenamos el modelo
model = xgb.XGBRegressor(n_estimators=250)
data_trained = model.fit(X_train, y_train)

In [74]:
# Lo aplicamos para hacer la predicción
pred = data_trained.predict(X_test)

In [75]:
# Lo cerca que están los datos de la línea de regresión
r2_score(y_test, pred)

0.9798254988673117

In [76]:
# Veo comparativa en una tabla
compare = pd.DataFrame({'Reality':y_test, 'Predicted':pred})
compare.head(10)

Unnamed: 0,Reality,Predicted
7309,666,674.455627
33618,9193,9304.523438
3721,1875,1832.782471
28360,4234,4748.692871
28098,1404,1174.615234
15443,579,592.860291
10426,7164,6347.548828
2352,2960,2790.723389
15469,3478,3199.95874
16485,1087,957.344177


In [71]:
# La diferencia media que hay entre la realidad y la predicción
1-np.abs(y_test-pred).sum()/len(y_test)

-286.982136403601

In [72]:
# Cuanto de similares de media son los datos reales vs predichos
rms = sqrt(mean_squared_error(y_test, pred))
rms

566.2365182884488

In [None]:
# No mejora