In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

# Silence every Python warning for the rest of the session.
# Note that this also hides pandas / scikit-learn deprecation and
# chained-assignment warnings raised by the cells below.
warnings.filterwarnings('ignore')

# LIMPIEZA

In [5]:
# Load the training set and preview its first rows.
data = pd.read_csv('./INPUT/diamonds_train.csv')
data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.78,Premium,F,VS1,61.5,58.0,5.93,5.98,3.66,3446
1,1,0.31,Ideal,D,SI1,60.8,56.0,4.37,4.32,2.64,732
2,2,0.3,Ideal,F,SI1,62.3,54.0,4.3,4.34,2.69,475
3,3,1.04,Ideal,E,VVS2,62.0,58.0,6.54,6.46,4.03,9552
4,4,0.65,Ideal,J,SI1,61.4,55.0,5.58,5.62,3.44,1276


In [6]:
# Drop the `id` column: it is a row identifier, not a feature.
# It is removed by name rather than by position so the cell is safe to
# re-run: a second execution is a no-op instead of silently dropping the
# next column (`carat`).
data = data.drop(columns='id', errors='ignore')
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.78,Premium,F,VS1,61.5,58.0,5.93,5.98,3.66,3446
1,0.31,Ideal,D,SI1,60.8,56.0,4.37,4.32,2.64,732
2,0.3,Ideal,F,SI1,62.3,54.0,4.3,4.34,2.69,475
3,1.04,Ideal,E,VVS2,62.0,58.0,6.54,6.46,4.03,9552
4,0.65,Ideal,J,SI1,61.4,55.0,5.58,5.62,3.44,1276


In [7]:
# The minimum of the measurement columns (x, y, z) is 0, so those entries
# are really missing values that distort the summary statistics.
# In addition, the maximum of y is 58.9, which looks like a data-entry
# error when compared with its mean.
data.describe()

Unnamed: 0,carat,depth,table,x,y,z,price
count,40345.0,40345.0,40345.0,40345.0,40345.0,40345.0,40345.0
mean,0.797699,61.749721,57.449421,5.730824,5.734808,3.537958,3931.902243
std,0.474309,1.427728,2.234087,1.121767,1.151536,0.695039,3994.318832
min,0.2,43.0,43.0,0.0,0.0,0.0,326.0
25%,0.4,61.0,56.0,4.71,4.72,2.91,952.0
50%,0.7,61.8,57.0,5.69,5.71,3.52,2397.0
75%,1.04,62.5,59.0,6.54,6.54,4.03,5317.0
max,5.01,79.0,95.0,10.74,58.9,8.06,18823.0


In [8]:
# Correlation of every numeric column with the target.
# `cut`, `color` and `clarity` are still strings at this point, so the
# numeric columns are selected explicitly: recent pandas versions raise
# when `corr()` is called on a frame that contains non-numeric columns.
data.select_dtypes(include='number').corr()['price']

carat    0.922345
depth   -0.013307
table    0.126545
x        0.886168
y        0.860499
z        0.876061
price    1.000000
Name: price, dtype: float64

In [9]:
# Remove the rows flagged by the summary statistics:
#   * a 0 in x, y or z is a missing measurement;
#   * y >= 15 is treated as a data-entry error (the maximum was 58.9).
# `.copy()` makes the result an independent frame, so the column
# assignments in the following cells write to it directly instead of to a
# slice of the original (the chained-assignment warning would otherwise be
# hidden by the global warnings filter).
dims_nonzero = (data[['x', 'y', 'z']] != 0).all(axis=1)
data = data.loc[dims_nonzero & (data['y'] < 15)].copy()

In [10]:
# Cut grades ordered from worst to best.
# The order is written out explicitly instead of being derived from
# `value_counts()`: frequency order only coincides with quality order by
# accident, and deriving it separately for the test set could produce a
# different encoding from the one the model was trained on.
cut_list = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
# Replace each grade with its rank so the model sees an ordinal feature.
# An unexpected label raises ValueError instead of being encoded silently.
data['cut'] = data['cut'].apply(cut_list.index)

In [11]:
# Color grades ordered from worst (J) to best (D).
# The full scale is written out explicitly instead of being built from the
# values present in the frame: if a grade were missing from one dataset,
# every rank after it would shift and train/test encodings would disagree.
color_list = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
# Replace each grade with its rank so the model sees an ordinal feature.
# An unexpected label raises ValueError instead of being encoded silently.
data['color'] = data['color'].apply(color_list.index)

In [12]:
# Clarity grades, listed from worst to best.
clarity_list = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]
# Encode each grade as its position in the list (ordinal encoding).
data['clarity'] = data['clarity'].apply(clarity_list.index)

In [13]:
# Check that cut, color and clarity now hold numeric ranks.
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.78,3,4,4,61.5,58.0,5.93,5.98,3.66,3446
1,0.31,4,6,2,60.8,56.0,4.37,4.32,2.64,732
2,0.3,4,4,2,62.3,54.0,4.3,4.34,2.69,475
3,1.04,4,5,5,62.0,58.0,6.54,6.46,4.03,9552
4,0.65,4,0,2,61.4,55.0,5.58,5.62,3.44,1276


In [24]:
# Target: the price to predict.
y = data['price']
# Features: every remaining column.
X = data.drop(columns='price')

# ENTRENAMIENTO MODELO

In [25]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=42,
)

In [26]:
# Train the model on the training split.
# `random_state` is fixed so the fit is reproducible: in recent
# scikit-learn versions the default early stopping draws a random
# validation subset on large training sets, which otherwise makes the
# metrics change from run to run.
model = HistGradientBoostingRegressor(random_state=42)
# `fit` returns the fitted estimator; the name is kept because later cells
# call `data_trained.predict(...)`.
data_trained = model.fit(X_train, y_train)

In [27]:
# Predict prices for the held-out split.
pred = data_trained.predict(X_test)

In [28]:
# Side-by-side table of actual and predicted prices (first 10 rows).
compare = pd.DataFrame(dict(Reality=y_test, Predicted=pred))
compare.head(10)

Unnamed: 0,Reality,Predicted
7309,666,735.239639
33618,9193,9822.76011
3721,1875,1912.888348
28360,4234,4649.740911
28098,1404,1204.206953
15443,579,632.30947
10426,7164,6453.349089
2352,2960,2793.77118
15469,3478,3214.298971
16485,1087,970.747587


In [29]:
# Mean absolute error: the average absolute difference between the real
# and the predicted price, in the same units as `price`.
# (Subtracting it from 1 is only meaningful for a ratio in [0, 1], not for
# an error measured in price units.)
np.abs(y_test - pred).sum() / len(y_test)

286.9484090138578

In [30]:
# Root mean squared error between the real and the predicted prices.
mse = mean_squared_error(y_test, pred)
rms = sqrt(mse)
rms

559.0624045747408

# PREDICCIÓN

In [31]:
# Retrain on 100% of the cleaned training data before predicting the test set.
# This calls `fit` again on the same `model` object that was trained on
# X_train above, so the evaluation metrics shown earlier belong to the
# previous fit, not to this one.
data_trained = model.fit(X, y)

In [100]:
# Load the test set (it has no `price` column) and preview its first rows.
# NOTE(review): the training file is read from './INPUT/' while this one is
# read from the working directory -- confirm the path is intended.
data2 = pd.read_csv('diamonds_test.csv')
data2.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,1.1,Premium,H,SI2,62.2,58.0,6.69,6.6,4.13
1,1,0.51,Ideal,I,SI1,62.5,57.0,5.07,5.1,3.18
2,2,2.03,Premium,G,SI1,61.9,59.0,8.14,8.09,5.02
3,3,1.21,Premium,F,SI1,60.0,60.0,6.96,6.91,4.16
4,4,0.55,Ideal,F,SI1,61.8,55.0,5.27,5.22,3.24


In [101]:
# Drop the `id` column: it is a row identifier, not a feature.
# It is removed by name rather than by position so the cell is safe to
# re-run: a second execution is a no-op instead of silently dropping the
# next column (`carat`).
data2 = data2.drop(columns='id', errors='ignore')
data2.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.1,Premium,H,SI2,62.2,58.0,6.69,6.6,4.13
1,0.51,Ideal,I,SI1,62.5,57.0,5.07,5.1,3.18
2,2.03,Premium,G,SI1,61.9,59.0,8.14,8.09,5.02
3,1.21,Premium,F,SI1,60.0,60.0,6.96,6.91,4.16
4,0.55,Ideal,F,SI1,61.8,55.0,5.27,5.22,3.24


In [102]:
# Cut grades ordered from worst to best.
# The order is fixed explicitly rather than recomputed from the test set's
# `value_counts()`: the test frequencies may rank the grades differently
# from the training data, which would feed the model codes with a different
# meaning from the ones it was trained on.
cut_list = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
# Replace each grade with its rank so the model sees an ordinal feature.
# An unexpected label raises ValueError instead of being encoded silently.
data2['cut'] = data2['cut'].apply(cut_list.index)

In [103]:
# Color grades ordered from worst (J) to best (D).
# The full scale is fixed explicitly rather than rebuilt from the values
# present in the test set: a grade missing here would shift every later
# rank and the codes would no longer match the training encoding.
color_list = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
# Replace each grade with its rank so the model sees an ordinal feature.
# An unexpected label raises ValueError instead of being encoded silently.
data2['color'] = data2['color'].apply(color_list.index)

In [104]:
# Clarity grades, listed from worst to best.
clarity_list = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]
# Encode each grade as its position in the list (ordinal encoding).
data2['clarity'] = data2['clarity'].apply(clarity_list.index)

In [105]:
# Check that cut, color and clarity now hold numeric ranks.
data2.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.1,3,2,1,62.2,58.0,6.69,6.6,4.13
1,0.51,4,1,2,62.5,57.0,5.07,5.1,3.18
2,2.03,3,3,2,61.9,59.0,8.14,8.09,5.02
3,1.21,3,4,2,60.0,60.0,6.96,6.91,4.16
4,0.55,4,4,2,61.8,55.0,5.27,5.22,3.24


In [106]:
# Predict prices for the test set with the estimator refitted on all training rows.
# This rebinds `pred`, replacing the hold-out predictions computed earlier.
pred = data_trained.predict(data2)

In [107]:
# Attach the predictions to the test frame.
# This adds a `price` column to `data2` in place, so re-running the
# prediction cell afterwards would pass `price` to the model as an extra
# feature column.
data2['price'] = pred
data2.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.1,3,2,1,62.2,58.0,6.69,6.6,4.13,4318.262779
1,0.51,4,1,2,62.5,57.0,5.07,5.1,3.18,1019.309026
2,2.03,3,3,2,61.9,59.0,8.14,8.09,5.02,16712.06471
3,1.21,3,4,2,60.0,60.0,6.96,6.91,4.16,7183.272536
4,0.55,4,4,2,61.8,55.0,5.27,5.22,3.24,1529.532663


In [108]:
# Work on a copy so the changes made below do not modify `data2`.
copia = data2.copy()

In [109]:
# Convert predicted prices to integers, truncating the decimal part.
copia['price'] = copia['price'].astype('int64')

In [110]:
# Preview the frame with the integer prices.
copia.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.1,3,2,1,62.2,58.0,6.69,6.6,4.13,4318
1,0.51,4,1,2,62.5,57.0,5.07,5.1,3.18,1019
2,2.03,3,3,2,61.9,59.0,8.14,8.09,5.02,16712
3,1.21,3,4,2,60.0,60.0,6.96,6.91,4.16,7183
4,0.55,4,4,2,61.8,55.0,5.27,5.22,3.24,1529


In [111]:
# Rebuild the `id` column from the row index.
# NOTE(review): this assumes the ids in the test file match the row
# positions -- confirm against the original `id` column.
copia['id'] = copia.index
# Keep only the two columns used for the submission, and preview them.
submission_columns = ['id', 'price']
submit = copia[submission_columns]
submit.head()

Unnamed: 0,id,price
0,0,4318
1,1,1019
2,2,16712
3,3,7183
4,4,1529


In [81]:
# Write the submission (id, price) to disk without the DataFrame index.
submit.to_csv('result3.csv',index=False)