In [2]:
import numpy as np
import pandas as pd
from tabulate import tabulate

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns

# Preprocesado y modelado
# ==============================================================================
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_blobs
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge

from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from skopt.plots import plot_convergence

# Varios
# ==============================================================================
import multiprocessing
import random
from itertools import product
from fitter import Fitter, get_common_distributions

In [3]:
diamonds_train = pd.read_csv('db/diamonds_train.csv')
diamonds_test = pd.read_csv('db/diamonds_test.csv') 

In [4]:
print("Cut: ",set(diamonds_train["cut"]))
print("Color: ",set(diamonds_train["color"]))
print("Clarity: ",set(diamonds_train["clarity"]))

Cut:  {'Ideal', 'Good', 'Very Good', 'Premium', 'Fair'}
Color:  {'J', 'F', 'I', 'D', 'E', 'H', 'G'}
Clarity:  {'VS2', 'VVS2', 'SI1', 'VS1', 'SI2', 'VVS1', 'I1', 'IF'}


In [5]:
diamonds_train['price/wt']=diamonds_train['price']/diamonds_train['carat']
print(diamonds_train.groupby('cut')['price/wt'].mean().sort_values())
print(diamonds_train.groupby('color')['price/wt'].mean().sort_values())
print(diamonds_train.groupby('clarity')['price/wt'].mean().sort_values())
diamonds_train = diamonds_train.drop(['price/wt','table'], axis=1)

cut
Fair         3757.492344
Good         3832.597411
Ideal        3908.044496
Very Good    4014.378029
Premium      4231.131397
Name: price/wt, dtype: float64
color
E    3810.101878
J    3827.670485
D    3937.805478
I    3980.836098
H    4006.343705
F    4106.324111
G    4171.555393
Name: price/wt, dtype: float64
clarity
I1      2798.055592
SI1     3844.657269
VVS1    3882.807337
SI2     4025.614743
VS2     4074.905291
VS1     4129.189881
VVS2    4178.017868
IF      4217.324475
Name: price/wt, dtype: float64


In [6]:
diamonds_train['cut']=diamonds_train['cut'].map({'Ideal':5,'Good':2,'Very Good':3,'Fair':1,'Premium':4})
diamonds_train['color']=diamonds_train['color'].map({'E':2,'D':1,'F':3,'G':4,'H':5,'I':6,'J':7})
diamonds_train['clarity']=diamonds_train['clarity'].map({'VVS1':7,'IF':8,'VVS2':6,'VS1':5,'I1':1,'VS2':4,'SI1':3,'SI2':2})

In [7]:
diamonds_train.corr()

Unnamed: 0,carat,cut,color,clarity,depth,price,x,y,z
carat,1.0,-0.136154,0.294027,-0.357279,0.026528,0.921935,0.975688,0.951667,0.96757
cut,-0.136154,1.0,-0.024355,0.192034,-0.226255,-0.0538,-0.126509,-0.122351,-0.152732
color,0.294027,-0.024355,1.0,0.02075,0.047988,0.174855,0.272498,0.265611,0.275022
clarity,-0.357279,0.192034,0.02075,1.0,-0.071775,-0.151893,-0.375764,-0.362133,-0.376815
depth,0.026528,-0.226255,0.047988,-0.071775,1.0,-0.014864,-0.026348,-0.030966,0.094655
price,0.921935,-0.0538,0.174855,-0.151893,-0.014864,1.0,0.885848,0.866163,0.8745
x,0.975688,-0.126509,0.272498,-0.375764,-0.026348,0.885848,1.0,0.973712,0.984876
y,0.951667,-0.122351,0.265611,-0.362133,-0.030966,0.866163,0.973712,1.0,0.964828
z,0.96757,-0.152732,0.275022,-0.376815,0.094655,0.8745,0.984876,0.964828,1.0


In [8]:
diamonds_train['cut/wt']=diamonds_train['cut']/diamonds_train['carat']
diamonds_train['color/wt']=diamonds_train['color']/diamonds_train['carat']
diamonds_train['clarity/wt']=diamonds_train['clarity']/diamonds_train['carat']
diamonds_train = diamonds_train.drop(['cut','color','clarity','depth','city','x','y','z'], axis=1)

In [9]:
diamonds_train

Unnamed: 0,carat,price,cut/wt,color/wt,clarity/wt
0,1.21,4268,3.305785,5.785124,3.305785
1,1.02,3513,3.921569,6.862745,3.921569
2,0.77,1792,5.194805,9.090909,5.194805
3,1.51,7553,2.649007,4.635762,2.649007
4,0.57,1176,7.017544,12.280702,7.017544
...,...,...,...,...,...
40450,0.54,2729,9.259259,5.555556,14.814815
40451,0.53,2802,9.433962,5.660377,15.094340
40452,0.30,886,16.666667,10.000000,26.666667
40453,0.26,768,19.230769,11.538462,30.769231


In [10]:
X=diamonds_train.drop(['price'],axis=1)
Y=diamonds_train['price']
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.15,random_state=42)

In [12]:
from sklearn.ensemble import GradientBoostingRegressor
regressor = GradientBoostingRegressor(random_state = 0)
regressor.fit(X_train, Y_train)
y_pred = regressor.predict(X_test)

In [13]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
mae = mean_absolute_error(Y_test,y_pred)
print("mae: %f" %(mae))
Rsquare=regressor.score(X_test,Y_test)
print("Rsquare: %f" %(Rsquare))
rmse=np.sqrt(mean_squared_error(Y_test,y_pred))
print("rmse: %f" %(rmse))

mae: 359.854976
Rsquare: 0.976049
rmse: 631.545708


In [14]:
diamonds_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [15]:
diamonds_test['cut']=diamonds_test['cut'].map({'Ideal':1,'Good':2,'Very Good':3,'Fair':4,'Premium':5})
diamonds_test['color']=diamonds_test['color'].map({'E':1,'D':2,'F':3,'G':4,'H':5,'I':6,'J':7})
diamonds_test['clarity']=diamonds_test['clarity'].map({'VVS1':1,'IF':2,'VVS2':3,'VS1':4,'I1':5,'VS2':6,'SI1':7,'SI2':8})

In [16]:
diamonds_test.corr()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
id,1.0,-0.002578,0.000441,-0.014124,0.001262,-0.002425,0.004891,-0.000102,-0.002206,0.000456
carat,-0.002578,1.0,0.148725,0.285265,0.312146,0.033385,0.176245,0.973283,0.9519,0.913598
cut,0.000441,0.148725,1.0,0.02901,0.157565,-0.092005,0.499796,0.154065,0.13465,0.127239
color,-0.014124,0.285265,0.02901,1.0,-0.031492,0.042913,0.015522,0.266377,0.259063,0.249694
clarity,0.001262,0.312146,0.157565,-0.031492,1.0,0.039044,0.159626,0.336087,0.324029,0.315115
depth,-0.002425,0.033385,-0.092005,0.042913,0.039044,1.0,-0.303653,-0.022049,-0.024359,0.095831
table,0.004891,0.176245,0.499796,0.015522,0.159626,-0.303653,1.0,0.19313,0.180943,0.138777
x,-0.000102,0.973283,0.154065,0.266377,0.336087,-0.022049,0.19313,1.0,0.977762,0.931211
y,-0.002206,0.9519,0.13465,0.259063,0.324029,-0.024359,0.180943,0.977762,1.0,0.916161
z,0.000456,0.913598,0.127239,0.249694,0.315115,0.095831,0.138777,0.931211,0.916161,1.0


In [17]:
diamonds_test['cut/wt']=diamonds_test['cut']/diamonds_test['carat']
diamonds_test['color/wt']=diamonds_test['color']/diamonds_test['carat']
diamonds_test['clarity/wt']=diamonds_test['clarity']/diamonds_test['carat']
diamonds_test = diamonds_test.drop(['cut','color','clarity','table','depth','city','x','y','z'], axis=1)

In [18]:
diamonds_test_x = diamonds_test.drop(['id'],axis=1)

In [21]:
from sklearn.ensemble import GradientBoostingRegressor
regressor1 = GradientBoostingRegressor(random_state = 0)
regressor1.fit(diamonds_train.drop(['price'],axis=1), diamonds_train['price'])
y_pred1 = regressor1.predict(diamonds_test_x)

In [22]:
X_train

Unnamed: 0,carat,cut/wt,color/wt,clarity/wt
21790,0.52,5.769231,7.692308,3.846154
23850,1.06,4.716981,5.660377,2.830189
9598,0.53,3.773585,3.773585,5.660377
23316,0.50,8.000000,6.000000,8.000000
35017,1.03,2.912621,2.912621,1.941748
...,...,...,...,...
6265,1.68,2.976190,2.380952,1.785714
11284,1.00,3.000000,3.000000,5.000000
38158,0.43,11.627907,6.976744,6.976744
860,0.82,3.658537,2.439024,3.658537


In [23]:
y_pred1

array([ 3566.65490956,  5016.56472286, 12710.40840999, ...,
        2326.94025643,  2921.00682767,   595.53943515])