In [1]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
from tabulate import tabulate

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns

# Preprocesado y modelado
# ==============================================================================
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_blobs
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge

from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from skopt.plots import plot_convergence

# Varios
# ==============================================================================
import multiprocessing
import random
from itertools import product
from fitter import Fitter, get_common_distributions

In [2]:
diamonds_original_withoutcity = pd.read_csv('db/diamonds_train.csv')
diamonds_test_withoutcity = pd.read_csv('db/diamonds_test.csv') 

In [3]:
diamonds_original_withoutcity

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,city
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25,Dubai
1,1.02,Premium,J,VS2,61.6,58.0,3513,6.40,6.35,3.93,Dubai
2,0.77,Premium,J,VS2,62.3,58.0,1792,5.86,5.80,3.63,Dubai
3,1.51,Premium,J,VS2,59.6,60.0,7553,7.58,7.48,4.49,Dubai
4,0.57,Premium,J,VS2,60.2,62.0,1176,5.40,5.33,3.23,Dubai
...,...,...,...,...,...,...,...,...,...,...,...
40450,0.54,Ideal,F,IF,62.2,54.0,2729,5.24,5.27,3.27,Surat
40451,0.53,Ideal,F,IF,61.9,54.0,2802,5.22,5.25,3.24,Surat
40452,0.30,Ideal,F,IF,62.3,55.0,886,4.30,4.34,2.69,Surat
40453,0.26,Ideal,F,IF,60.9,55.0,768,4.15,4.23,2.55,Surat


In [4]:
diamonds_original_withoutcity.drop(columns=['city','depth','table','x','y','z'], inplace=True, axis=1)

In [5]:
diamonds_test_withoutcity.drop(columns=['city','depth','table','x','y','z'], inplace=True, axis=1)

In [6]:
diamonds_original_withoutcity.cut.unique()

array(['Premium', 'Very Good', 'Fair', 'Good', 'Ideal'], dtype=object)

In [7]:
diamonds_original_withoutcity.color.unique()

array(['J', 'E', 'I', 'G', 'D', 'H', 'F'], dtype=object)

In [8]:
diamonds_original_withoutcity.clarity.unique()

array(['VS2', 'VVS2', 'SI1', 'VS1', 'SI2', 'I1', 'VVS1', 'IF'],
      dtype=object)

In [9]:
diamonds_with_cuts = diamonds_original_withoutcity.cut.map({'Fair':0.20, 'Good':0.40,'Very Good':0.60, 'Premium':0.80,'Ideal':1})
diamonds_with_colors = diamonds_original_withoutcity.color.map({'J':0.14, 'I':0.28,'H':0.43, 'G':0.57,'F':0.71,'E':0.86, 'D':1})
diamonds_with_clarity = diamonds_original_withoutcity.clarity.map({'I1':0.20, 'SI1':0.40,'SI2':0.40, 'VS1':0.6,'VS2':0.6,'VVS1':0.80,'VVS2':0.80,'IF':1})


diamonds_test_with_cuts = diamonds_test_withoutcity.cut.map({'Fair':0.20, 'Good':0.40,'Very Good':0.60, 'Premium':0.80,'Ideal':1})
diamonds_test_with_colors = diamonds_test_withoutcity.color.map({'J':0.14, 'I':0.28,'H':0.43, 'G':0.57,'F':0.71,'E':0.86, 'D':1})
diamonds_test_with_clarity = diamonds_test_withoutcity.clarity.map({'I1':0.20, 'SI1':0.40,'SI2':0.40, 'VS1':0.6,'VS2':0.6,'VVS1':0.80,'VVS2':0.80,'IF':1})

In [10]:
diamonds_original_withoutcity['clarity'] = diamonds_with_clarity
diamonds_test_withoutcity['clarity'] = diamonds_test_with_clarity

In [11]:
diamonds_original_withoutcity['cut'] = diamonds_with_cuts
diamonds_test_withoutcity['cut'] = diamonds_test_with_cuts

In [12]:
diamonds_original_withoutcity['color'] = diamonds_with_colors
diamonds_test_withoutcity['color'] = diamonds_test_with_colors

In [13]:
target = "price"
cat_features = ['cut','color','clarity']
num_features = ['carat']

features = cat_features + num_features

In [14]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
diamonds_scaled = scaler.fit_transform(diamonds_original_withoutcity.loc[:,features].values)
diamonds_test_scaled = scaler.fit_transform(diamonds_test_withoutcity.loc[:,features].values)

In [15]:
diamond_scaler_df = pd.DataFrame(diamonds_scaled, columns=["carat","cut","color","clarity"])
diamond_test_scaler_df = pd.DataFrame(diamonds_test_scaled, columns=["carat","cut","color","clarity"])

In [16]:
diamond_scaler_df

Unnamed: 0,carat,cut,color,clarity
0,0.75,0.000000,0.5,0.234884
1,0.75,0.000000,0.5,0.190698
2,0.75,0.000000,0.5,0.132558
3,0.75,0.000000,0.5,0.304651
4,0.75,0.000000,0.5,0.086047
...,...,...,...,...
40450,1.00,0.662791,1.0,0.079070
40451,1.00,0.662791,1.0,0.076744
40452,1.00,0.662791,1.0,0.023256
40453,1.00,0.662791,1.0,0.013953


In [17]:
diamond_test_scaler_df

Unnamed: 0,carat,cut,color,clarity
0,0.50,0.662791,0.25,0.122661
1,1.00,0.000000,0.50,0.207900
2,0.75,0.337209,0.25,0.284823
3,0.50,0.662791,0.25,0.145530
4,0.50,0.662791,0.50,0.062370
...,...,...,...,...
13480,1.00,0.837209,0.25,0.076923
13481,1.00,0.162791,0.50,0.106029
13482,1.00,0.662791,0.50,0.103950
13483,0.50,0.662791,0.25,0.103950


In [18]:
#cols = ['carat', 'x','y','z']

#Q1 = train_df[cols].quantile(0.25)
#Q3 = train_df[cols].quantile(0.75)
#IQR = Q3 - Q1

#train_df_clean = train_df[~((train_df[cols] < (Q1 - 1.5 * IQR)) |(train_df[cols] > (Q3 + 1.5 * IQR))).any(axis=1)]
#train_df_clean

In [19]:
diamond_scaler_df['carat'] *= 92
diamond_scaler_df['color'] *= 4
diamond_scaler_df['clarity'] *= 3.5
diamond_scaler_df['cut'] *= 0.5

In [20]:
diamond_test_scaler_df['carat'] *= 92
diamond_test_scaler_df['color'] *= 4
diamond_test_scaler_df['clarity'] *= 3.5
diamond_test_scaler_df['cut'] *= 0.5

In [21]:
diamond_scaler_df

Unnamed: 0,carat,cut,color,clarity
0,69.0,0.000000,2.0,0.822093
1,69.0,0.000000,2.0,0.667442
2,69.0,0.000000,2.0,0.463953
3,69.0,0.000000,2.0,1.066279
4,69.0,0.000000,2.0,0.301163
...,...,...,...,...
40450,92.0,0.331395,4.0,0.276744
40451,92.0,0.331395,4.0,0.268605
40452,92.0,0.331395,4.0,0.081395
40453,92.0,0.331395,4.0,0.048837


In [22]:
diamond_test_scaler_df

Unnamed: 0,carat,cut,color,clarity
0,46.0,0.331395,1.0,0.429314
1,92.0,0.000000,2.0,0.727651
2,69.0,0.168605,1.0,0.996881
3,46.0,0.331395,1.0,0.509356
4,46.0,0.331395,2.0,0.218295
...,...,...,...,...
13480,92.0,0.418605,1.0,0.269231
13481,92.0,0.081395,2.0,0.371102
13482,92.0,0.331395,2.0,0.363825
13483,46.0,0.331395,1.0,0.363825


In [23]:
diamond_scaler_df.corr()

Unnamed: 0,carat,cut,color,clarity
carat,1.0,0.024307,0.194371,-0.136154
cut,0.024307,1.0,-0.00771,-0.294065
color,0.194371,-0.00771,1.0,-0.33353
clarity,-0.136154,-0.294065,-0.33353,1.0


In [24]:
X_train, X_test, y_train, y_test = train_test_split(diamond_scaler_df,diamonds_original_withoutcity['price'], test_size=0.2, random_state=42)

In [25]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [26]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
mae = mean_absolute_error(y_test,y_pred)
print("mae: %f" %(mae))
Rsquare=regressor.score(X_test,y_test)
print("Rsquare: %f" %(Rsquare))
rmse=np.sqrt(mean_squared_error(y_test,y_pred))
print("rmse: %f" %(rmse))

mae: 392.864370
Rsquare: 0.964023
rmse: 770.189149


In [27]:
from sklearn.ensemble import RandomForestRegressor
regressor1 = RandomForestRegressor(n_estimators = 100, max_depth=2, random_state = 0)
regressor1.fit(X_train, y_train)
y_pred1 = regressor1.predict(X_test)

In [28]:
mae = mean_absolute_error(y_test,y_pred1)
print("mae: %f" %(mae))
Rsquare=regressor.score(X_test,y_test)
print("Rsquare: %f" %(Rsquare))
rmse=np.sqrt(mean_squared_error(y_test,y_pred1))
print("rmse: %f" %(rmse))

mae: 1048.109159
Rsquare: 0.964023
rmse: 1683.544605


In [29]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(diamond_scaler_df, diamonds_original_withoutcity['price'])
y_pred = regressor.predict(diamond_test_scaler_df)

In [31]:
submisions = pd.DataFrame({"id": diamonds_test_withoutcity["id"], "price": y_pred})
submisions.to_csv("db/submision1.csv", index=False)