In [45]:
import pandas as pd
import numpy as np
import math
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn. preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV

In [31]:
diamonds = pd.read_csv('diamonds_train.csv')
predict = pd.read_csv('diamonds_predict.csv')

In [32]:
# Transformation which relates x, y and z with diamond volume: CONE

diameter = (diamonds['x'] + diamonds['y']) / 2
diamonds['volume cone'] = (1/3) * np.pi * ((diameter / 2) ** 2) * diamonds['z']
diamonds['volume semisphere'] = (4/6) * np.pi * ((diameter / 2)) **2
diamonds['volume'] = diamonds['volume cone'] + diamonds['volume semisphere']

In [33]:
diameter_predict = (predict['x'] + predict['y']) / 2
predict['volume cone'] = (1/3) * np.pi * ((diameter_predict / 2) ** 2) * predict['z']
predict['volume semisphere'] = (4/6) * np.pi * ((diameter_predict / 2)) **2
predict['volume'] = predict['volume cone'] + predict['volume semisphere']

In [34]:
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume cone,volume semisphere,volume
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25,51.600247,24.282469,75.882716
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75,13.717337,9.976245,23.693583
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65,29.699643,16.273777,45.973421
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0,17.349445,11.566297,28.915742
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95,44.095278,22.326723,66.422001


In [35]:
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z,volume cone,volume semisphere,volume
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,0.797706,61.752841,57.446133,3928.444469,5.729392,5.732819,3.537154,34.01666,17.864177,51.880837
std,0.475544,1.431725,2.233535,3992.416147,1.124453,1.14665,0.697062,23.194523,7.560415,30.662836
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,945.0,4.71,4.72,2.91,16.988364,11.640242,28.641764
50%,0.7,61.8,57.0,2397.0,5.69,5.71,3.52,30.025693,17.011724,47.048583
75%,1.04,62.5,59.0,5331.0,6.54,6.54,4.035,44.727684,22.395157,67.04426
max,4.5,79.0,95.0,18823.0,10.23,58.9,8.06,2367.356334,587.433333,2954.789667


In [36]:
# The first step is defining the Data Cleaning and Feature Engineering 

target = 'price'
cat_features = ['cut', 'color', 'clarity']
num_features = ['carat', 'depth', 'table', 'x', 'y', 'z', 'volume']
features = cat_features + num_features

## **Kneighbors Regressor**

In [37]:
# Generamos una primera etapa del pipeline que transforma los datos categoricos y los datos numéricos del modelo

# En los datos numéricos fill los nulos con la mediana y estandarizamos los resultados
numeric_transformer = \
Pipeline(steps = [('imputer', SimpleImputer(strategy = 'median')), ('scalar', StandardScaler())])

# En los datos categóricos fill los nulos con una constante y los transformamos con OneHotEncoding
categorical_transformer = \
Pipeline(steps = [('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missin')), 
                  ('onehot', OneHotEncoder(handle_unknown = 'ignore'))])

In [38]:
# Generamos una segunda etapa donde definimos las variables numericas a las que aplicar numeric_transformer y las variables categoricas a las que aplicar categorical_transformer

preprocessor = \
ColumnTransformer(transformers = [('num', numeric_transformer, num_features), ('cat', categorical_transformer, cat_features)])

In [39]:
# Generamos una última etapa donde definimos el modelo

model = \
Pipeline(steps = [('preprocessor', preprocessor), ('regressor', KNeighborsRegressor())])

In [40]:
# Dividimos el dataset en train y en test

diamonds_train, diamonds_test = train_test_split(diamonds)

In [41]:
# Entrenamos el modelo

model.fit(diamonds_train[features], diamonds_train[target])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scalar',
                                                                   StandardScaler())]),
                                                  ['carat', 'depth', 'table',
                                                   'x', 'y', 'z', 'volume']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missin',
                                                                                 strategy='constant')),
                                                                  ('

In [42]:
# Validamos el modelo con diamonds_test

y_test_pred = model.predict(diamonds_test[features])
y_test_real = diamonds_test[target]

mean_squared_error(y_test_real, y_test_pred, squared = False)

819.7226486199285

In [43]:
# Validamos el modelo con diamonds_train

y_train_pred = model.predict(diamonds_train[features])
y_train_real = diamonds_train[target]

mean_squared_error(y_train_real, y_train_pred, squared = False)

643.4374378691887

In [15]:
y_pred = model.predict(predict).clip(300, 18000)



In [16]:
y_pred

array([3554.04735219, 6458.74119971, 9845.5379855 , ..., 3751.41557984,
       1841.3744534 ,  677.2536979 ])

## **Hyperparameter Optimization**

In [None]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_neighbors': [2, 5, 8, 10, 13, 15],
    'regressor__weights': ['uniform', 'distance'],
    'regressor__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=10, 
                                 scoring='neg_root_mean_squared_error', 
                                 n_jobs=-1,
                                 n_iter=32)

grid_search.fit(diamonds[features], diamonds[target])

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


## **Submission**

In [17]:
submission_df = pd.DataFrame({'id': predict['id'], 'price': y_pred})

In [18]:
submission_df.to_csv('./data/LinearRegression.csv', index = False)