In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [2]:
diamonds_df = pd.read_csv('../data/diamonds_train.csv')

In [3]:
diamonds_predict = pd.read_csv('../data/diamonds_test.csv')

In [4]:
#train

diamonds_df['cut']=diamonds_df['cut'].map({'Fair':0,'Good':1,'Very Good':2,'Premium':3, 'Ideal':4})
diamonds_df['color']=diamonds_df['color'].map({'J':0, 'I':1, 'H':2, 'G':3, 'F': 4, 'E': 5, 'D':6})
diamonds_df['clarity']=diamonds_df['clarity'].map({'I1':0,'SI2':1,'SI1':2,'VS2':3,'VS1':4,'VVS2':5,'VVS1':6,'IF':7})

#predict

diamonds_predict['cut']=diamonds_predict['cut'].map({'Fair':0,'Good':1,'Very Good':2,'Premium':3, 'Ideal':4})
diamonds_predict['color']=diamonds_predict['color'].map({'J':0, 'I':1, 'H':2, 'G':3, 'F': 4, 'E': 5, 'D':6})
diamonds_predict['clarity']=diamonds_predict['clarity'].map({'I1':0,'SI2':1,'SI1':2,'VS2':3,'VS1':4,'VVS2':5,'VVS1':6,'IF':7})

In [5]:
#train
diamonds_df['cut/carat'] = diamonds_df['cut']/diamonds_df['carat']
diamonds_df['color/carat'] = diamonds_df['color']/diamonds_df['carat']
diamonds_df['clarity/carat'] = diamonds_df['clarity']/diamonds_df['carat']
#predict
diamonds_predict['cut/carat'] = diamonds_predict['cut']/diamonds_predict['carat']
diamonds_predict['color/carat'] = diamonds_predict['color']/diamonds_predict['carat']
diamonds_predict['clarity/carat'] = diamonds_predict['clarity']/diamonds_predict['carat']

In [6]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [9]:
sc = StandardScaler()
columns_df = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'cut/carat', 'color/carat', 'clarity/carat']
X = sc.fit_transform(diamonds_df[columns_df])
y = diamonds_df['price'].values

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, y)

In [11]:
model = GradientBoostingRegressor(n_estimators=100)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [12]:
print(f"Mean squared error: {mean_squared_error(y_test, y_pred, squared=False)}")
R2 = r2_score(y_test,y_pred)
print(f"R Squared: {R2}")

Mean squared error: 615.1147680665209
R Squared: 0.9756367962462131


In [13]:
model = BaggingRegressor(n_estimators=100)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [14]:
print(f"Mean squared error: {mean_squared_error(y_test, y_pred, squared=False)}")
R2 = r2_score(y_test,y_pred)
print(f"R Squared: {R2}")

Mean squared error: 549.603650610512
R Squared: 0.9805499238099208


In [15]:
model = RandomForestRegressor(n_estimators=100)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [16]:
print(f"Mean squared error: {mean_squared_error(y_test, y_pred, squared=False)}")
R2 = r2_score(y_test,y_pred)
print(f"R Squared: {R2}")

Mean squared error: 550.0183559950925
R Squared: 0.9805205604880018


In [17]:
import lightgbm as lgb

In [18]:
model = lgb.LGBMRegressor()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [19]:
print(f"Mean squared error: {mean_squared_error(y_test, y_pred, squared=False)}")
R2 = r2_score(y_test,y_pred)
print(f"R Squared: {R2}")

Mean squared error: 534.7205342008929
R Squared: 0.9815890663218281


In [20]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, 
                         X, 
                         y, 
                         scoring='neg_root_mean_squared_error', 
                         cv=10, n_jobs=-1)

In [21]:
import numpy as np
np.mean(-scores)

535.2983093534878

In [22]:
from sklearn.model_selection import RandomizedSearchCV

In [25]:
FEATS = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'cut/carat', 'color/carat', 'clarity/carat']
TARGET ='price'
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8],
}

grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=10, 
                                 scoring='neg_root_mean_squared_error', 
                                 n_jobs=-1,
                                 n_iter=32)

grid_search.fit(diamonds_df[FEATS], diamonds_df[TARGET])

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV 1/5; 1/32] START preprocessor__num__imputer__strategy=mean, regressor__max_depth=8, regressor__n_estimators=128
[CV 1/5; 1/32] END preprocessor__num__imputer__strategy=mean, regressor__max_depth=8, regressor__n_estimators=128; total time=   0.9s
[CV 2/5; 1/32] START preprocessor__num__imputer__strategy=mean, regressor__max_depth=8, regressor__n_estimators=128
[CV 2/5; 1/32] END preprocessor__num__imputer__strategy=mean, regressor__max_depth=8, regressor__n_estimators=128; total time=   0.6s
[CV 3/5; 1/32] START preprocessor__num__imputer__strategy=mean, regressor__max_depth=8, regressor__n_estimators=128
[CV 3/5; 1/32] END preprocessor__num__imputer__strategy=mean, regressor__max_depth=8, regressor__n_estimators=128; total time=   0.5s
[CV 4/5; 1/32] START preprocessor__num__imputer__strategy=mean, regressor__max_depth=8, regressor__n_estimators=128
[CV 4/5; 1/32] END preprocessor__num__imputer__strategy=mean, regressor_

RandomizedSearchCV(cv=5, estimator=LGBMRegressor(), n_iter=32, n_jobs=-1,
                   param_distributions={'preprocessor__num__imputer__strategy': ['mean',
                                                                                 'median'],
                                        'regressor__max_depth': [2, 4, 8],
                                        'regressor__n_estimators': [16, 32, 64,
                                                                    128, 256,
                                                                    512]},
                   scoring='neg_root_mean_squared_error', verbose=10)

In [26]:
grid_search.best_params_

{'regressor__n_estimators': 128,
 'regressor__max_depth': 8,
 'preprocessor__num__imputer__strategy': 'mean'}

In [27]:
grid_search.best_score_

-536.7962721908281

In [28]:
y_pred = grid_search.predict(diamonds_predict[FEATS])

In [29]:
submission_df = pd.DataFrame({'id': diamonds_predict['id'], 'price': y_pred})

In [30]:
submission_df.to_csv('../data/resultados7.csv', index=False)

In [None]:
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()


#X_test = sc.transform(diamonds_predict[columns_df].values)
#y_hat = model.predict(X_test).clip(0, 20000)
#submission = pd.DataFrame({'id': diamonds_predict['id'], 'price': y_hat})
#submission.to_csv('../data/resultados5.csv', index=False)