In [1]:
import numpy as np
import pandas as pd

import urllib.request
from PIL import Image
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoost

In [113]:
# Load the training data; the first CSV column is used as the DataFrame index.
diamond_df = pd.read_csv("diamonds_train.csv", index_col=0)
# Preview the first rows as the cell output.
diamond_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.21,Ideal,H,VS2,63.0,57.0,6.73,6.7,4.23,6134
1,0.28,Very Good,D,VVS2,64.0,56.0,4.14,4.17,2.66,532
2,0.42,Premium,F,VS1,61.2,58.0,4.86,4.82,2.96,1103
3,0.26,Ideal,H,IF,61.1,57.0,4.16,4.12,2.53,600
4,1.1,Good,G,SI1,63.4,57.0,6.52,6.55,4.14,4997


In [127]:
class DiamondEncoder:
    """Translate categorical diamond grades into hand-picked numeric scores.

    Each static method receives one grade label and returns its score. A label
    that is not listed in the relevant table gets that scale's fallback score.
    """

    # Score tables; any label missing from a table receives the fallback value
    # passed to ``dict.get`` in the matching method.
    _CUT_SCORES = {
        'Ideal': 10,
        'Premium': 9,
        'Very Good': 8,
        'Good': 6.5,
    }
    _COLOR_SCORES = {
        'D': 10,
        'E': 9.5,
        'F': 9,
        'G': 8.5,
        'H': 8,
        'I': 7.5,
    }
    _CLARITY_SCORES = {
        'IF': 10,
        'VVS1': 9,
        'VVS2': 8,
        'VS1': 7,
        'VS2': 6,
        'SI1': 5,
        'SI2': 3.5,
    }

    @staticmethod
    def classify_cut(x):
        """Return the score for cut grade ``x``; unlisted labels score 4."""
        return DiamondEncoder._CUT_SCORES.get(x, 4)

    @staticmethod
    def classify_color(x):
        """Return the score for color grade ``x``; unlisted labels score 7."""
        return DiamondEncoder._COLOR_SCORES.get(x, 7)

    @staticmethod
    def classify_clarity(x):
        """Return the score for clarity grade ``x``; unlisted labels score 2.5."""
        return DiamondEncoder._CLARITY_SCORES.get(x, 2.5)

In [237]:
# Build an encoded copy of the raw frame: the three categorical grade columns
# are replaced by their numeric scores, every other column is kept as is.
df = diamond_df.assign(
    cut=diamond_df["cut"].apply(DiamondEncoder.classify_cut),
    color=diamond_df["color"].apply(DiamondEncoder.classify_color),
    clarity=diamond_df["clarity"].apply(DiamondEncoder.classify_clarity),
)

# Features are every column up to and including 'z'; the target is 'price'.
X = np.array(df.loc[:, :"z"])
y = np.array(df["price"])

# Hold out 20% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [5]:
def get_RMSE(model, X, y, random_state=42):
    """Print the train and test RMSE of an already-fitted model.

    ``(X, y)`` is split 80/20 with ``train_test_split`` and the model is asked
    to predict both parts. The model is NOT refitted here, so the two numbers
    are only a genuine train/test comparison when the model was fitted on the
    training part of the very same split (same ``test_size`` and seed).

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Feature matrix to split.
    y : array-like
        Target values aligned with ``X``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split``. It used to be accepted but
        ignored (the split always used 42); the default keeps that behaviour.

    Returns
    -------
    None
        The two scores are printed, not returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=random_state
    )
    train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
    test_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    print('Train RMSE:', train_rmse)
    print('Test_RMSE:', test_rmse)

def chequeator(df_to_submit):
    """
    Check that a submission has the form Kaggle requires and, if so, save it.

    The submission is compared with ``sample_submission.csv`` (read from the
    current working directory): same shape, same column names in the same
    order, and the same values in the ``id`` column. When every check passes,
    the dataframe is written to ``submission.csv`` and is ready to upload to
    Kaggle; a celebration image is then downloaded and shown on a best-effort
    basis.

    If a check fails, READ THE MESSAGE AND DO WHAT IT SAYS.

    If it still fails:
    - turn off your computer,
    - take a walk,
    - turn it on again,
    - open this notebook and
    - read it all again.
    We all deserve a second chance. So do you.

    Parameters
    ----------
    df_to_submit : pandas.DataFrame
        Candidate submission to validate.

    Returns
    -------
    None
        The outcome is reported through printed messages.
    """
    sample = pd.read_csv("sample_submission.csv")

    if df_to_submit.shape != sample.shape:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Clara: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `diamonds_test.csv`. Lloro.")
        return

    # Compare the actual labels. The previous `a.all() == b.all()` form only
    # compared two truthiness flags, so differing column names could pass.
    if list(df_to_submit.columns) != list(sample.columns):
        print("Check the names of the columns and try again")
        return

    # Same reasoning for the ids: compare them value by value.
    if df_to_submit["id"].tolist() != sample["id"].tolist():
        print("Check the ids and try again")
        return

    print("You're ready to submit!")
    df_to_submit.to_csv("submission.csv", index = False)  # index = False is essential

    # The image is purely decorative; the submission is already saved, so a
    # network or image-decoding problem must not turn success into a crash.
    try:
        urllib.request.urlretrieve("https://i.kym-cdn.com/photos/images/facebook/000/747/556/27a.jpg", "gfg.png")
        img = Image.open("gfg.png")
        img.show()
    except OSError as exc:
        print("(Could not fetch or display the celebration image:", exc, ")")

# Path of the test-set CSV; later cells pass this to prepare_test.
filepath = 'diamonds_test.csv'

def prepare_test(model, filepath):
    """Predict prices for the CSV at ``filepath`` and build a submission frame.

    The CSV is read with its first column as index, the ``cut``, ``color`` and
    ``clarity`` columns are converted with ``DiamondEncoder``, and every column
    up to and including ``z`` is handed to ``model.predict``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    filepath : str
        Location of the CSV to score.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (0 .. n-1, one per prediction) and ``price``.
    """
    grade_encoders = {
        'cut': DiamondEncoder.classify_cut,
        'color': DiamondEncoder.classify_color,
        'clarity': DiamondEncoder.classify_clarity,
    }

    test_df = pd.read_csv(filepath, index_col=0)
    for column, encode in grade_encoders.items():
        test_df[column] = test_df[column].apply(encode)

    features = test_df.loc[:, :'z']
    predicted_prices = model.predict(features)

    return pd.DataFrame(
        {"id": range(len(predicted_prices)), "price": predicted_prices}
    )

In [6]:
# Candidate values per CatBoost parameter (presumably meant for a
# hyper-parameter search; no cell in view consumes it -- TODO confirm).
cat_params = dict(
    depth=[3, 1, 2, 6, 4, 5, 7, 8, 9, 10],
    iterations=[250, 100, 500, 1000],
    learning_rate=[0.03, 0.001, 0.01, 0.1, 0.2, 0.3],
    l2_leaf_reg=[3, 1, 5, 10, 100],
    border_count=[32, 5, 10, 20, 50, 100, 200],
    bagging_temperature=[0.03, 0.09, 0.25, 0.75],
    random_strength=[0.2, 0.5, 0.8],
    max_ctr_complexity=[1, 2, 3, 4, 5],
    logging_level=['Silent'],
    random_seed=[42],
)

In [172]:
# CatBoost settings reused by several cells of this notebook.
params = dict(
    depth=6,
    learning_rate=0.1,
    l2_leaf_reg=5,
    iterations=1200,
    logging_level='Silent',
)

# Fit on the training split, then print train/test RMSE.
bobcat = CatBoost(params=params)
bobcat.fit(X_train, y_train)
get_RMSE(bobcat, X, y)

Train RMSE: 398.1334721897635
Test_RMSE: 527.7198351056487


In [132]:
# Same CatBoost settings as `params`, fitted on the X2 / X_train2 feature set.
# NOTE(review): X_train2, y_train2 and X2 are only assigned in a cell that
# appears further down this notebook, so on a fresh top-to-bottom run this
# cell fails with NameError unless that cell has been executed first.
wildcat = CatBoost(params=params)
wildcat.fit(X_train2, y_train2)
get_RMSE(wildcat, X2, y)

Train RMSE: 474.25206899680916
Test_RMSE: 535.5276824449721


In [267]:
# CatBoostRegressor is otherwise only imported in a later cell of this
# notebook; importing it here lets the cell run on a fresh kernel executed
# from top to bottom instead of raising NameError.
from catboost import CatBoostRegressor

# Same hyper-parameters as `params`, passed through the regressor class.
tiger = CatBoostRegressor(
    depth=6,
    learning_rate=0.1,
    l2_leaf_reg=5,
    iterations=1200,
    logging_level='Silent',
)
tiger.fit(X_train, y_train)
get_RMSE(tiger, X, y)

Train RMSE: 398.1334721897635
Test_RMSE: 527.7198351056487


In [111]:
# Persist the fitted `bobcat` model to disk with pickle.
filename = 'cat_d6_l01_lr5_iter1200.model'

# Binary mode is required for pickle; the context manager closes the file.
with open(filename, 'wb') as archivo_salida:
    pickle.dump(bobcat, archivo_salida)

In [19]:
def create_catsubmission(params, X, y, filepath='diamonds_test.csv'):
    """Fit a CatBoost model on ``(X, y)`` and produce a checked submission.

    A fresh ``CatBoost`` model is built from ``params`` and fitted on the data
    given. ``prepare_test`` then scores the CSV at ``filepath`` and
    ``chequeator`` validates (and, on success, saves) the result.

    Parameters
    ----------
    params : dict
        Keyword settings forwarded to ``CatBoost(params=...)``.
    X : array-like
        Training features.
    y : array-like
        Training targets.
    filepath : str, default 'diamonds_test.csv'
        CSV to predict on. It used to be hard-coded; the default keeps the
        previous behaviour for existing callers.

    Returns
    -------
    None
    """
    cat = CatBoost(params=params)
    cat.fit(X, y)
    submission = prepare_test(cat, filepath)
    chequeator(submission)

In [107]:
# Train CatBoost on the full training data (X, y) and build/check the submission.
create_catsubmission(params, X, y)

You're ready to submit!


In [112]:
# Fit CatBoost on the full training data (no hold-out split).
lion = CatBoost(params=params)
lion.fit(X, y)
# Predictions are made on the same rows the model was fitted on, so the value
# printed below is an in-sample RMSE, not a validation score.
y_pred = lion.predict(X)
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(rmse)

407.55604257776224


In [173]:
from ngboost import NGBRegressor

# NGBoost regressor with its default settings, fitted on the training split
# and scored with the shared RMSE helper.
ngb = NGBRegressor()
ngb.fit(X_train, y_train)
get_RMSE(ngb, X, y)

[iter 0] loss=9.7082 val_loss=0.0000 scale=1.0000 norm=3028.7801
[iter 100] loss=8.6330 val_loss=0.0000 scale=2.0000 norm=1514.2064
[iter 200] loss=7.8977 val_loss=0.0000 scale=2.0000 norm=911.6176
[iter 300] loss=7.4397 val_loss=0.0000 scale=1.0000 norm=380.6111
[iter 400] loss=7.2438 val_loss=0.0000 scale=1.0000 norm=356.4169
Train RMSE: 617.1586317861708
Test_RMSE: 644.1674372878247


In [236]:
from xgboost import XGBRegressor

# XGBoost regressor on the full feature set, fitted on the training split and
# scored with the shared RMSE helper.
xgb = XGBRegressor(eta=0.1, min_child_weight=10)
xgb.fit(X_train, y_train)
get_RMSE(xgb, X, y)

Train RMSE: 453.42890303547114
Test_RMSE: 526.2681919726277


In [257]:
# XGBoost regressor with max_depth=7, fitted on the X2 / X_train2 feature set.
# NOTE(review): X_train2, y_train2 and X2 are only assigned in a cell that
# appears further down this notebook; run that cell first on a fresh kernel.
xgb2 = XGBRegressor(eta=0.1, min_child_weight=10, max_depth=7)
xgb2.fit(X_train2, y_train2)
get_RMSE(xgb2, X2, y)

Train RMSE: 437.3916017052151
Test_RMSE: 528.4172639424986


In [261]:
# Sanity check: number of predictions xgb2 produces for X2.
len(xgb2.predict(X2))

40345

In [247]:
# Persist the fitted `xgb2` model to disk with pickle.
# NOTE(review): the filename suggests a model without depth/table features,
# which matches the X2 columns xgb2 was fitted on -- confirm the naming intent.
filename = 'xg_eta01_cweight10_NO_depthtable.model'

# Binary mode is required for pickle; the context manager closes the file.
with open(filename, 'wb') as archivo_salida:
    pickle.dump(xgb2, archivo_salida)

In [263]:
def create_xgbsubmission(X, y, filepath='diamonds_test.csv'):
    """Fit an XGBoost model on ``(X, y)`` and produce a checked submission.

    A fresh ``XGBRegressor(eta=0.1, min_child_weight=10, max_depth=7)`` is
    fitted on the data given. ``prepare_test_reduced`` then scores the CSV at
    ``filepath`` and ``chequeator`` validates (and, on success, saves) the
    result.

    Parameters
    ----------
    X : array-like
        Training features; ``prepare_test_reduced`` predicts on the columns
        carat, color, cut, clarity, x, y, z, so ``X`` is expected to hold the
        same features in that order.
    y : array-like
        Training targets.
    filepath : str, default 'diamonds_test.csv'
        CSV to predict on. It used to be hard-coded; the default keeps the
        previous behaviour for existing callers.

    Returns
    -------
    None
    """
    xg = XGBRegressor(eta=0.1, min_child_weight=10, max_depth=7)
    xg.fit(X, y)
    # NOTE(review): prepare_test_reduced is defined in a cell further down the
    # notebook; that cell must have been run before this function is called.
    submission = prepare_test_reduced(xg, filepath)
    chequeator(submission)

In [264]:
# Train XGBoost on the full X2 feature set and build/check the submission.
create_xgbsubmission(X2, y)

You're ready to submit!


In [256]:
# Second feature set: the encoded frame restricted to these seven columns
# (depth and table are left out), converted to a NumPy array.
X2 = np.array(df.loc[:, ['carat', 'color', 'cut', 'clarity', 'x', 'y', 'z']])

# Same 80/20 split and seed as the split used for the full feature set.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y,
    test_size=0.20,
    random_state=42,
)

In [262]:
def prepare_test_reduced(model, filepath):
    """Predict prices from a seven-column feature subset of a CSV.

    The CSV at ``filepath`` is read with its first column as index, the
    ``cut``, ``color`` and ``clarity`` columns are converted with
    ``DiamondEncoder``, and only carat, color, cut, clarity, x, y, z (in that
    order) are handed to ``model.predict``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    filepath : str
        Location of the CSV to score.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (0 .. n-1, one per prediction) and ``price``.
    """
    grade_encoders = {
        'cut': DiamondEncoder.classify_cut,
        'color': DiamondEncoder.classify_color,
        'clarity': DiamondEncoder.classify_clarity,
    }
    feature_names = ['carat', 'color', 'cut', 'clarity', 'x', 'y', 'z']

    test_df = pd.read_csv(filepath, index_col=0)
    for column, encode in grade_encoders.items():
        test_df[column] = test_df[column].apply(encode)

    features = test_df.loc[:, feature_names]
    predicted_prices = model.predict(features)

    return pd.DataFrame(
        {"id": range(len(predicted_prices)), "price": predicted_prices}
    )

In [269]:
from sklearn.ensemble import VotingRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

# Base models that the VotingRegressor below combines.
tomcat = CatBoostRegressor(
    depth=6,
    learning_rate=0.1,
    l2_leaf_reg=5,
    iterations=1200,
    logging_level='Silent',
)
# NOTE(review): newer scikit-learn releases renamed the 'mse' criterion to
# 'squared_error' -- confirm against the installed version before re-running.
grad = GradientBoostingRegressor(
    n_estimators=100,
    criterion='mse',
    min_samples_split=100,
    min_samples_leaf=5,
    max_depth=6,
)
rfr = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=10,
    random_state=42,
    min_samples_leaf=2,
    max_depth=20,
)

estimators = [
    ('cat', tomcat),
    ('rand_forest', rfr),
    ('grad', grad),
]

# Fit the ensemble on the training split and print train/test RMSE.
voting = VotingRegressor(estimators=estimators)
voting.fit(X_train, y_train)
get_RMSE(voting, X, y)

Train RMSE: 386.17795088904495
Test_RMSE: 520.5272973616463


In [None]:
# Persist the current `voting` ensemble to disk with pickle.
# NOTE(review): the final cell of this notebook writes to the same filename,
# so whichever of the two runs last overwrites the file.
filename = 'voting_cat_grad_forest.model'

# Binary mode is required for pickle; the context manager closes the file.
with open(filename, 'wb') as archivo_salida:
    pickle.dump(voting, archivo_salida)

In [270]:
filepath = 'diamonds_test.csv'
# Refit the ensemble on the full training data (X, y) before predicting on the
# test file; this replaces the fit previously done on the training split.
voting.fit(X, y)
submission = prepare_test(voting, filepath)
# Validate the submission and, if it passes, write submission.csv.
chequeator(submission)

You're ready to submit!


In [289]:
from sklearn.ensemble import GradientBoostingRegressor

# Stand-alone gradient boosting model, fitted on the training split and scored
# with the shared RMSE helper. This rebinds `grad` to a new, separately fitted
# estimator.
# NOTE(review): newer scikit-learn releases renamed the 'mse' criterion to
# 'squared_error' -- confirm against the installed version before re-running.
grad = GradientBoostingRegressor(n_estimators=100, criterion='mse', min_samples_split=100, min_samples_leaf=5, max_depth=6)
grad.fit(X_train, y_train)
get_RMSE(grad, X, y)

Train RMSE: 459.5824521686665
Test_RMSE: 528.8329622914896


In [295]:
# Alternative (name, estimator) line-ups for VotingRegressor.
# NOTE(review): `xgb_reg` is not assigned in any cell visible in this notebook
# (the XGBoost models above are named `xgb` and `xgb2`), so this cell raises
# NameError on a fresh kernel -- confirm which model was intended.
estimators_full = [('cat', tomcat), ('xgb', xgb_reg), ('rand_forest', rfr), ('grad', grad)]
estimators_simple = [('cat', tomcat), ('xgb', xgb_reg)]
estimators_notree = [('cat', tomcat), ('xgb', xgb_reg), ('grad', grad)]
estimators_catgrad = [('cat', tomcat), ('grad', grad)]
estimators_noxgb = [('cat', tomcat), ('rand_forest', rfr), ('grad', grad)]

In [299]:
# Rebuild the ensemble from the line-up without XGBoost (cat, rand_forest,
# grad), fit it on the training split and print train/test RMSE.
voting = VotingRegressor(estimators=estimators_noxgb)
voting.fit(X_train, y_train)
get_RMSE(voting, X, y)

Train RMSE: 388.0526718023417
Test_RMSE: 521.0298058811242


In [300]:
filepath = 'diamonds_test.csv'
# Refit the ensemble on the full training data (X, y) before predicting on the
# test file; this replaces the fit previously done on the training split.
voting.fit(X, y)
submission = prepare_test(voting, filepath)
# Validate the submission and, if it passes, write submission.csv.
chequeator(submission)

You're ready to submit!


In [301]:
# Persist the `voting` ensemble (refitted on the full data in the cell above)
# to disk with pickle.
filename = 'voting_cat_grad_forest.model'

# Binary mode is required for pickle; the context manager closes the file.
with open(filename, 'wb') as archivo_salida:
    pickle.dump(voting, archivo_salida)