In [1]:
import numpy as np
import pandas as pd

import urllib.request
from PIL import Image
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Load the training data; the first CSV column is used as the row index.
diamond_df = pd.read_csv("diamonds_train.csv", index_col=0)
# Show the first rows as this cell's output.
diamond_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.21,Ideal,H,VS2,63.0,57.0,6.73,6.7,4.23,6134
1,0.28,Very Good,D,VVS2,64.0,56.0,4.14,4.17,2.66,532
2,0.42,Premium,F,VS1,61.2,58.0,4.86,4.82,2.96,1103
3,0.26,Ideal,H,IF,61.1,57.0,4.16,4.12,2.53,600
4,1.1,Good,G,SI1,63.4,57.0,6.52,6.55,4.14,4997


In [3]:
class DiamondEncoder:
    """Map the categorical diamond grades onto fixed numeric scores.

    Each classifier looks its argument up in a grade -> score table. Any
    value that is not listed in the table receives that classifier's
    fallback score (the lowest score of the scale).
    """

    # Grade -> score tables; unlisted grades are handled by the fallback
    # passed to ``dict.get`` in each classifier.
    _CUT_SCORES = {
        'Ideal': 10,
        'Premium': 9,
        'Very Good': 8,
        'Good': 6.5,
    }
    _COLOR_SCORES = {
        'D': 10,
        'E': 9.5,
        'F': 9,
        'G': 8.5,
        'H': 8,
        'I': 7.5,
    }
    _CLARITY_SCORES = {
        'IF': 10,
        'VVS1': 9,
        'VVS2': 8,
        'VS1': 7,
        'VS2': 6,
        'SI1': 5,
        'SI2': 3.5,
    }

    @staticmethod
    def classify_cut(x):
        """Return the score for cut grade ``x``; unlisted grades score 4."""
        return DiamondEncoder._CUT_SCORES.get(x, 4)

    @staticmethod
    def classify_color(x):
        """Return the score for color grade ``x``; unlisted grades score 7."""
        return DiamondEncoder._COLOR_SCORES.get(x, 7)

    @staticmethod
    def classify_clarity(x):
        """Return the score for clarity grade ``x``; unlisted grades score 2.5."""
        return DiamondEncoder._CLARITY_SCORES.get(x, 2.5)

In [4]:
# Encode the three categorical grade columns on a new frame, leaving
# diamond_df itself untouched.
df = diamond_df.assign(
    cut=diamond_df['cut'].apply(DiamondEncoder.classify_cut),
    color=diamond_df['color'].apply(DiamondEncoder.classify_color),
    clarity=diamond_df['clarity'].apply(DiamondEncoder.classify_clarity),
)

# Features: every column up to and including 'z'. Target: 'price'.
X = np.array(df.loc[:, :'z'])
y = np.array(df['price'])

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [5]:
def get_RMSE(model, X, y, random_state=42):
    """Print the train and test RMSE of an already fitted model.

    ``X`` and ``y`` are re-split 80/20 with ``train_test_split`` and the
    model's predictions on each part are scored. Nothing is fitted here.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Full feature matrix (train and test rows together).
    y : array-like
        Target values aligned with ``X``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split``. Use the same seed that
        produced the split the model was fitted on; with a different seed,
        rows the model was trained on can end up in the "test" part.

    Returns
    -------
    None
        The two scores are printed, not returned.
    """
    # Forward the caller's seed; it used to be ignored in favour of a
    # hard-coded 42.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=random_state
    )
    train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
    test_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    print('Train RMSE:', train_rmse)
    print('Test_RMSE:', test_rmse)

def chequeator(df_to_submit):
    """
    Make sure the submission has the form Kaggle requires.

    The frame is compared against ``sample_submission.csv`` (read from the
    working directory): same shape, same column names in the same order and
    the same ``id`` values in the same order.

    If it does, the dataframe is saved to ``submission.csv`` and is ready to
    be uploaded to Kaggle. A celebration image is then downloaded and shown
    on a best-effort basis; failing to fetch it does not undo the save.

    If it does not, READ THE MESSAGE AND DO WHAT IT SAYS.

    If it still fails:
    - turn your computer off,
    - go for a walk,
    - turn it on again,
    - open this notebook and
    - read everything again.
    We all deserve a second chance. So do you.

    Parameters
    ----------
    df_to_submit : pandas.DataFrame
        Candidate submission.

    Returns
    -------
    None
        The outcome is reported with ``print``.
    """
    sample = pd.read_csv("sample_submission.csv")

    if df_to_submit.shape != sample.shape:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Clara: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `diamonds_test.csv`. Lloro.")
        return

    # Compare the labels themselves. ``columns.all()`` only reports whether
    # every label is truthy, so any two sets of non-empty names looked equal.
    if list(df_to_submit.columns) != list(sample.columns):
        print("Check the names of the columns and try again")
        return

    # Same pitfall for the ids: ``id.all()`` is just "no id is zero", which
    # says nothing about whether the two id columns match.
    if df_to_submit["id"].tolist() != sample["id"].tolist():
        print("Check the ids and try again")
        return

    print("You're ready to submit!")
    # index=False is essential: the row index must not become an extra column.
    df_to_submit.to_csv("submission.csv", index=False)

    # The image is only a reward; a missing network connection or an
    # unreadable download must not look like a failed submission.
    try:
        urllib.request.urlretrieve("https://i.kym-cdn.com/photos/images/facebook/000/747/556/27a.jpg", "gfg.png")
        with Image.open("gfg.png") as img:
            img.show()
    except OSError as exc:
        print("submission.csv was saved, but the celebration image could not be shown:", exc)

# Default location of the Kaggle test set.
filepath = 'diamonds_test.csv'

def prepare_test(model, filepath):
    """Predict prices for the test CSV and build the submission frame.

    The file is read with its first column as the index, the ``cut``,
    ``color`` and ``clarity`` columns are encoded with ``DiamondEncoder``,
    and every column up to and including ``z`` is used as a feature.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    filepath : str or path-like
        CSV file to score.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``id`` (0 .. n-1, in file order) and ``price`` (the
        model's predictions).
    """
    df = pd.read_csv(filepath, index_col=0)
    df['cut'] = df['cut'].apply(DiamondEncoder.classify_cut)
    df['color'] = df['color'].apply(DiamondEncoder.classify_color)
    df['clarity'] = df['clarity'].apply(DiamondEncoder.classify_clarity)

    # Hand the model a plain NumPy array, exactly like the training features
    # were built, instead of a labelled DataFrame: an estimator fitted on an
    # unlabelled array should be queried with the same kind of input.
    X = np.array(df.loc[:, :'z'])

    predictions_submit = model.predict(X)
    submission = pd.DataFrame({"id": range(len(predictions_submit)), "price": predictions_submit})

    return submission

In [10]:
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, VotingRegressor

# Three tree-based regressors combined by a VotingRegressor.
wildcat = CatBoostRegressor(depth=6, learning_rate=0.1, l2_leaf_reg=5, iterations=1200, logging_level='Silent')
# criterion: 'mse' was deprecated in scikit-learn 1.0 and removed in 1.2;
# 'squared_error' is the equivalent current spelling.
# random_state: fixed, like the random forest's, so reruns build the same trees.
graduate = GradientBoostingRegressor(n_estimators=100, criterion='squared_error', min_samples_split=100, min_samples_leaf=5, max_depth=6, random_state=42)
forest_gump = RandomForestRegressor(n_estimators=100, min_samples_split=10, random_state=42, min_samples_leaf=2, max_depth=20)

estimators = [('cat', wildcat), ('rand_forest', forest_gump), ('grad', graduate)]

# Fit on the training split only, then report train/test RMSE. get_RMSE
# re-creates the split from the full X, y with its default seed of 42, the
# same seed used to build X_train / X_test.
voting = VotingRegressor(estimators=estimators)
voting.fit(X_train, y_train)
get_RMSE(voting, X, y)

Train RMSE: 388.05267180234176
Test_RMSE: 521.0310978701873


In [7]:
# Serialize the ensemble currently bound to `voting` to disk with pickle.
filename = 'voting_cat_grad_forest.model'

with open(filename, 'wb') as model_file:
    pickle.Pickler(model_file).dump(voting)

In [9]:
# Kaggle test set to score.
filepath = 'diamonds_test.csv'
# Refit the ensemble on every row of X and y (not just the training split)
# before predicting the test set.
# NOTE(review): this refits `voting` in place, so from here on it is no longer
# the train-split model that get_RMSE evaluated, and a model pickled before
# this line is not the one that produces the submission - confirm this is
# intended.
voting.fit(X, y)
# Predict the test set and wrap the predictions in an id/price frame.
submission = prepare_test(voting, filepath)
# Validate against sample_submission.csv and, if valid, write submission.csv.
chequeator(submission)

You're ready to submit!
