In [1]:
import numpy as np
import pandas as pd

import urllib.request
from PIL import Image

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
diamond_df = pd.read_csv("diamonds_train.csv", index_col=0)

In [3]:
diamond_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.21,Ideal,H,VS2,63.0,57.0,6.73,6.7,4.23,6134
1,0.28,Very Good,D,VVS2,64.0,56.0,4.14,4.17,2.66,532
2,0.42,Premium,F,VS1,61.2,58.0,4.86,4.82,2.96,1103
3,0.26,Ideal,H,IF,61.1,57.0,4.16,4.12,2.53,600
4,1.1,Good,G,SI1,63.4,57.0,6.52,6.55,4.14,4997


In [41]:
diamond_df.cut.unique()

array(['Ideal', 'Very Good', 'Premium', 'Good', 'Fair'], dtype=object)

In [42]:
diamond_df.color.unique()

array(['H', 'D', 'F', 'G', 'I', 'E', 'J'], dtype=object)

In [43]:
diamond_df.clarity.unique()

array(['VS2', 'VVS2', 'VS1', 'IF', 'SI1', 'SI2', 'VVS1', 'I1'],
      dtype=object)

In [4]:
class DiamondEncoder:

    @staticmethod
    def classify_cut(x):
        if x == 'Ideal':
            return 10
        elif x == 'Premium':
            return 9
        elif x == 'Very Good':
            return 8
        elif x == 'Good':
            return 6.5
        else:
            return 4

    @staticmethod
    def classify_color(x):
        if x == 'D':
            return 10
        elif x == 'E':
            return 9.5
        elif x == 'F':
            return 9
        elif x == 'G':
            return 8.5
        elif x == 'H':
            return 8
        elif x == 'I':
            return 7.5
        else:
            return 7

    @staticmethod
    def classify_clarity(x):
        if x == 'IF':
            return 10
        elif x == 'VVS1':
            return 9
        elif x == 'VVS2':
            return 8
        elif x == 'VS1':
            return 7
        elif x == 'VS2':
            return 6
        elif x == 'SI1':
            return 5
        elif x == 'SI2':
            return 3.5
        else:
            return 2.5

In [22]:
df = diamond_df.copy()
df.cut = df.cut.apply(DiamondEncoder.classify_cut)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.21,10.0,H,VS2,63.0,57.0,6.73,6.7,4.23,6134
1,0.28,8.0,D,VVS2,64.0,56.0,4.14,4.17,2.66,532
2,0.42,9.0,F,VS1,61.2,58.0,4.86,4.82,2.96,1103
3,0.26,10.0,H,IF,61.1,57.0,4.16,4.12,2.53,600
4,1.1,6.5,G,SI1,63.4,57.0,6.52,6.55,4.14,4997


In [23]:
df.color = df.color.apply(DiamondEncoder.classify_color)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.21,10.0,8.0,VS2,63.0,57.0,6.73,6.7,4.23,6134
1,0.28,8.0,10.0,VVS2,64.0,56.0,4.14,4.17,2.66,532
2,0.42,9.0,9.0,VS1,61.2,58.0,4.86,4.82,2.96,1103
3,0.26,10.0,8.0,IF,61.1,57.0,4.16,4.12,2.53,600
4,1.1,6.5,8.5,SI1,63.4,57.0,6.52,6.55,4.14,4997


In [24]:
df.clarity = df.clarity.apply(DiamondEncoder.classify_clarity)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.21,10.0,8.0,6.0,63.0,57.0,6.73,6.7,4.23,6134
1,0.28,8.0,10.0,8.0,64.0,56.0,4.14,4.17,2.66,532
2,0.42,9.0,9.0,7.0,61.2,58.0,4.86,4.82,2.96,1103
3,0.26,10.0,8.0,10.0,61.1,57.0,4.16,4.12,2.53,600
4,1.1,6.5,8.5,5.0,63.4,57.0,6.52,6.55,4.14,4997


In [25]:
df.dtypes

carat      float64
cut        float64
color      float64
clarity    float64
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object

In [26]:
X = np.array(df.loc[:, :'z'])
y = np.array(df.price)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [28]:
lm = LinearRegression()

In [29]:
lm.fit(X_train, y_train)
lm.score(X_train, y_train)

0.9108546712559104

In [30]:
lm.score(X_test, y_test)

0.9130776314782224

In [31]:
def get_RMSE(model, X, y, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
    y_pred = model.predict(X_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred))
    y_pred_test = model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    print('Train RMSE:', train_rmse)
    print('Test_RMSE:', test_rmse)

get_RMSE(lm, X, y)

Train RMSE: 1188.591931255598
Test_RMSE: 1175.1838902646157


In [32]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [15]:
degree = 2
poly_reg = Pipeline(steps=[
    ('features', PolynomialFeatures(degree=degree)),
    ('linreg', LinearRegression())
])

poly_params = {'features__degree':[2, 3, 4, 5, 6, 7, 8, 9, 10]}

gs_poly = GridSearchCV(poly_reg,
                         poly_params,
                         cv = 10,
                         scoring = 'r2',
                         verbose=100,
                         n_jobs=-1,
                         error_score='raise')

In [74]:
gs_poly.fit(X_train, y_train)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


GridSearchCV(cv=10, error_score='raise',
             estimator=Pipeline(steps=[('features', PolynomialFeatures()),
                                       ('linreg', LinearRegression())]),
             n_jobs=-1,
             param_grid={'features__degree': [2, 3, 4, 5, 6, 7, 8, 9, 10]},
             scoring='r2', verbose=100)

In [16]:
gs_poly.best_estimator_.score(X_train, y_train)

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [80]:
get_RMSE(gs_poly.best_estimator_, X, y)

Train RMSE: 1430.9911243498082
Test_RMSE: 1605.141958404413


In [17]:
poly_reg = Pipeline(steps=[
    ('features', PolynomialFeatures(degree=2)),
    ('linreg', LinearRegression())
])

poly_reg.fit(X_train, y_train)
poly_reg.score(X_train, y_train)

0.964257375496373

In [18]:
get_RMSE(poly_reg, X, y)

Train RMSE: 752.621640440491
Test_RMSE: 765.1168507012866


In [24]:
poly_reg.fit(X, y)

Pipeline(steps=[('features', PolynomialFeatures()),
                ('linreg', LinearRegression())])

In [33]:
def chequeator(df_to_submit):
    """
    Esta función se asegura de que tu submission tenga la forma requerida por Kaggle.
    
    Si es así, se guardará el dataframe en un `csv` y estará listo para subir a Kaggle.
    
    Si no, LEE EL MENSAJE Y HAZLE CASO.
    
    Si aún no:
    - apaga tu ordenador, 
    - date una vuelta, 
    - enciendelo otra vez, 
    - abre este notebook y 
    - leelo todo de nuevo. 
    Todos nos merecemos una segunda oportunidad. También tú.
    """
    sample = pd.read_csv("sample_submission.csv")
    if df_to_submit.shape == sample.shape:
        if df_to_submit.columns.all() == sample.columns.all():
            if df_to_submit.id.all() == sample.id.all():
                print("You're ready to submit!")
                df_to_submit.to_csv("submission.csv", index = False) #muy importante el index = False
                urllib.request.urlretrieve("https://i.kym-cdn.com/photos/images/facebook/000/747/556/27a.jpg", "gfg.png")     
                img = Image.open("gfg.png")
                img.show()   
            else:
                print("Check the ids and try again")
        else:
            print("Check the names of the columns and try again")
    else:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Clara: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `diamonds_test.csv`. Lloro.")

In [34]:
filepath = 'diamonds_test.csv'

def prepare_test(model, filepath):
    df = pd.read_csv(filepath, index_col=0)
    df.cut = df.cut.apply(DiamondEncoder.classify_cut)
    df.color = df.color.apply(DiamondEncoder.classify_color)
    df.clarity = df.clarity.apply(DiamondEncoder.classify_clarity)
    X = df.loc[:, :'z']

    predictions_submit = model.predict(X)
    submission = pd.DataFrame({"id": range(len(predictions_submit)), "price": predictions_submit})
    return submission

In [30]:
submission = prepare_test(poly_reg, filepath)
chequeator(submission)

You're ready to submit!


In [35]:
import pickle
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

svr_model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

svr_params = {
    'svr__kernel':['rbf','linear','poly', 'sigmoid'],
    'svr__C':[0.001, 0.1, 0.5, 1, 5, 10],
    'svr__gamma':('scale', 'auto')
}

gs_svr = GridSearchCV(svr_model,
                         svr_params,
                         cv = 10,
                         scoring = 'r2',
                         verbose=100,
                         n_jobs=-1,
                         error_score='raise')

In [18]:
gs_svr.fit(X_train, y_train)

filename = 'svr_model.model'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(gs_svr, archivo_salida)

Fitting 10 folds for each of 48 candidates, totalling 480 fits


In [20]:
gs_svr.best_estimator_.score(X_train, y_train)

0.8892195037995919

In [21]:
best_svr = gs_svr.best_estimator_
get_RMSE(best_svr, X, y)

Train RMSE: 1324.9977333320358
Test_RMSE: 1310.472448793934


In [22]:
len(df)

40345

In [24]:
reduced_df = df.loc[:999]
reduced_df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.21,10.0,8.0,6.0,63.0,57.0,6.73,6.70,4.23,6134
1,0.28,8.0,10.0,8.0,64.0,56.0,4.14,4.17,2.66,532
2,0.42,9.0,9.0,7.0,61.2,58.0,4.86,4.82,2.96,1103
3,0.26,10.0,8.0,10.0,61.1,57.0,4.16,4.12,2.53,600
4,1.10,6.5,8.5,5.0,63.4,57.0,6.52,6.55,4.14,4997
...,...,...,...,...,...,...,...,...,...,...
995,1.50,4.0,9.0,6.0,65.3,57.0,7.15,7.12,4.66,13853
996,0.50,8.0,9.0,5.0,62.7,61.0,5.01,5.07,3.16,1197
997,1.20,10.0,8.5,6.0,61.9,57.0,6.86,6.83,4.24,7930
998,0.72,8.0,8.5,5.0,63.1,57.0,5.71,5.67,3.59,2385


In [36]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from catboost import CatBoost

pipe = Pipeline(steps=[
    ('model', RandomForestRegressor())
])

forest_params = {'model': [RandomForestRegressor()]}
xg_params = {'model': [xgb.XGBClassifier()]}
cat_params = {'model': [CatBoost()]}

search_space = [forest_params, xg_params, cat_params]

gs_top = GridSearchCV(pipe,
                         search_space,
                         cv = 10,
                         scoring = 'r2',
                         verbose=100,
                         n_jobs=-1,
                         error_score='raise')

In [33]:
reduced_df = df.loc[:999]
Xr = np.array(reduced_df.loc[:, :'z'])
yr = np.array(reduced_df.price)

X_trainR, X_testR, y_trainR, y_testR = train_test_split(Xr, yr, test_size=0.2, random_state=42)

In [42]:
gs_top.fit(X_trainR, y_trainR)

filename = 'top_models.model'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(gs_top, archivo_salida)

In [35]:
gs_top.best_estimator_

Pipeline(steps=[('model',
                 <catboost.core.CatBoost object at 0x000001BA1492CF48>)])

In [37]:
cat_model = gs_top.best_estimator_

In [38]:
get_RMSE(cat_model, X, y)

Train RMSE: 723.4527377062775
Test_RMSE: 740.1343200043981


In [39]:
cat = CatBoost()
cat.fit(X_train, y_train)

aining: 1.88s
629:	learn: 451.4336937	total: 3.2s	remaining: 1.88s
630:	learn: 451.3912767	total: 3.2s	remaining: 1.87s
631:	learn: 451.3904137	total: 3.21s	remaining: 1.87s
632:	learn: 451.3763739	total: 3.21s	remaining: 1.86s
633:	learn: 451.2771775	total: 3.21s	remaining: 1.85s
634:	learn: 451.1745167	total: 3.22s	remaining: 1.85s
635:	learn: 451.0535349	total: 3.22s	remaining: 1.84s
636:	learn: 450.9399898	total: 3.23s	remaining: 1.84s
637:	learn: 450.8316229	total: 3.23s	remaining: 1.83s
638:	learn: 450.7874885	total: 3.24s	remaining: 1.83s
639:	learn: 450.5973186	total: 3.24s	remaining: 1.82s
640:	learn: 450.4768592	total: 3.25s	remaining: 1.82s
641:	learn: 450.3280631	total: 3.25s	remaining: 1.81s
642:	learn: 450.2118002	total: 3.25s	remaining: 1.81s
643:	learn: 450.1076077	total: 3.26s	remaining: 1.8s
644:	learn: 450.0428544	total: 3.26s	remaining: 1.8s
645:	learn: 449.9551632	total: 3.27s	remaining: 1.79s
646:	learn: 449.8426866	total: 3.27s	remaining: 1.78s
647:	learn: 449.77

<catboost.core.CatBoost at 0x1ba15242bc8>

In [41]:
get_RMSE(cat, X_train, y_train)

Train RMSE: 420.9127594108012
Test_RMSE: 412.4890222273193


In [43]:
filename = 'cat_base.model'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(cat, archivo_salida)

In [44]:
cat.fit(X, y)
submission = prepare_test(cat, filepath)
chequeator(submission)

9:	learn: 455.4187519	total: 3.52s	remaining: 2.07s
630:	learn: 455.2628827	total: 3.52s	remaining: 2.06s
631:	learn: 455.1407324	total: 3.53s	remaining: 2.06s
632:	learn: 455.0490535	total: 3.53s	remaining: 2.05s
633:	learn: 454.9048733	total: 3.54s	remaining: 2.04s
634:	learn: 454.7466855	total: 3.54s	remaining: 2.04s
635:	learn: 454.6272933	total: 3.55s	remaining: 2.03s
636:	learn: 454.5247510	total: 3.56s	remaining: 2.03s
637:	learn: 454.4685284	total: 3.56s	remaining: 2.02s
638:	learn: 454.3012185	total: 3.57s	remaining: 2.01s
639:	learn: 454.2298746	total: 3.57s	remaining: 2.01s
640:	learn: 454.1529068	total: 3.58s	remaining: 2s
641:	learn: 454.0820214	total: 3.58s	remaining: 2s
642:	learn: 454.0160184	total: 3.59s	remaining: 1.99s
643:	learn: 453.9521076	total: 3.59s	remaining: 1.99s
644:	learn: 453.8630282	total: 3.6s	remaining: 1.98s
645:	learn: 453.7764480	total: 3.6s	remaining: 1.98s
646:	learn: 453.7234203	total: 3.61s	remaining: 1.97s
647:	learn: 453.5861813	total: 3.62s	r

In [47]:
from catboost import CatBoost
from sklearn.metrics import make_scorer, accuracy_score

model = CatBoost()
scorer = make_scorer(accuracy_score)

cat_params = {'depth':[3,1,2,6,4,5,7,8,9,10],
              'iterations':[250,100,500,1000],
              'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3],
              'l2_leaf_reg':[3,1,5,10,100],
              'border_count':[32,5,10,20,50,100,200],
              'bagging_temperature':[0.03,0.09,0.25,0.75],
              'random_strength':[0.2,0.5,0.8],
              'max_ctr_complexity':[1,2,3,4,5],
              'logging_level': ['Silent'],
              'random_seed': [42] }

gs_cat = GridSearchCV(model,
                         cat_params,
                         cv = 10,
                         scoring = scorer,
                         verbose=1,
                         n_jobs=-1)

In [48]:
gs_cat.fit(X_train, y_train)
filename = 'cats.model'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(cat, archivo_salida)

Fitting 10 folds for each of 504000 candidates, totalling 5040000 fits


KeyboardInterrupt: 

In [37]:
params = {
        'depth': 3,
        'logging_level': 'Silent'
        }
bobcat = CatBoost(params=params)
bobcat.fit(X_train, y_train)
get_RMSE(bobcat, X_train, y_train)

In [None]:
get_RMSE(bobcat, X_train, y_train)