In [61]:
import pandas as pd
import numpy as np
from sklearn import metrics

from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error

### Defino X,y (TRAIN):

In [2]:
train2=pd.read_csv("../data/train2.csv")

In [3]:
train2.sample()

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,price,cut_num,color_num,clarity_num
37171,37171,37171,0.7,62.2,57.0,4458,4,5,2


In [4]:
X=train2.drop(columns=["id","price","Unnamed: 0"])

In [5]:
y=train2.price

### Cargo X (TEST):

In [6]:
test2=pd.read_csv("../data/test2.csv")

In [7]:
test2.sample()

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,cut_num,color_num,clarity_num
13182,13182,13182,1.11,61.7,58.0,5,3,5


In [8]:
test_=test2.drop(columns=["id","Unnamed: 0"])

In [9]:
test_.sample()

Unnamed: 0,carat,depth,table,cut_num,color_num,clarity_num
5334,1.51,60.8,66.0,1,4,4


### Train, test, split:

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
X_train.shape

(32364, 6)

In [13]:
X_test.shape

(8091, 6)

In [14]:
y_train.shape

(32364,)

In [15]:
y_test.shape

(8091,)

### Modelos de Regresión (variables cuantitativas):

In [16]:
models={'lin':LinearRegression(),
    'ridge': Ridge(),
    'lasso': Lasso(),
    'sgd': SGDRegressor(),
    'knn': KNeighborsRegressor(),
    'grad': GradientBoostingRegressor(),
    'elas': ElasticNet(),
    'svr' :LinearSVR(),
    'tree': DecisionTreeRegressor(),
    'random': RandomForestRegressor()
        }

In [17]:
for name, model in models.items():
    print("ENTRENANDO: ", name)
    model.fit(X_train, y_train)

ENTRENANDO:  lin
ENTRENANDO:  ridge
ENTRENANDO:  lasso
ENTRENANDO:  sgd
ENTRENANDO:  knn
ENTRENANDO:  grad
ENTRENANDO:  elas
ENTRENANDO:  svr
ENTRENANDO:  tree
ENTRENANDO:  random


In [18]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"------{name}------")
    print('RMSE - ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

------lin------
RMSE -  1212.8179828577904
------ridge------
RMSE -  1212.8138053307223
------lasso------
RMSE -  1212.790532377083
------sgd------
RMSE -  90339331.09047014
------knn------
RMSE -  1870.2794940134886
------grad------
RMSE -  623.495259563058
------elas------
RMSE -  2932.7134067910974
------svr------
RMSE -  1868.0627781424344
------tree------
RMSE -  736.9784922311752
------random------
RMSE -  540.0838678111652


### Me quedo solo con los mejores modelos, es decir, con aquellos cuyo RMSE sea menor  (grad, tree, random) y los entreno de nuevo y predigo con todo "train".

In [19]:
best_models={
    'grad': GradientBoostingRegressor(),
    'tree': DecisionTreeRegressor(),
    'random': RandomForestRegressor()
        }

In [20]:
for name, model in best_models.items():
    print("ENTRENANDO: ", name)
    model.fit(X, y)

ENTRENANDO:  grad
ENTRENANDO:  tree
ENTRENANDO:  random


In [21]:
for name, model in best_models.items():
    y_pred_X = model.predict(X)
    print(f"------{name}------")
    print('RMSE - ', np.sqrt(metrics.mean_squared_error(y, y_pred_X)))

------grad------
RMSE -  610.7211689205226
------tree------
RMSE -  55.99749897463047
------random------
RMSE -  209.18911832504435


###  Ahora me quedo únicamente con el modelo "tree" y hago la predicción con "test":

In [45]:
tree=best_models["tree"]

In [46]:
y_pred_test=tree.predict(test_)

In [47]:
test2["price"]=y_pred_test

In [27]:
submission1=test2.drop(columns=["Unnamed: 0","carat","depth","table","cut_num","color_num","clarity_num"])

In [28]:
submission1.sample()

Unnamed: 0,id,price
1823,1823,1141.0


In [29]:
submission1.to_csv("../data/myfirstsubmissionf.csv", index=False)

### Intento mejorar mi resultado con los súper parámetros:

In [30]:
paremeters_tree = {'ccp_alpha': [0.0, 0.1, 0.2],
                   'criterion': ["squared_error", "friedman_mse", "absolute_error", "poisson"], 
                   'max_depth': [0,1,2,3,4,5,None], 
                   'max_features': ["auto", "sqrt", "log2", None],
                   'max_leaf_nodes': [0,1,2,3,4,5,None], 
                   'min_impurity_decrease': [0.0, 0.1, 0.2],
                   'min_samples_leaf': [1,2,3,4,5], 
                   'min_samples_split': [1,2,3,4,5],
                   'min_weight_fraction_leaf': [0.0, 0.1, 0.2],
                   'splitter': ['best', 'random']
                  }

In [31]:
paremeters_tree_red = {'ccp_alpha': [0.0, 0.1, 0.2],
                        'criterion': ["squared_error", "friedman_mse", "absolute_error", "poisson"], 
                        'max_depth': [0,1,3,5,None], 
                  }

In [32]:
tree=best_models["tree"]

In [42]:
grid_tree = GridSearchCV(tree, paremeters_tree_red, verbose=1)

In [34]:
grid_tree.fit(X_train,y_train) #Hago Cross Validation a la par que lo entreno con todas las combinaciones para luego obetener sus mejores parámetros.

Fitting 5 folds for each of 60 candidates, totalling 300 fits


GridSearchCV(estimator=DecisionTreeRegressor(),
             param_grid={'ccp_alpha': [0.0, 0.1, 0.2],
                         'criterion': ['squared_error', 'friedman_mse',
                                       'absolute_error', 'poisson'],
                         'max_depth': [0, 1, 3, 5, None]},
             verbose=1)

In [35]:
grid_tree.best_params_ #obtengo los mejores parámetros de este modelo.

{'ccp_alpha': 0.2, 'criterion': 'absolute_error', 'max_depth': None}

In [36]:
best_tree=DecisionTreeRegressor(ccp_alpha= 0.2, criterion= 'absolute_error', max_depth= None, splitter='best')

In [37]:
best_tree.fit(X,y)

DecisionTreeRegressor(ccp_alpha=0.2, criterion='absolute_error')

In [50]:
y_pred_best=best_tree.predict(test_)

In [55]:
test2["price"]=y_pred_best

In [56]:
test2.sample()

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,cut_num,color_num,clarity_num,price,y_pred_best
2809,2809,2809,0.91,64.0,61.0,2,6,6,4304.0,4304.0


In [57]:
submission2=test2.drop(columns=["Unnamed: 0","carat","depth","table","cut_num","color_num","clarity_num","y_pred_best"])

In [59]:
submission2.sample()

Unnamed: 0,id,price
3890,3890,4544.0
