In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics

from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

### Defino X,y (TRAIN):

In [2]:
train2=pd.read_csv("../data/train2.csv")

In [3]:
train2.sample()

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,price,cut_num,color_num,clarity_num
25807,25807,25807,2.04,60.9,57.0,14527,3,2,7


In [4]:
X=train2.drop(columns=["id","price","Unnamed: 0"])

In [5]:
y=train2.price

### Cargo X (TEST):

In [6]:
test2=pd.read_csv("../data/test2.csv")

In [7]:
test2.sample()

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,cut_num,color_num,clarity_num
2471,2471,2471,0.41,61.6,57.0,4,4,4


In [8]:
test_=test2.drop(columns=["id","Unnamed: 0"])

In [9]:
test_.sample()

Unnamed: 0,carat,depth,table,cut_num,color_num,clarity_num
4043,0.31,60.7,59.0,5,7,5


### Train, test, split:

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
X_train.shape

(32364, 6)

In [13]:
X_test.shape

(8091, 6)

In [14]:
y_train.shape

(32364,)

In [15]:
y_test.shape

(8091,)

### Modelos de Regresión (variables cuantitativas):

In [16]:
models={'lin':LinearRegression(),
    'ridge': Ridge(),
    'lasso': Lasso(),
    'sgd': SGDRegressor(),
    'knn': KNeighborsRegressor(),
    'grad': GradientBoostingRegressor(),
    'elas': ElasticNet(),
    'svr' :LinearSVR(),
    'tree': DecisionTreeRegressor(),
    'random': RandomForestRegressor()
        }

In [17]:
for name, model in models.items():
    print("ENTRENANDO: ", name)
    model.fit(X_train, y_train)

ENTRENANDO:  lin
ENTRENANDO:  ridge
ENTRENANDO:  lasso
ENTRENANDO:  sgd
ENTRENANDO:  knn
ENTRENANDO:  grad
ENTRENANDO:  elas
ENTRENANDO:  svr
ENTRENANDO:  tree
ENTRENANDO:  random


In [18]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"------{name}------")
    print('RMSE - ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

------lin------
RMSE -  1219.003617811825
------ridge------
RMSE -  1218.9872996393444
------lasso------
RMSE -  1218.929337162761
------sgd------
RMSE -  5342618.683750766
------knn------
RMSE -  1927.578129993123
------grad------
RMSE -  614.0703449914851
------elas------
RMSE -  2934.5413659331334
------svr------
RMSE -  1868.7548519191039
------tree------
RMSE -  733.5484455036707
------random------
RMSE -  548.1042944986865


### Me quedo solo con los mejores modelos, es decir, con aquellos cuyo RMSE sea menor  (grad, tree, random) y los entreno de nuevo y predigo con todo "train".

In [22]:
best_models={
    'grad': GradientBoostingRegressor(),
    'tree': DecisionTreeRegressor(),
    'random': RandomForestRegressor()
        }

In [23]:
for name, model in best_models.items():
    print("ENTRENANDO: ", name)
    model.fit(X, y)

ENTRENANDO:  grad
ENTRENANDO:  tree
ENTRENANDO:  random


In [24]:
for name, model in best_models.items():
    y_pred_X = model.predict(X)
    print(f"------{name}------")
    print('RMSE - ', np.sqrt(metrics.mean_squared_error(y, y_pred_X)))

------grad------
RMSE -  610.7211689205222
------tree------
RMSE -  55.99749897463047
------random------
RMSE -  209.6784396489439


###  Ahora me quedo únicamente con el modelo "tree" y hago la predicción con "test":

In [37]:
tree=best_models["tree"]

In [38]:
y_pred_test=tree.predict(test_)

In [39]:
test2["price"]=y_pred_test

In [42]:
submission1=test2.drop(columns=["Unnamed: 0","carat","depth","table","cut_num","color_num","clarity_num","y_pred"])

In [43]:
submission1.sample()

Unnamed: 0,id,price
39,39,923.0


In [44]:
submission1.to_csv("../data/myfirstsubmissionf.csv", index=False)

### Intento mejorar mi resultado con los súper parámetros:

In [49]:
paremeters_tree = {'ccp_alpha': [0.0, 0.1, 0.2],
                   'criterion': ["squared_error", "friedman_mse", "absolute_error", "poisson"], 
                   'max_depth': [0,1,2,3,4,5,None], 
                   'max_features': ["auto", "sqrt", "log2", None],
                   'max_leaf_nodes': [0,1,2,3,4,5,None], 
                   'min_impurity_decrease': [0.0, 0.1, 0.2],
                   'min_samples_leaf': [1,2,3,4,5], 
                   'min_samples_split': [1,2,3,4,5],
                   'min_weight_fraction_leaf': [0.0, 0.1, 0.2],
                   'splitter': ['best', 'random']
                  }

In [51]:
tree=best_models["tree"]

In [52]:
grid_tree = GridSearchCV(tree, paremeters_tree, verbose=1)

In [None]:
grid_tree.fit(X_train,y_train)

Fitting 5 folds for each of 1058400 candidates, totalling 5292000 fits
