In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics

from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

### Defino X,y (TRAIN):

In [2]:
train2=pd.read_csv("../data/train2.csv")

In [3]:
train2.sample()

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,price,cut_num,color_num,clarity_num
25807,25807,25807,2.04,60.9,57.0,14527,3,2,7


In [4]:
X=train2.drop(columns=["id","price","Unnamed: 0"])

In [5]:
y=train2.price

### Cargo X (TEST):

In [6]:
test2=pd.read_csv("../data/test2.csv")

In [7]:
test2.sample()

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,cut_num,color_num,clarity_num
2471,2471,2471,0.41,61.6,57.0,4,4,4


In [8]:
test_=test2.drop(columns=["id","Unnamed: 0"])

In [9]:
test_.sample()

Unnamed: 0,carat,depth,table,cut_num,color_num,clarity_num
4043,0.31,60.7,59.0,5,7,5


### Train, test, split:

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
X_train.shape

(32364, 6)

In [13]:
X_test.shape

(8091, 6)

In [14]:
y_train.shape

(32364,)

In [15]:
y_test.shape

(8091,)

### GridSearchCV de sklearn - Modelos de Regresión (variables cuantitativas).

In [16]:
models={'lin':LinearRegression(),
    'ridge': Ridge(),
    'lasso': Lasso(),
    'sgd': SGDRegressor(),
    'knn': KNeighborsRegressor(),
    'grad': GradientBoostingRegressor(),
    'elas': ElasticNet(),
    'svr' :LinearSVR(),
    'tree': DecisionTreeRegressor(),
    'random': RandomForestRegressor()
        }

In [17]:
for name, model in models.items():
    print("ENTRENANDO: ", name)
    model.fit(X_train, y_train)

ENTRENANDO:  lin
ENTRENANDO:  ridge
ENTRENANDO:  lasso
ENTRENANDO:  sgd
ENTRENANDO:  knn
ENTRENANDO:  grad
ENTRENANDO:  elas
ENTRENANDO:  svr
ENTRENANDO:  tree
ENTRENANDO:  random


In [18]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"------{name}------")
    print('RMSE - ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

------lin------
RMSE -  1219.003617811825
------ridge------
RMSE -  1218.9872996393444
------lasso------
RMSE -  1218.929337162761
------sgd------
RMSE -  5342618.683750766
------knn------
RMSE -  1927.578129993123
------grad------
RMSE -  614.0703449914851
------elas------
RMSE -  2934.5413659331334
------svr------
RMSE -  1868.7548519191039
------tree------
RMSE -  733.5484455036707
------random------
RMSE -  548.1042944986865


In [22]:
best_models={
    'grad': GradientBoostingRegressor(),
    'tree': DecisionTreeRegressor(),
    'random': RandomForestRegressor()
        }

In [23]:
for name, model in best_models.items():
    print("ENTRENANDO: ", name)
    model.fit(X, y)

ENTRENANDO:  grad
ENTRENANDO:  tree
ENTRENANDO:  random


In [24]:
for name, model in best_models.items():
    y_pred_X = model.predict(X)
    print(f"------{name}------")
    print('RMSE - ', np.sqrt(metrics.mean_squared_error(y, y_pred_X)))

------grad------
RMSE -  610.7211689205222
------tree------
RMSE -  55.99749897463047
------random------
RMSE -  209.6784396489439


### Predicción con test:

In [37]:
tree=best_models["tree"]

In [38]:
y_pred_test=tree.predict(test_)

In [39]:
test2["price"]=y_pred_test

In [42]:
submission1=test2.drop(columns=["Unnamed: 0","carat","depth","table","cut_num","color_num","clarity_num","y_pred"])

In [43]:
submission1.sample()

Unnamed: 0,id,price
39,39,923.0


In [44]:
submission1.to_csv("../data/myfirstsubmissionf.csv", index=False)

In [15]:
parameters_lin={'copy_X': [True,False],
                'fit_intercept': [True,False],
                'n_jobs': [-2,-1,1,2,None],  
                'positive': [True,False]
               }

In [16]:
parameters_ridge={'alpha': [1.0,2.0,3.0,4.0], 
                 'copy_X': [True,False], 
                 'fit_intercept': [True,False], 
                 'max_iter': [1000, 15000, None],  
                 'positive': [True,False], 
                 'random_state':[1,2,3,4,5,None] , 
                 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg'], 
                 'tol': [0.001,0.002,0.003]
                 }

In [17]:
parameters_lasso={'alpha': [0.0,1.0], 
                  'copy_X': [True,False], 
                  'fit_intercept': [True,False], 
                  'max_iter': [500,1000,1500],  
                  'positive': [True,False], 
                  'precompute': [True,False], 
                  'random_state': [True,False], 
                  'selection': ['cyclic','random'], 
                  'tol': [0.0001, 0.0002],
                  'warm_start': [True,False]
                 }

In [18]:
parameters_sgd={'alpha': [0.0001,0.001,0.01], 
                'average': [True,False], 
                'early_stopping': [True,False], 
                'epsilon': [0.1,0.2,0.3], 
                'eta0': [0.1,0.2,0.3], 
                'fit_intercept': [True,False], 
                'l1_ratio': [0,0.15,0.5,0.75,1.0], 
                'learning_rate': ['invscaling','adaptive','constant','constant'], 
                'loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
                'max_iter': [500,1000,2000], 
                'n_iter_no_change': [2,5,10], 
                'penalty': ['l2','l1', 'elasticnet'], 
                'power_t': [0.25, 0.5],  
                'shuffle': [True,False], 
                'tol': [0.001, None], 
                'validation_fraction': [0.0,0.1,0.5,0.75,1.0], 
                'verbose': [0,1,2], 
                'warm_start': [True,False]
               }

In [19]:
parameters_knn={'algorithm': ['auto','ball_tree', 'kd_tree', 'brute'], 
                'leaf_size': [20,30,40], 
                'metric': ['minkowski',"precomputed"],  
                'n_jobs': [-1,None], 
                'n_neighbors':[1,2,3,4,5,6,7,8,9], 
                'p': [1,2], 
                'weights': ['uniform','distance']
               }

In [20]:
parameters_grad={'alpha': [0.1, 0.5, 0.9], 
                 'ccp_alpha': [0.0,0.1, 0.5, 0.9], 
                 'criterion': ['friedman_mse', 'squared_error', 'mse', 'mae'],  
                 'learning_rate': [0.1, 0.5], 
                 'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'], 
                 'max_depth': [1,2,3,4,5], 
                 'max_features': ['auto', 'sqrt', 'log2',None], 
                 'max_leaf_nodes':[50,100,200, None], 
                 'min_impurity_decrease': [0.0,0.1,0.5,1.0], 
                 'min_samples_leaf': [1,2,3,4,5], 
                 'min_samples_split': [1,2,3,4,5], 
                 'min_weight_fraction_leaf': [0.0, 0.5],
                 'n_estimators': [100, 200,300, 500], 
                 'n_iter_no_change': [2,5,10,None],  
                 'subsample': [0.1,0.5,1.0], 
                 'tol': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
                 'validation_fraction': [0.0,0.1,0.5, 0.75,1.0], 
                 'verbose': [0,1,2],
                 'warm_start': [True,False]}

In [21]:
parameter_elast = {'alpha': [1.0, 2.0, 3.0, 4.0, 5.0], 
                   'copy_X': [True, False],
                   'fit_intercept': [True, False],
                   'l1_ratio': [0.1, 0.5, 1.0], 
                   'max_iter': [1000, 2000, 3000, 4000, 5000], 
                   'positive': [True, False],
                   'precompute': [True, False],
                   'selection': ['cyclic', 'random'], 
                   'tol': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
                   'warm_start': [True, False],
                  }

In [22]:
parameters_svr = {'C': [1.0, 2.0, 3.0, 4.0, 5.0], 
                  'dual': [True, False], 
                  'epsilon': [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
                  'fit_intercept': [True, False], 
                  'intercept_scaling': [1.0, 2.0, 3.0, 4.0, 5.0],
                  'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'], 
                  'max_iter': [1000, 2000, 3000, 4000, 5000], 
                  'tol': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
                  'verbose': [0, 1, 2]
                 }

In [23]:
paremeters_tree = {'ccp_alpha': [0.0, 0.1, 0.2],
                   'criterion': ["squared_error", "friedman_mse", "absolute_error", "poisson"], 
                   'max_depth': [0,1,2,3,4,5,None], 
                   'max_features': ["auto", "sqrt", "log2", None],
                   'max_leaf_nodes': [0,1,2,3,4,5,None], 
                   'min_impurity_decrease': [0.0, 0.1, 0.2],
                   'min_samples_leaf': [1,2,3,4,5], 
                   'min_samples_split': [1,2,3,4,5],
                   'min_weight_fraction_leaf': [0.0, 0.1, 0.2],
                   'splitter': ['best', 'random']
                  }

In [24]:
parameters_rand = {'bootstrap': [True, False],
                   'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                   'max_features': ['auto', 'sqrt'],
                   'min_samples_leaf': [1, 2, 4],
                    'min_samples_split': [2, 5, 10],
                    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
                  }

In [25]:
dicc_parametros={"parameters_lin":parameters_lin,"parameters_ridge":parameters_ridge,"parameters_lasso":parameters_lasso,"parameters_sgd": parameters_sgd,"parameters_knn":parameters_knn,"parameters_grad":parameters_grad,"parameter_elast":parameter_elast,"parameters_svr":parameters_svr,"paremeters_tree":paremeters_tree,"parameters_rand":parameters_rand}

In [26]:
#for name,model in models.items():
   # for k,v in dicc_parametros.items():
       # grid = GridSearchCV(models[name], dicc_parametros[k], verbose=1)
      

#### Linear Regression:

In [27]:
lin=models["lin"]

In [28]:
grid_lin = GridSearchCV(models["lin"], dicc_parametros['parameters_lin'], verbose=1)

In [29]:
grid_lin.fit(X_train,y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


GridSearchCV(estimator=LinearRegression(),
             param_grid={'copy_X': [True, False],
                         'fit_intercept': [True, False],
                         'n_jobs': [-2, -1, 1, 2, None],
                         'positive': [True, False]},
             verbose=1)

In [30]:
print(grid_lin.best_params_)

{'copy_X': True, 'fit_intercept': True, 'n_jobs': -2, 'positive': False}


#### Ridge:

In [31]:
ridge=models["ridge"]

In [32]:
grid_ridge = GridSearchCV(ridge, parameters_ridge, verbose=1)

In [33]:
grid_ridge.fit(X_train,y_train)

Fitting 5 folds for each of 8640 candidates, totalling 43200 fits


GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [1.0, 2.0, 3.0, 4.0], 'copy_X': [True, False],
                         'fit_intercept': [True, False],
                         'max_iter': [1000, 15000, None],
                         'positive': [True, False],
                         'random_state': [1, 2, 3, 4, 5, None],
                         'solver': ['auto', 'svd', 'cholesky', 'lsqr',
                                    'sparse_cg'],
                         'tol': [0.001, 0.002, 0.003]},
             verbose=1)

In [34]:
print(grid_ridge.best_params_)

{'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': 1000, 'positive': False, 'random_state': 1, 'solver': 'auto', 'tol': 0.001}


#### Lasso:

In [35]:
lasso=models["lasso"]

In [36]:
grid_lasso = GridSearchCV(lasso, parameters_lasso, verbose=1)

In [37]:
grid_lasso.fit(X_train,y_train)

Fitting 5 folds for each of 1536 candidates, totalling 7680 fits


GridSearchCV(estimator=Lasso(),
             param_grid={'alpha': [0.0, 1.0], 'copy_X': [True, False],
                         'fit_intercept': [True, False],
                         'max_iter': [500, 1000, 1500],
                         'positive': [True, False], 'precompute': [True, False],
                         'random_state': [True, False],
                         'selection': ['cyclic', 'random'],
                         'tol': [0.0001, 0.0002], 'warm_start': [True, False]},
             verbose=1)

In [38]:
print(grid_lasso.best_params_)

{'alpha': 0.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': 500, 'positive': False, 'precompute': True, 'random_state': True, 'selection': 'cyclic', 'tol': 0.0001, 'warm_start': True}


#### sgd:

In [39]:
sgd=models["sgd"]

In [40]:
grid_sgd = GridSearchCV(sgd, parameters_sgd, verbose=1)

In [None]:
grid_sgd.fit(X_train,y_train)

### Entreno los modelos con sus mejores parámetros:

In [44]:
best={'lin':LinearRegression(copy_X=True, fit_intercept=True, n_jobs= -2, positive= False),
    'ridge': Ridge(alpha= 2.0, copy_X= True, fit_intercept= True, max_iter= 1000, positive= False, random_state= 1, solver= 'auto', tol= 0.001),
    'lasso': Lasso(alpha= 0.0, copy_X= True, fit_intercept= True, max_iter= 500, positive= False, precompute= True, random_state= True, selection= 'cyclic', tol= 0.0001, warm_start= True),
    'sgd': SGDRegressor(),
    'knn': KNeighborsRegressor(),
    'grad': GradientBoostingRegressor(),
    'elas': ElasticNet(),
    'svr' :LinearSVR(),
    'tree': DecisionTreeRegressor(),
    'random': RandomForestRegressor()
        }

In [48]:
for name, model in best.items():
    print("ENTRENANDO: ", name)
    model.fit(X, y)


ENTRENANDO:  lin
ENTRENANDO:  ridge
ENTRENANDO:  lasso
ENTRENANDO:  sgd
ENTRENANDO:  knn
ENTRENANDO:  grad
ENTRENANDO:  elas
ENTRENANDO:  svr
ENTRENANDO:  tree
ENTRENANDO:  random


In [70]:
lin=LinearRegression(copy_X=True, fit_intercept=True, n_jobs= -2, positive= False)

In [71]:
lin.fit(X_train,y_train)

LinearRegression(n_jobs=-2)

In [65]:
for name, model in best.items():
    y_pred = model.predict(X_test)
    #print(f"------{name}------")
    #print('RMSE - ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [67]:
y_pred_lin=lin.predict(X_test)

In [68]:
y_pred_lin

array([3165.18579645, 3615.77471101, 4781.43065976, ..., 6868.54310125,
        -48.82786001, 2290.77013005])