In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,RidgeCV,SGDRegressor,ElasticNet,Lars,Lasso,OrthogonalMatchingPursuit
from sklearn.ensemble import ExtraTreesRegressor,GradientBoostingRegressor,RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn import metrics
import sklearn.metrics

## Exploremos que modelo(s) utilizar
**Vamos a proceder a comparar el resultado de algunos modelos lineales y no lineales.**

In [2]:
dataset_train = pd.read_csv('../1.Limpieza_dataraw_inicial/Data_output/train_clean.csv', index_col=0)
dataset_train

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,1.12,5,3,3,61.6,59.0,5363
1,1.14,3,6,2,60.0,54.0,5593
2,0.90,3,7,2,60.3,63.0,3534
3,0.71,4,6,5,61.9,54.0,3212
4,0.34,3,5,2,60.0,62.0,447
...,...,...,...,...,...,...,...
40450,0.90,2,7,3,63.8,58.0,4120
40451,0.35,4,7,7,62.0,54.0,1266
40452,0.90,2,4,2,58.4,55.0,3269
40453,1.21,2,3,3,63.6,56.0,5893


Dividimos el dataset en dos bloques,de  entrenamiento(las columnas que no son price) y validación (la columna price) del modelo.(Train_test_split)

In [5]:
X_train, X_test, y_train, y_test  = train_test_split(dataset_train.drop(columns="price"),dataset_train.price, test_size=0.2)
train_split=[X_train, X_test, y_train, y_test ]

## Comencemos por probar modelos lineales


In [6]:
lineral_models = {
          "LinearRegression":LinearRegression(),
          "Ridge":Ridge(),
          "SGDRegressor":SGDRegressor(),
          "ElasticNet":ElasticNet(),
          "Lars":Lars(),
          "Lasso":Lasso(),
          "OrthogonalMatchingPursuit":OrthogonalMatchingPursuit()
         }

Entrenamos el modelo

In [7]:
for name, model in lineral_models.items():
    print(f"Training {name}")
    model.fit(X_train,y_train)

Training LinearRegression
Training Ridge
Training SGDRegressor
Training ElasticNet
Training Lars
Training Lasso
Training OrthogonalMatchingPursuit


Analizamos el R-cuadrado y Error cuadrático.

- R2: Va de 0 a 1, con 0 indicando que el modelo propuesto no mejora la predicción sobre el modelo medio y 1 indica una predicción perfecta 

- RMSE:Los valores más bajos de RMSE indican un mejor ajuste.

In [8]:
for name, model in lineral_models.items():
    y_pred = model.predict(X_test)
    print(f"-------{name}-------")
    print("r2 score",round(r2_score(y_test, y_pred),4))
    print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),4)

-------LinearRegression-------
r2 score 0.9038
RMSE: 1223.0 4
-------Ridge-------
r2 score 0.9038
RMSE: 1223.0 4
-------SGDRegressor-------
r2 score -412366690.5058
RMSE: 80099752.0 4
-------ElasticNet-------
r2 score 0.4421
RMSE: 2946.0 4
-------Lars-------
r2 score 0.9038
RMSE: 1223.0 4
-------Lasso-------
r2 score 0.9038
RMSE: 1223.0 4
-------OrthogonalMatchingPursuit-------
r2 score 0.8437
RMSE: 1560.0 4


## Probemos ahora con los modelos no lineales

In [9]:
models_non_linear={
    "ExtraTreesRegressor":ExtraTreesRegressor(),
    "GradientBoostingRegressor":GradientBoostingRegressor(),
    "RandomForestRegressor":RandomForestRegressor()
    }

In [10]:
for name, model in models_non_linear.items():
    print(f"Training {name}")
    model.fit(X_train,y_train)

Training ExtraTreesRegressor
Training GradientBoostingRegressor
Training RandomForestRegressor


In [11]:
for name, model in models_non_linear.items():
    y_pred = model.predict(X_test)
    print(f"-------{name}-------")
    print("r2 score",round(r2_score(y_test, y_pred),4))
    print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),4)

-------ExtraTreesRegressor-------
r2 score 0.9788
RMSE: 575.0 4
-------GradientBoostingRegressor-------
r2 score 0.9748
RMSE: 626.0 4
-------RandomForestRegressor-------
r2 score 0.98
RMSE: 558.0 4


Como podemos observar el error cuadratico medio entre nuestro grounthtruth (precio)y la prediccion es muchsimo menor para los modelos no lineales seleccionados.
El R2, tambien aporta metricas ganadoras más proximas al 1.

## Grid Search y Cross Validation

**Escogemos múltiples combinaciones de ajustes de parámetros, y cada uno de ellos los entrenamos de forma cruzada para determinar cuál da el mejor rendimiento.**

**En nuestro caso queremos encontrar los mejores parametros para ExtraTreesRegressor,GradientBoostingRegressor, RandomForestRegressor**

Escogemos los parametros. Cada modelo acepta unos parametros diferentes. Por ej bootstrap no esta en GradientBoostingRegressor, por lo que selecionaremos otros parametros para este modelo.

In [12]:
#explicacion parametros: https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
parameters = { 
                'bootstrap': [True, False],  #explicacion bootstrap:https://stackoverflow.com/questions/54529928/does-bootstrap-features-parameter-is-useful-if-i-use-max-features-parameter-in-r
                'n_estimators': [100,200, 300],
                'max_features': ['auto', 'sqrt', 'log2'],
                'max_depth' : [20,40,60]
                }

parameters_GBR={ 
                'n_estimators': [100,200, 300],
                'max_features': ['auto', 'sqrt', 'log2'],
                'max_depth' : [20,40,60]
                }
#rfc=GradientBoostingRegressor()
#sorted(rfc.get_params().keys())-----Esto es para saber los parametros disponibles por modelo

Hacemos el Cross Validation para los tres modelos

In [13]:
#GS para RandomForestRegressor
rfc = RandomForestRegressor()
grid_RandomForestRegressor = GridSearchCV(estimator=rfc,param_grid=parameters,verbose=1,n_jobs=-1,cv=5)
grid_RandomForestRegressor.fit(X_train,y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 19.4min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True, False], 'max_depth': [20, 40, 60],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [100, 200, 300]},
             verbose=1)

In [14]:
#GS para GradientBoostingRegressor
rfc = GradientBoostingRegressor()
grid_GradientBoostingRegressor = GridSearchCV(estimator=rfc,param_grid=parameters_GBR,verbose=1,n_jobs=-1,cv=5)
grid_GradientBoostingRegressor.fit(X_train,y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed: 12.5min finished


GridSearchCV(cv=5, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'max_depth': [20, 40, 60],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [100, 200, 300]},
             verbose=1)

In [None]:
#GS para RandomForestRegressor
rfc = ExtraTreesRegressor()
grid_ExtraTreesRegressor = GridSearchCV(estimator=rfc,param_grid=parameters,verbose=1,n_jobs=-1,cv=5)
grid_ExtraTreesRegressor.fit(X_train,y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


¿Que parametros han resultado mejores para cada modelo?

In [None]:
print('El mejor parámetro para RandomForestRegressor es:',grid_RandomForestRegressor.best_params_)
print('El mejor parámetro para GradientBoostingRegressor es:',grid_GradientBoostingRegressor.best_params_)
print('El mejor parámetro para ExtraTreesRegressor es:',grid_ExtraTreesRegressor.best_params_)

Volvemos a entrenar los modelos pero ahora con los parametros ganadores

In [None]:
#Entrenando con el mejor parámetro RandomForestRegressor

model_RandomForestRegressor = RandomForestRegressor(bootstrap=True, max_depth=60, max_features= 'auto', n_estimators= 200)
model_RandomForestRegressor.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(f"-------'RandomForestRegressor'-------")
print("r2 score",round(r2_score(y_test, y_pred),4))
print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),4)

In [None]:
#Entrenando con el mejor parámetro GradientBoostingRegressor
model_GradientBoostingRegressor = GradientBoostingRegressor(max_depth= 60, max_features= 'log2', n_estimators= 300)
model_GradientBoostingRegressor.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(f"-------'GradientBoostingRegressor'-------")
print("r2 score",round(r2_score(y_test, y_pred),4))
print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),4)

In [None]:
#Entrenando con el mejor parámetro ExtraTreesRegressor
model_ExtraTreesRegressor = ExtraTreesRegressor(bootstrap= True, max_depth= 20, max_features= 'auto', n_estimators= 300)
model_ExtraTreesRegressor.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(f"-------'ExtraTreesRegressor'-------")
print("r2 score",round(r2_score(y_test, y_pred),4))
print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),4)


¿Ha habido mejora con el Grid Search y Cross Validation?

In [None]:
Data={
        'r2_score_antesGV':[0.9796,0.9758, 0.9813],
        'r2_score_despuesGv':[0.9813,0.9813,0.9813],
        'RMSE_score_antesGV':[575.04,626.04,551.04],
        'RMSE_score_despuesGv':[551.04,551.04,551.04],
        
    }
comparacion=pd.DataFrame(Data,index=['ExtraTreesRegressor','GradientBoostingRegressor','RandomForestRegressor']).T
comparacion

Casi impredecible,en el **r2** mejora de los dos primeros ,y  el tercero nada no varia.
En **RMSE** , mejora en el primero , empeora en el segundo y tampoco varía aqui el tercero.


# Elegiremos entonces ExtraTreesRegresor con GV y RandomForestRegressor para la primera predicción

In [3]:
dataset_predict= pd.read_csv('../1.Limpieza_dataraw_inicial/Data_output/predict_clean.csv', index_col=0)


## Hacemos la prediccion para ambos modelos

Cogemos los datos paralas variables predictoras(que es el dataset de predict) y la variable objetivo (que es la columna precio del dataset de training)

In [None]:
#AQUIIII ES DONDE TENGO LA DUDA, X E Y DEBEN DE TENER LAS MISMAS ROWS Y SI TE FIJAS LOS DOS DATASETS VARIAN, EN EL NUMERO DE ROWS¡¡¡TENGO QUE PREGUNTAR!

In [4]:
predictors =dataset_predict

targets = dataset_train['price'][0:13485] #cogemos ese numero de filas porque 

Creamos la muestra de entrenamiento y de test, tanto para predictores como para la variable objetivo, siendo test el 0.20

In [5]:
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=0.2)

Inicializamos el algoritmo Random Forest con los parametros ganadores

In [6]:
classifier_1=RandomForestRegressor(bootstrap=True, max_depth=60, max_features= 'auto', n_estimators= 200)

Construimos el modelo sobre los datos de entrenamiento

In [10]:
classifier_1=classifier_1.fit(pred_train,tar_train)

Predecimos para los valores del grupo Test

In [11]:
predictions_1=classifier_1.predict(pred_test)

Hacemos ahora la prediccion para ExtraTreesRegressor

In [12]:
classifier_2=ExtraTreesRegressor(bootstrap= True, max_depth= 20, max_features= 'auto', n_estimators= 300)
classifier_2=classifier_2.fit(pred_train,tar_train)
predictions_2=classifier_2.predict(pred_test)

In [42]:

import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from sklearn.tree import DecisionTreeClassifier
import sklearn.metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics
from sklearn import datasets

print(classifier_2.feature_importances_)

[0.06718794 0.11953816 0.13485028 0.4673668  0.21105683]


Pasamos ambas predicciones a datframe con el formato que debe tener la submission y lo guardamos

In [13]:
submission_1 = pd.DataFrame(predictions_1).reset_index()
submission_1 = submission_1.rename(columns={"index": "id", 0: "price"})
submission_1

Unnamed: 0,id,price
0,0,2227.084000
1,1,3244.482000
2,2,2876.694083
3,3,2108.886333
4,4,4399.575000
...,...,...
2692,2692,4238.924357
2693,2693,2403.442167
2694,2694,5387.152500
2695,2695,3323.383446


In [29]:
submission_2 = pd.DataFrame(predictions_2).reset_index()
submission_2 = submission_2.rename(columns={"index": "id", 0: "price"})
submission_2.iloc[0:5,:]

Unnamed: 0,id,price
0,0,3086.389638
1,1,2651.178552
2,2,3034.642847
3,3,2337.83379
4,4,5443.951888


Guardamos ambas summission

In [15]:
submission_1.to_csv('../1.Limpieza_dataraw_inicial/Data_output/predict_Random_forest.csv', header= True, index=False)
submission_2.to_csv('../1.Limpieza_dataraw_inicial/Data_output/predict_Extra_trees.csv', header= True, index=False)

In [None]:
from fitter import Fitter, get_common_distributions

distribuciones = ['cauchy', 'chi2', 'expon',  'exponpow', 'gamma',
                  'norm', 'powerlaw', 'beta', 'logistic']

fitter = Fitter(datos.precio, distributions=distribuciones, verbose=False)
fitter.fit()
fitter.summary(Nbest=10, plot=False)