# Librerías

In [1]:
# Importamos modelos de arboles
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Importamos modelos
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
import xgboost as xgb

# Importamos los meta modelos de arboles
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd

# Trees

## Cart (classification and regression trees)

In [2]:
# Read the cancer dataset from its raw GitHub URL and preview the first three rows.
cancer = pd.read_csv(
    'https://raw.githubusercontent.com/edroga/Datasets_for_projects/main/cancer.csv'
)
cancer.head(n=3)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [3]:
# Encode the diagnosis as a dummy variable, then keep only the target (D_M)
# and the two predictors used throughout this section.
# drop_first expects a plain bool; the previous list [True] only worked
# because a non-empty list is truthy.
cancer_dummies = (pd.get_dummies(cancer,
                                columns = ['diagnosis'],
                                drop_first = True,
                                prefix = ['D'])
                .loc[:,['D_M','radius_mean', 'concave points_mean']]
                )

cancer_dummies.head(3)

Unnamed: 0,D_M,radius_mean,concave points_mean
0,1,17.99,0.1471
1,1,20.57,0.07017
2,1,19.69,0.1279


In [4]:
# Split the frame into a feature matrix (every column except the label)
# and a label vector, both as NumPy arrays.
X = cancer_dummies.copy().drop(columns='D_M').values
y = cancer_dummies.copy().loc[:, 'D_M'].values

In [5]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.2,
    random_state=1,
)

In [6]:
# Build a depth-limited classification tree and fit it on the training split.
dt = DecisionTreeClassifier(random_state=1, max_depth=4)
dt.fit(X_train, y_train)

In [7]:
# Predict class labels for the held-out test rows with the fitted tree.
y_pred = dt.predict(X_test)

In [8]:
# Report accuracy, precision, recall and F1 on the test split, one per line,
# with a dashed rule between consecutive metrics.
metric_lines = [('accuracy: ', accuracy_score),
                ('precision:', precision_score),
                ('recall:   ', recall_score),
                ('f1:       ', f1_score)]
for position, (label, scorer) in enumerate(metric_lines):
    if position > 0:
        print('-'*30)
    print(label, scorer(y_test, y_pred))

accuracy:  0.9035087719298246
------------------------------
precision: 0.8444444444444444
------------------------------
recall:    0.9047619047619048
------------------------------
f1:        0.8735632183908046


In [9]:
# Collect the four test metrics followed by a label; this row feeds the
# comparison table built in the exercises.
model_cart = [scorer(y_test, y_pred)
              for scorer in (accuracy_score, precision_score, recall_score, f1_score)]
model_cart.append('Cart')
model_cart

[0.9035087719298246,
 0.8444444444444444,
 0.9047619047619048,
 0.8735632183908046,
 'Cart']

### Cart with Entropy

In [10]:
# Train a deeper tree that uses entropy as the split criterion, then predict
# on the test split.
dt_entropy = DecisionTreeClassifier(criterion='entropy', max_depth=8, random_state=1)
dt_entropy.fit(X_train, y_train)

y_pred = dt_entropy.predict(X_test)

In [11]:
# Print the four test metrics for the entropy tree, separated by dashed rules.
labels = ('accuracy: ', 'precision:', 'recall:   ', 'f1:       ')
scorers = (accuracy_score, precision_score, recall_score, f1_score)
first_line = True
for label, scorer in zip(labels, scorers):
    if not first_line:
        print('-'*30)
    print(label, scorer(y_test, y_pred))
    first_line = False

accuracy:  0.8859649122807017
------------------------------
precision: 0.8536585365853658
------------------------------
recall:    0.8333333333333334
------------------------------
f1:        0.8433734939759037


In [12]:
# Metric row for the entropy tree: four scores followed by the model label.
model_cart_entropy = [
    *(scorer(y_test, y_pred)
      for scorer in (accuracy_score, precision_score, recall_score, f1_score)),
    'Cart Entropy',
]
model_cart_entropy

[0.8859649122807017,
 0.8536585365853658,
 0.8333333333333334,
 0.8433734939759037,
 'Cart Entropy']

### Cart with Gini

In [13]:
# Train a tree that uses Gini impurity as the split criterion.
# The estimator gets its own name (dt_gini) instead of being stored in
# dt_entropy, so the entropy tree fitted in the previous section is not
# overwritten by a model with a different criterion.
dt_gini = DecisionTreeClassifier(max_depth=8,
                                 criterion='gini',
                                 random_state=1)

dt_gini.fit(X_train, y_train)

# Predict on the test split
y_pred = dt_gini.predict(X_test)

In [14]:
# Print the four test metrics for the Gini tree; `rule` is the dashed
# separator shown between them.
rule = '-'*30
print('accuracy: ', accuracy_score(y_test, y_pred))
print(rule)
print('precision:', precision_score(y_test, y_pred))
print(rule)
print('recall:   ', recall_score(y_test, y_pred))
print(rule)
print('f1:       ', f1_score(y_test, y_pred))

accuracy:  0.9210526315789473
------------------------------
precision: 0.9024390243902439
------------------------------
recall:    0.8809523809523809
------------------------------
f1:        0.8915662650602411


In [15]:
# Metric row for the Gini tree: four scores followed by the model label.
model_cart_gini = [scorer(y_test, y_pred)
                   for scorer in (accuracy_score, precision_score, recall_score, f1_score)]
model_cart_gini += ['Cart Gini']
model_cart_gini

[0.9210526315789473,
 0.9024390243902439,
 0.8809523809523809,
 0.8915662650602411,
 'Cart Gini']

### Trees for Regression

In [16]:
# Read the mpg dataset, discard rows with missing values, then keep the
# predictors and the target (mpg) in a fixed column order.
auto = (
    pd.read_csv('https://raw.githubusercontent.com/edroga/Datasets_for_projects/main/mpg.csv')
    .dropna()
    .loc[:, ['displacement', 'horsepower', 'weight', 'acceleration',
             'model_year', 'origin', 'cylinders', 'mpg']]
)
auto.head(n=3)

Unnamed: 0,displacement,horsepower,weight,acceleration,model_year,origin,cylinders,mpg
0,307.0,130.0,3504,12.0,70,usa,8,18.0
1,350.0,165.0,3693,11.5,70,usa,8,15.0
2,318.0,150.0,3436,11.0,70,usa,8,18.0


In [17]:
# One-hot encode the origin column.
# drop_first=True drops the first category to avoid multicollinearity; the
# parameter expects a plain bool (the previous list [True] only worked because
# a non-empty list is truthy).
auto = pd.get_dummies(auto,
                    columns = ['origin'],
                    drop_first = True,
                    prefix = ['o']
                    )
# Inspect the encoded frame
auto.head(3)

Unnamed: 0,displacement,horsepower,weight,acceleration,model_year,cylinders,mpg,o_japan,o_usa
0,307.0,130.0,3504,12.0,70,8,18.0,0,1
1,350.0,165.0,3693,11.5,70,8,15.0,0,1
2,318.0,150.0,3436,11.0,70,8,18.0,0,1


In [18]:
# Feature matrix (all columns except mpg) and target vector as NumPy arrays.
X = auto.copy().drop(columns='mpg').values
y = auto.copy().loc[:, 'mpg'].values

In [19]:
# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)

In [20]:
# Regression tree on the mpg training split.
# min_samples_leaf is given as a float, i.e. the fraction of training rows
# that every leaf must contain.
# NOTE(review): with 0.4 at most two leaves can exist, so max_depth=4 is
# never reached; confirm this is intended.
dt = DecisionTreeRegressor(random_state=3,
                           min_samples_leaf=0.4,
                           max_depth=4)

# Fit on the training split
dt.fit(X_train, y_train)

# Test-set predictions
y_pred = dt.predict(X_test)

In [21]:
# Root mean squared error on the test split (square root of the MSE).
MSE(y_test, y_pred) ** 0.5

5.548589104121745

In [22]:
# Keep the tree's test RMSE together with a label for the comparison table.
auto_rmse_cart = [MSE(y_test, y_pred) ** 0.5, 'Cart']
auto_rmse_cart

[5.548589104121745, 'Cart']

## Ensemble Learning

In [23]:
# Reload the cancer data (same URL as in the CART section)
cancer = pd.read_csv('https://raw.githubusercontent.com/edroga/Datasets_for_projects/main/cancer.csv')

# Encode the diagnosis and keep the target plus two predictors.
# drop_first expects a plain bool, not the list [True] used before.
cancer_dummies = (pd.get_dummies(cancer,
                                columns = ['diagnosis'],
                                drop_first = True,
                                prefix = ['D'])
                    .loc[:,['D_M','radius_mean', 'concave points_mean']]
                    )

In [24]:
# Feature matrix and label vector as NumPy arrays
X = cancer_dummies.copy().drop(columns='D_M').values
y = cancer_dummies.copy().loc[:, 'D_M'].values

# Half of the rows go to the test split (no stratification here).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=123, test_size=0.50
)

In [25]:
# The three individual classifiers that will later vote together
lr = LogisticRegression(random_state=123)
knn = KNN()
dt = DecisionTreeClassifier(random_state=123)

# Pair each classifier with a display name: a list of (name, estimator) tuples
classifiers = list(zip(
    ['Logistic Regression', 'K Nearest Neighbours', 'Classification Tree'],
    [lr, knn, dt],
))

In [26]:
# Baseline for comparison: fit each classifier on its own and report its
# test metrics under a dashed rule and its name.
for clf_name, clf in classifiers:
    # fit returns the estimator itself, so fitting and predicting are chained
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print('-'*30)
    print(clf_name)
    for label, scorer in (('accuracy: ', accuracy_score),
                          ('precision:', precision_score),
                          ('recall:   ', recall_score),
                          ('f1:       ', f1_score)):
        print(label, scorer(y_test, y_pred))

------------------------------
Logistic Regression
accuracy:  0.887719298245614
precision: 0.865979381443299
recall:    0.8155339805825242
f1:        0.8399999999999999
------------------------------
K Nearest Neighbours
accuracy:  0.9052631578947369
precision: 0.9318181818181818
recall:    0.7961165048543689
f1:        0.8586387434554973
------------------------------
Classification Tree
accuracy:  0.9017543859649123
precision: 0.8640776699029126
recall:    0.8640776699029126
f1:        0.8640776699029126


In [27]:
# Voting ensemble over the three named classifiers; no `voting` argument is
# passed, so scikit-learn's default voting scheme applies.
vc = VotingClassifier(estimators=classifiers)
# Fit on the training split and predict the test split
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)
# Report the ensemble's test metrics
print('-'*30)
print('ensambe VotingClassifier')
for label, scorer in (('accuracy: ', accuracy_score),
                      ('precision:', precision_score),
                      ('recall:   ', recall_score),
                      ('f1:       ', f1_score)):
    print(label, scorer(y_test, y_pred))

------------------------------
ensambe VotingClassifier
accuracy:  0.9157894736842105
precision: 0.9438202247191011
recall:    0.8155339805825242
f1:        0.8749999999999999


In [28]:
# Metric row for the voting ensemble: four scores followed by the model label.
model_vc = [
    *(scorer(y_test, y_pred)
      for scorer in (accuracy_score, precision_score, recall_score, f1_score)),
    'Ensambe VotingClassifier',
]
model_vc

[0.9157894736842105,
 0.9438202247191011,
 0.8155339805825242,
 0.8749999999999999,
 'Ensambe VotingClassifier']

## Bagging

In [29]:
# Reload the cancer data (same URL as in the earlier sections)
cancer = pd.read_csv('https://raw.githubusercontent.com/edroga/Datasets_for_projects/main/cancer.csv')

# Encode the diagnosis and keep the target plus two predictors.
# drop_first expects a plain bool, not the list [True] used before.
cancer_dummies = (pd.get_dummies(cancer,
                                columns = ['diagnosis'],
                                drop_first = True,
                                prefix = ['D'])
                    .loc[:,['D_M','radius_mean', 'concave points_mean']]
                    )

In [30]:
# Feature matrix and label vector as NumPy arrays
X = cancer_dummies.copy().drop(columns='D_M').values
y = cancer_dummies.copy().loc[:, 'D_M'].values

# 70/30 split, stratified on the label so both parts keep the class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.3,
    random_state=123,
)

In [31]:
# Logistic regression with default settings; used as the base model for bagging.
lr = LogisticRegression()

In [32]:
# Bagging ensemble built from the logistic regression `lr`.
# The base model is passed positionally: its keyword was renamed from
# base_estimator to estimator in newer scikit-learn releases and the old name
# was later removed, while the first positional slot works on every version.
bc = BaggingClassifier(lr,
                        n_estimators=40,   # number of base models in the ensemble
                        n_jobs=-1,         # fit the base models in parallel on all CPU cores
                        random_state=123)  # fixed seed so the resampling is reproducible

# Fit the ensemble on the training split
bc.fit(X_train, y_train)

In [33]:
# Predict the test split with the bagging ensemble
y_pred = bc.predict(X_test)

# Report accuracy, precision, recall and F1, separated by dashed rules
metric_lines = [('accuracy: ', accuracy_score),
                ('precision:', precision_score),
                ('recall:   ', recall_score),
                ('f1:       ', f1_score)]
for position, (label, scorer) in enumerate(metric_lines):
    if position > 0:
        print('-'*30)
    print(label, scorer(y_test, y_pred))

accuracy:  0.8947368421052632
------------------------------
precision: 0.8709677419354839
------------------------------
recall:    0.84375
------------------------------
f1:        0.8571428571428571


In [34]:
# Metric row for the bagging ensemble: four scores followed by the model label.
model_bc = [scorer(y_test, y_pred)
            for scorer in (accuracy_score, precision_score, recall_score, f1_score)]
model_bc.append('Bagging Classifier')
model_bc

[0.8947368421052632,
 0.8709677419354839,
 0.84375,
 0.8571428571428571,
 'Bagging Classifier']

## Random Forests

In [35]:
# Load the mpg data, drop rows with missing values and keep the modelling columns
auto = (pd.read_csv('https://raw.githubusercontent.com/edroga/Datasets_for_projects/main/mpg.csv')
        .dropna()
        .loc[:,['displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'cylinders', 'mpg']]
        )

# One-hot encode origin. drop_first expects a plain bool, not the list [True]
# used before.
auto = pd.get_dummies(auto,
                    columns = ['origin'],
                    drop_first = True,
                    prefix = ['o']
                    )

# X is kept as a DataFrame here (no .values), which is what lets the fitted
# forest expose the column names through feature_names_in_ further down;
# y is a NumPy array.
X = auto.copy().drop(columns=['mpg'])
y = auto.copy()['mpg'].values

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    random_state=1)

In [36]:
# Random forest of 25 trees with a fixed seed
rf = RandomForestRegressor(random_state=2, n_estimators=25)

# Fit on the training split
rf.fit(X_train, y_train)

In [37]:
# Predict the test split and compute the RMSE (square root of the MSE)
y_pred = rf.predict(X_test)
rmse_test = MSE(y_test, y_pred) ** 0.5

# Show the RMSE rounded to two decimals
print('RMSE: {:.2f}'.format(rmse_test))

RMSE: 3.16


In [38]:
# Keep the forest's test RMSE together with a label for the comparison table.
auto_rmse_rf = [rmse_test, 'Random forest Regressor']
auto_rmse_rf

[3.156960435272102, 'Random forest Regressor']

In [39]:
# Impurity-based importance of each feature in the fitted forest (the values
# sum to 1). It reflects each feature's contribution to impurity reduction,
# not simply how often the feature appears in the trees.
rf.feature_importances_

array([0.44415779, 0.15844496, 0.14097122, 0.02980621, 0.10038401,
       0.12087081, 0.00268571, 0.00267929])

In [40]:
# Names of the feature columns seen during fit, in the same order as the
# values in feature_importances_.
rf.feature_names_in_


array(['displacement', 'horsepower', 'weight', 'acceleration',
       'model_year', 'cylinders', 'o_japan', 'o_usa'], dtype=object)

In [41]:
# Pair every feature name with its importance and list them from most to
# least important.
(pd.DataFrame({'importnacias': rf.feature_importances_,
               'features': rf.feature_names_in_})
    .sort_values('importnacias', ascending=False)
)

Unnamed: 0,importnacias,features
0,0.444158,displacement
1,0.158445,horsepower
2,0.140971,weight
5,0.120871,cylinders
4,0.100384,model_year
3,0.029806,acceleration
6,0.002686,o_japan
7,0.002679,o_usa


## Boosting (modelos secuenciales)

In [42]:
# Reload the cancer data (same URL as in the earlier sections)
cancer = pd.read_csv('https://raw.githubusercontent.com/edroga/Datasets_for_projects/main/cancer.csv')

# Encode the diagnosis and keep the target plus two predictors.
# drop_first expects a plain bool, not the list [True] used before.
cancer_dummies = (pd.get_dummies(cancer,
                                columns = ['diagnosis'],
                                drop_first = True,
                                prefix = ['D'])
                    .loc[:,['D_M','radius_mean', 'concave points_mean']]
                    )

In [43]:
# Feature matrix and label vector as NumPy arrays
X = cancer_dummies.copy().drop(columns='D_M').values
y = cancer_dummies.copy().loc[:, 'D_M'].values

# 70/30 split, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.3,
    random_state=123,
)

In [44]:
# Shallow classification tree used as the weak learner
dt = DecisionTreeClassifier(max_depth=2,
                            random_state=123)
# AdaBoost over 100 copies of that tree.
# The weak learner is passed positionally: its keyword was renamed from
# base_estimator to estimator in newer scikit-learn releases and the old name
# was later removed, while the first positional slot works on every version.
# random_state fixes the boosting seed so the reported metrics are reproducible.
adb_clf = AdaBoostClassifier(dt,
                                n_estimators=100,
                                random_state=123)

In [45]:
# Fit the boosted ensemble and predict the test split in one chained call
# (fit returns the fitted estimator).
y_pred = adb_clf.fit(X_train, y_train).predict(X_test)

In [46]:
# Print the four test metrics for AdaBoost; `rule` is the dashed separator
# shown between them.
rule = '-'*30
print('accuracy: ', accuracy_score(y_test, y_pred))
print(rule)
print('precision:', precision_score(y_test, y_pred))
print(rule)
print('recall:   ', recall_score(y_test, y_pred))
print(rule)
print('f1:       ', f1_score(y_test, y_pred))

accuracy:  0.9122807017543859
------------------------------
precision: 0.855072463768116
------------------------------
recall:    0.921875
------------------------------
f1:        0.887218045112782


In [47]:
# Metric row for AdaBoost: four scores followed by the model label.
model_adb = [
    *(scorer(y_test, y_pred)
      for scorer in (accuracy_score, precision_score, recall_score, f1_score)),
    'Ada boost',
]
model_adb

[0.9122807017543859,
 0.855072463768116,
 0.921875,
 0.887218045112782,
 'Ada boost']

## Gradient Boosting

In [48]:
# Load the mpg data, drop rows with missing values and keep the modelling columns

auto = (pd.read_csv('https://raw.githubusercontent.com/edroga/Datasets_for_projects/main/mpg.csv')
        .dropna()
        .loc[:,['displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'cylinders', 'mpg']]
        )

# One-hot encode origin. drop_first expects a plain bool, not the list [True]
# used before.
auto = pd.get_dummies(auto,
                columns = ['origin'],
                drop_first = True,
                prefix = ['o']
                )

# Feature matrix (all columns except mpg) and target vector as NumPy arrays
X = auto.copy().drop(columns=['mpg']).values
y = auto.copy()['mpg'].values

In [49]:
# Hold out 30% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=123, test_size=0.3
)

In [50]:
# Gradient boosting: 300 depth-2 trees with a small learning rate
gbt = GradientBoostingRegressor(learning_rate=0.01,
                                n_estimators=300,
                                max_depth=2,
                                random_state=123)

# Fit on the training split, then predict the test split
gbt.fit(X_train, y_train)
y_pred = gbt.predict(X_test)

In [51]:
# Test RMSE of the gradient boosting model (square root of the MSE)
rmse_test = MSE(y_test, y_pred) ** 0.5
rmse_test

3.1230874535631474

In [52]:
# Keep the gradient boosting test RMSE together with a label for the comparison table.
auto_rmse_gb = [rmse_test, 'Gradient Boosting Regressor']
auto_rmse_gb

[3.1230874535631474, 'Gradient Boosting Regressor']

## Stochastic Gradient Boosting

In [53]:
# Load the mpg data, drop rows with missing values and keep the modelling columns
auto = (pd.read_csv('https://raw.githubusercontent.com/edroga/Datasets_for_projects/main/mpg.csv')
        .dropna()
        .loc[:,['displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'cylinders', 'mpg']]
        )

# One-hot encode origin. drop_first expects a plain bool, not the list [True]
# used before.
auto = pd.get_dummies(auto,
                    columns = ['origin'],
                    drop_first = True,
                    prefix = ['o']
                    )

# Feature matrix (all columns except mpg) and target vector as NumPy arrays
X = auto.copy().drop(columns=['mpg']).values
y = auto.copy()['mpg'].values

In [54]:
# Hold out 30% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=123, test_size=0.3
)

In [55]:
# Stochastic gradient boosting: each tree is fitted on a row subsample
# (subsample=0.8) and considers a fraction of the features at each split
# (max_features=0.2).
sgbt = GradientBoostingRegressor(n_estimators=300,
                                 learning_rate=0.01,
                                 max_depth=3,
                                 subsample=0.8,
                                 max_features=0.2,
                                 random_state=123)

# Fit on the training split
sgbt.fit(X_train, y_train)

In [56]:
# Predict the test split with the stochastic gradient boosting model.
y_pred = sgbt.predict(X_test)

In [57]:
# Test RMSE of the stochastic gradient boosting model (square root of the MSE)
rmse_test = MSE(y_test, y_pred) ** 0.5
rmse_test

3.212418146542329

In [58]:
# Keep the stochastic gradient boosting test RMSE together with a label for the comparison table.
auto_rmse_sgbt = [rmse_test, 'Stochastic Gradient Boosting Regressor']
auto_rmse_sgbt

[3.212418146542329, 'Stochastic Gradient Boosting Regressor']

## XGBoost

In [59]:
# Reload the cancer data (same URL as in the earlier sections)
cancer = pd.read_csv('https://raw.githubusercontent.com/edroga/Datasets_for_projects/main/cancer.csv')

# Encode the diagnosis and keep the target plus two predictors.
# drop_first expects a plain bool, not the list [True] used before.
cancer_dummies = (pd.get_dummies(cancer,
                                columns = ['diagnosis'],
                                drop_first = True,
                                prefix = ['D'])
                    .loc[:,['D_M','radius_mean', 'concave points_mean']]
                    )

# Feature matrix and label vector as NumPy arrays
X = cancer_dummies.copy().drop(columns=['D_M']).values
y = cancer_dummies.copy()['D_M'].values

# Create the training and test sets (80/20, not stratified)
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=123)

In [60]:
# Instantiate the XGBClassifier: xg_cl
# The seed is passed as random_state, the name used by XGBoost's
# scikit-learn wrapper; `seed` is the native-API spelling.
xg_cl = xgb.XGBClassifier(objective='binary:logistic',
                        n_estimators=100,
                        random_state=123,
                        learning_rate = 0.01)

# Fit the classifier to the training set
xg_cl.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = xg_cl.predict(X_test)

In [61]:
# Print accuracy, precision, recall and F1 (in that order), one value per line.
for scorer in (accuracy_score, precision_score, recall_score, f1_score):
    print(scorer(y_test, y_pred))

0.8859649122807017
0.8888888888888888
0.7804878048780488
0.8311688311688312


In [62]:
# Metric row for XGBoost: four scores followed by the model label.
model_xg = [scorer(y_test, y_pred)
            for scorer in (accuracy_score, precision_score, recall_score, f1_score)]
model_xg.append('XGBoost')
model_xg

[0.8859649122807017,
 0.8888888888888888,
 0.7804878048780488,
 0.8311688311688312,
 'XGBoost']

# Ejercicios

## Ejercicio 01

Considera los datos de `cancer.csv` y el tratamiento que se les dio en los ejercicios anteriores, ajusta cada uno de los modelos vistos y crea un dataframe que permita comparar las métricas obtenidas.

In [63]:
# Gather the metric rows saved after each classifier section.
# NOTE(review): the rows were computed on different train/test splits
# (test sizes 0.2, 0.5 and 0.3, different seeds, some unstratified), so the
# scores are not strictly comparable; confirm whether a shared split is wanted.
models = [model_cart, model_cart_entropy, model_cart_gini,
            model_vc, model_bc, model_adb, model_xg]

In [64]:
# Answer: one row per model, with the four metrics and the model label as columns.
pd.DataFrame(models, columns=['Accuracy', 'Precision', 'Recall', 'f1 score', 'Model'])

Unnamed: 0,Accuracy,Precision,Recall,f1 score,Model
0,0.903509,0.844444,0.904762,0.873563,Cart
1,0.885965,0.853659,0.833333,0.843373,Cart Entropy
2,0.921053,0.902439,0.880952,0.891566,Cart Gini
3,0.915789,0.94382,0.815534,0.875,Ensambe VotingClassifier
4,0.894737,0.870968,0.84375,0.857143,Bagging Classifier
5,0.912281,0.855072,0.921875,0.887218,Ada boost
6,0.885965,0.888889,0.780488,0.831169,XGBoost


## Ejercicio 02

Considera los datos de `mpg.csv` (cargados en el dataframe `auto`) y el tratamiento que se les dio en los ejercicios anteriores, ajusta cada uno de los modelos vistos y crea un dataframe que permita comparar las métricas obtenidas.

In [65]:
# Gather the [RMSE, label] rows saved after each regression section.
auto_models = [auto_rmse_cart, auto_rmse_rf, auto_rmse_gb, auto_rmse_sgbt]

In [66]:
# Answer: test RMSE of every regressor when all features are used.
model_full_features = pd.DataFrame(auto_models, columns=['RMSE_full_features', 'Name_model'])
model_full_features

Unnamed: 0,RMSE_full_features,Name_model
0,5.548589,Cart
1,3.15696,Random forest Regressor
2,3.123087,Gradient Boosting Regressor
3,3.212418,Stochastic Gradient Boosting Regressor


In [67]:
def auto_(column):
    """Refit the four mpg regressors with one feature column left out.

    Downloads mpg.csv, drops rows with missing values, one-hot encodes
    ``origin`` (first category dropped) and removes ``mpg`` plus ``column``
    from the feature matrix. Each model is then trained and scored with the
    same hyperparameters and split settings hard-coded below.

    Parameters
    ----------
    column : str
        Name of the feature to exclude, as it appears after encoding
        (for example ``'displacement'`` or ``'o_usa'``).

    Returns
    -------
    list of tuple
        ``(test RMSE, model name)`` pairs, in the order Cart, Random forest,
        Gradient Boosting, Stochastic Gradient Boosting.
    """
    # One entry per model: (name, test fraction, split seed, estimator factory).
    # An explicit table replaces substring matching on the model name, so every
    # model is guaranteed to be fitted by exactly one estimator.
    model_specs = [
        ('Cart', 0.2, 1,
         lambda: DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.4, random_state=3)),
        ('Random forest Regressor', 0.2, 1,
         lambda: RandomForestRegressor(n_estimators=25, random_state=2)),
        ('Gradient Boosting Regressor', 0.3, 123,
         lambda: GradientBoostingRegressor(n_estimators=300, max_depth=2,
                                           learning_rate=0.01, random_state=123)),
        ('Stochastic Gradient Boosting Regressor', 0.3, 123,
         lambda: GradientBoostingRegressor(max_depth=3, subsample=0.8, max_features=0.2,
                                           n_estimators=300, random_state=123,
                                           learning_rate=0.01)),
    ]

    auto = (pd.read_csv('https://raw.githubusercontent.com/edroga/Datasets_for_projects/main/mpg.csv')
            .dropna()
            .loc[:,['displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'cylinders', 'mpg']]
            )

    # One-hot encode origin; drop_first expects a plain bool.
    auto = pd.get_dummies(auto,
                        columns = ['origin'],
                        drop_first = True,
                        prefix = ['o']
                        )

    # Feature matrix without the target and without the excluded column.
    # drop() already returns a new frame, so no defensive copy is needed.
    X = auto.drop(columns=['mpg', column]).values
    y = auto['mpg'].values

    auto_models_ = []
    for name, size, seed, make_model in model_specs:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=seed)

        model = make_model()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # RMSE is the square root of the mean squared error
        rmse_test = MSE(y_test, y_pred)**(1/2)
        auto_models_.append((rmse_test, name))

    return auto_models_

In [68]:
# RMSE of every regressor when `displacement` is left out of the features.
model_without_dis = pd.DataFrame(auto_('displacement'), columns= ['RMSE_without_displacement', 'Name_model'])
model_without_dis

Unnamed: 0,RMSE_without_displacement,Name_model
0,5.382259,Cart
1,3.238435,Random forest Regressor
2,2.950702,Gradient Boosting Regressor
3,3.185957,Stochastic Gradient Boosting Regressor


In [69]:
# RMSE of every regressor when the `o_usa` dummy is left out of the features.
model_without_ousa = pd.DataFrame(auto_('o_usa'), columns= ['RMSE_without_o_usa', 'Name_model'])
model_without_ousa

Unnamed: 0,RMSE_without_o_usa,Name_model
0,5.548589,Cart
1,3.230759,Random forest Regressor
2,3.124883,Gradient Boosting Regressor
3,3.143765,Stochastic Gradient Boosting Regressor


In [70]:
# Side-by-side RMSE comparison: the full-feature column, the no-displacement
# column, and the no-o_usa frame (which also supplies the model names).
pd.concat(
    [model_full_features['RMSE_full_features'],
     model_without_dis['RMSE_without_displacement'],
     model_without_ousa],
    axis='columns',
)

Unnamed: 0,RMSE_full_features,RMSE_without_displacement,RMSE_without_o_usa,Name_model
0,5.548589,5.382259,5.548589,Cart
1,3.15696,3.238435,3.230759,Random forest Regressor
2,3.123087,2.950702,3.124883,Gradient Boosting Regressor
3,3.212418,3.185957,3.143765,Stochastic Gradient Boosting Regressor
