In [12]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [4]:
seed=42
np.random.seed(seed)

In [3]:
X,y=load_iris(return_X_y=True)

In [58]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42,shuffle=True,stratify=y)

In [7]:
from sklearn.preprocessing import MinMaxScaler

scaler=MinMaxScaler(feature_range=(0,1))

scaler.fit(x_train) #obtenemos los datos de escalamiento en los conjuntos de entrenamiento
x_train=scaler.transform(x_train)
x_test=scaler.transform(x_test)

In [21]:
#obtener un metamodelo (principal)
model_tree=DecisionTreeClassifier()
#numero de estimadores
n_estimators=600
#comenzamos con el metodo de embolsado-> bagging classifier
bagging=BaggingClassifier(base_estimator=model_tree,
                          n_estimators=n_estimators,
                          random_state=seed
                         )

In [22]:
bagging.fit(x_train,y_train);

In [23]:
bagging.score(x_train,y_train)

1.0

In [24]:
bagging.score(x_test,y_test)

0.9473684210526315

In [25]:
#validacion cruzada sobre los datos
from sklearn.model_selection import cross_validate

In [48]:
result=cross_validate(bagging,x_train,y_train,scoring=["accuracy"],return_train_score=True,cv=10)

In [49]:
result.keys()

dict_keys(['fit_time', 'score_time', 'test_accuracy', 'train_accuracy'])

In [50]:
result["test_accuracy"].mean(),result["test_accuracy"].std()

(0.946969696969697, 0.07017294652672369)

In [51]:
result["train_accuracy"].mean(),result["train_accuracy"].std()

(1.0, 0.0)

In [52]:
from sklearn.metrics import classification_report

In [53]:
y_pred=bagging.predict(x_test)
report=classification_report(y_test,y_pred)

In [54]:
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.92      0.92      0.92        13
           2       0.92      0.92      0.92        13

    accuracy                           0.95        38
   macro avg       0.95      0.95      0.95        38
weighted avg       0.95      0.95      0.95        38



In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

forest=RandomForestClassifier()
param_grid={
    "max_depth":range(1,21),
    "n_estimators":100*np.arange(1,11)
}
cv=StratifiedKFold(n_splits=10,random_state=seed,shuffle=True)
grid=GridSearchCV(forest,
                  param_grid=param_grid,
                  cv=cv,
                  scoring="accuracy",n_jobs=-1)

In [62]:
grid.fit(x_train,y_train);

In [63]:
grid.best_score_

0.9809090909090908

In [64]:
model=grid.best_estimator_

In [65]:
y_pred=model.predict(x_test)
report=classification_report(y_test,y_pred)

In [66]:
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      0.93      0.97        15
           2       0.94      1.00      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [79]:
parameters=grid.best_params_

In [87]:
#entonces el mejor modelo es model
import joblib

joblib.dump(model,"classifier_iris.pkl")

['classifier_iris.pkl']

In [83]:
#vamos a crear una canalizacion de los datos para automatizar el preprocesamiento
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [99]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=seed,shuffle=True,stratify=y)

In [147]:
#la canalizacion usa las caracteristicas numericas y categoricas
#todos nuestros datos tienen caracteritisticas categoricas

numerical_features=slice()
numerical_transformer=Pipeline([
    ("inpute",SimpleImputer(strategy="mean")),
    ("scaler",MinMaxScaler(feature_range=(0,1)))
    ])

transformer=ColumnTransformer(
    [("numerical",numerical_transformer,numerical_features)],  
    remainder="drop"
    )


In [148]:
#ahora combinamos las canalizacion con el modelo
pipeline_model=Pipeline([("transformer",transformer),
                         ("modelo_forest",RandomForestClassifier(**parameters))
                        ])

In [149]:
pipeline_model.fit(x_train,y_train);

In [150]:
pipeline_model.predict_proba([[0.1,0.2,0.3,0.4]])

array([[0.88940805, 0.10813007, 0.00246188]])

In [151]:
y_test=pipeline_model.predict(x_test)
report=classification_report(y_test,y_pred)

print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        16

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [152]:
#mostramos la representacion HTML de la canalizacion
from sklearn import set_config

set_config(display="diagram")

In [153]:
pipeline_model

In [154]:
import pandas as pd
features_content=pd.DataFrame([[0.1,0.2,0.3,0.4],
                               [2.3,5.6,7.8,0.9],
                               [0.4,5.6,None,9.2],
                               [0.1,0.2,0.3,0.4]])
features_content.head()

Unnamed: 0,0,1,2,3
0,0.1,0.2,0.3,0.4
1,2.3,5.6,7.8,0.9
2,0.4,5.6,,9.2
3,0.1,0.2,0.3,0.4


In [155]:
pipeline_model.predict(features_content)

array([0, 1, 2, 0])

In [156]:
features_content["predict label"]=pipeline_model.predict(features_content)

In [157]:
features_content.head()

Unnamed: 0,0,1,2,3,predict label
0,0.1,0.2,0.3,0.4,0
1,2.3,5.6,7.8,0.9,1
2,0.4,5.6,,9.2,2
3,0.1,0.2,0.3,0.4,0


In [181]:
pipeline_model.predict(features_content.iloc[:,:4])

array([0, 1, 2, 0])

In [182]:
joblib.dump(pipeline_model,"pipeline_model_classification_iris.pkl")

['pipeline_model_classification_iris.pkl']

In [183]:
canal=joblib.load("pipeline_model_classification_iris.pkl")

In [184]:
canal.predict(features_content.iloc[:,:4])

array([0, 1, 2, 0])

## Modelo de Voting Classifier
* Se esxperimentara con un tipo de modelo de conjunto cuya salida sera igual a la prediccion de mayor proporcion entre los distintos modelos entrenados. Las predicciones se votan y la clase que gane es la salida.

In [204]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

list_models=[("RandomForest",RandomForestClassifier(**parameters)),
             ("SVC",SVC(probability=True)),
             ("Logistic",LogisticRegression(max_iter=200)),
             ("Kneighbors",KNeighborsClassifier())
            ]

In [205]:
from sklearn.ensemble import VotingClassifier

voting=VotingClassifier(estimators=list_models,voting="soft")

In [214]:
voting.fit(transformer.fit_transform(x_train),y_train);

In [215]:
voting.score(transformer.transform(x_train),y_train)

0.9714285714285714

In [216]:
voting.score(transformer.transform(x_test),y_test)

0.9111111111111111

In [217]:
y_pred=voting.predict(transformer.transform(x_test))

In [218]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.78      1.00      0.88        14
           2       1.00      0.75      0.86        16

    accuracy                           0.91        45
   macro avg       0.93      0.92      0.91        45
weighted avg       0.93      0.91      0.91        45



In [209]:
pipeline_model.score(x_train,y_train)

0.9809523809523809

In [210]:
pipeline_model.score(x_test,y_test)

1.0

## Modelo de conjunto: Stacking

In [220]:
from sklearn.ensemble import StackingClassifier

base_models=list_models.copy()
final_model=LogisticRegression(max_iter=200)

In [221]:
cv=StratifiedKFold(n_splits=10)
stacking=StackingClassifier(estimators=base_models,final_estimator=final_model,cv=cv)

In [225]:
stacking.fit(x_train,y_train);

In [226]:
stacking.score(x_train,y_train)

0.9809523809523809

In [227]:
stacking.score(x_test,y_test)

0.9777777777777777

## AdaBoostClassifier

* Metamodelo que usa clasificadores debiles para construir uno mas fuerte.
* La estrategie que usa el observar donde se produjeron predicciones incorrectas para luego centrarse en dichos casos y construir un modelo mas fuerte.

In [228]:
from sklearn.ensemble import AdaBoostClassifier


In [230]:
ada=AdaBoostClassifier(n_estimators=100,random_state=seed)

In [231]:
ada.fit(x_train,y_train)

In [232]:
ada.score(x_train,y_train)

0.9619047619047619

In [233]:
ada.score(x_test,y_test)

0.9333333333333333