# Design Pattern 7: Ensembles

> Refere-se a técnicas de aprendizado de máquina que combinam vários modelos de ML e agregam seus resultados para fazer previsões.

### Bibliotecas

In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, StackingClassifier

warnings.filterwarnings('ignore')

### Base de exemplo

In [2]:
df = pd.read_csv('data/sf_robots.csv', usecols=['Original_Book_Title', 'Book_Description', 'Genres']).dropna()
print(df.shape)
df.head(1)

(1239, 3)


Unnamed: 0,Original_Book_Title,Book_Description,Genres
0,"I, Robot","Isaac Asimov's I, Robot launches readers on an...","{'Science Fiction': 6502, 'Fiction': 2523, 'Cl..."


#### Pré-processamento

In [3]:
def getGenres(genres: dict) -> str:
    keys = list(genres.keys())
    return keys[0]

df['Genres'] = df['Genres'].apply(eval)
df['Genres'] = df['Genres'].apply(getGenres)

top_classes = df['Genres'].value_counts().head(3).index
filtered_df = df[df['Genres'].apply(lambda x: any(genre in top_classes for genre in x.split(',')))]

### Prepara a base

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df['Book_Description'],
                                                    df['Genres'],
                                                    test_size=0.2,
                                                    random_state=42)

### Modelo simples

In [5]:
pipeline = Pipeline([
    ('countvectorizer', CountVectorizer()),
    ('model', DecisionTreeClassifier(random_state=42))
])

pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'Acurácia do modelo: {accuracy.round(2)*100}%')

Acurácia do modelo: 38.0%


### Bagging

In [6]:
pipeline = Pipeline([
    ('countvectorizer', CountVectorizer()),
    ('bagging_model', BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42))
])

pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'Acurácia do modelo de bagging: {accuracy.round(2)*100}%')

Acurácia do modelo de bagging: 48.0%


### Boosting

In [7]:
pipeline = Pipeline([
    ('countvectorizer', CountVectorizer()),
    ('boosting_model', AdaBoostClassifier(random_state=42))
])

pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'Acurácia do modelo de boosting: {accuracy.round(2)*100}%')

Acurácia do modelo de boosting: 33.0%


### Stacking

In [8]:
model_rf = RandomForestClassifier(random_state=42)
model_lr = LogisticRegression(random_state=42)
model_nb = MultinomialNB()

pipeline = Pipeline([
    ('countvectorizer', CountVectorizer()),
    ('stacking_model', StackingClassifier(estimators=[('random_forest', model_rf),
                               ('logistic_regression', model_lr),
                               ('naive_bayes', model_nb)],
                               final_estimator=RandomForestClassifier(random_state=42)))
])

pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'Acurácia do modelo de stacking: {accuracy.round(1)*100}%')

Acurácia do modelo de stacking: 60.0%
