In [502]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
import numpy as np
from crepes import WrapClassifier
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import log_loss, brier_score_loss, accuracy_score
from sklearn.calibration import calibration_curve

In [469]:
# Generate a synthetic classification dataset.
# random_state is pinned so the data (and every metric derived from it) is
# reproducible on Restart & Run All, like the seeded splits and forest
# elsewhere in this notebook.
X, y = make_classification(n_samples=100000, n_features=4, random_state=42)

In [470]:
# Two-stage split: hold out 20% of all samples for testing, then take 25% of
# the remainder as a calibration set (0.8 * 0.25 = 20% of the full data).
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_calib, y_train, y_calib = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=42
)

Utilizando a técnica de amostras OOB (out-of-bag) para modelos ensemble

In [471]:
# Random forest with out-of-bag scoring enabled; seeded for reproducibility.
rf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=42)
# Wrap the learner so it can be calibrated as a conformal classifier.
clf = WrapClassifier(rf)
clf.fit(X_train, y_train)
# Calibrate on the training data itself with oob=True.
# NOTE(review): presumably this relies on the forest's out-of-bag predictions
# instead of a held-out set; X_calib / y_calib are not used in the cells
# visible here — confirm whether that split is still needed.
clf.calibrate(X_train, y_train, oob=True)

WrapClassifier(learner=RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=42), calibrated=True, predictor=ConformalClassifier(fitted=True, mondrian=False))

Avaliar o melhor nível de confiança com menor taxa de erro

In [472]:
# Sweep candidate error rates; for each one, build the test prediction sets at
# confidence = 1 - error and summarise them.
#   "efficiency": fraction of prediction sets containing exactly one label.
#   "validity":   total number of included labels divided by the number of
#                 sets, i.e. the average prediction-set size.
error_rate = {}
for error in (0.45, 0.40, 0.35, 0.30, 0.25, 0.20, 0.15, 0.10, 0.05):
    predict_set = clf.predict_set(X_test, confidence=1 - error)
    n_sets = len(predict_set)
    labels_per_set = predict_set.sum(axis=1)
    error_rate[error] = {
        "efficiency": np.sum(labels_per_set == 1) / n_sets,
        "validity": np.sum(predict_set) / n_sets,
    }

In [473]:
def custom_legend(fig, nameSwap):
    """Rename the traces of a figure, which changes their legend labels.

    Args:
        fig: Figure-like object exposing ``data``, an iterable of traces that
            each carry a ``name`` attribute (e.g. a Plotly figure).
        nameSwap: Mapping from a trace's current name to its replacement.

    Returns:
        The same ``fig`` object; trace names are updated in place.

    Traces whose current name is not a key of ``nameSwap`` are left untouched
    rather than raising ``KeyError``.
    """
    for trace in fig.data:
        # Only rename traces that have an explicit replacement.
        if trace.name in nameSwap:
            trace.name = nameSwap[trace.name]
    return fig

In [474]:
def custom_facet_title(fig, titles):
    """Overwrite the text of the figure's layout annotations, in order.

    The i-th entry of ``titles`` replaces the ``'text'`` of the i-th item in
    ``fig.layout.annotations``. Annotations beyond ``len(titles)`` are left
    as they are. Returns the same ``fig`` object.
    """
    for position, text in enumerate(titles):
        annotation = fig.layout.annotations[position]
        annotation['text'] = text
    return fig

In [539]:
# Tabulate the sweep: after transposing, each row is an error rate and each
# column is a metric.
df = pd.DataFrame(error_rate).T

# Line chart (with markers) of both metrics against the error rate.
fig = px.line(
    df,
    x=df.index,
    y=['efficiency', 'validity'],
    labels={'value': 'Escore'},
    markers=True,
    title='Relação Eficiência & Solidez versus Taxa de Erro',
    color_discrete_sequence=['darkblue', 'orange'],
    width=800,
    height=400,
)
fig.update_traces(hovertemplate="%{y}")
# Swap the column names shown in the legend for their display labels.
custom_legend(fig=fig, nameSwap={"efficiency": "Eficiência", "validity": "Solidez"})
fig.update_layout(hovermode="x", legend=dict(title="Métrica"))
fig.update_yaxes(title_text="Escore")
# Last expression: the returned figure is what the cell displays.
fig.update_xaxes(title_text="Taxa de Erro")

Melhor taxa de erro com a maior eficiência

In [523]:
# Take the error rate (index label) with the highest "efficiency" value and
# convert it into the corresponding confidence level.
best_error_rate = df["efficiency"].idxmax()
confidence_level = 1 - best_error_rate

In [375]:
"""
A lower validity (AvgC) value signifies that the model is better at producing more specific and informative
predictions. 

A higher efficiency (OneC) value indicates that the conformal prediction model produces specific and informative
predictions more efficiently. 

Researchers have determined that the most effective approach is to use a margin-based nonconformity
function to achieve a high rate of singleton predictions (OneC).

"""


'\nA lower validity (AvgC) value signifies that the model is better at producing more specific and informative\npredictions. \n\nA higher efficiency (OneC) value indicates that the conformal prediction model produces specific and informative\npredictions more efficiently. \n\nResearchers have determined that the most effective approach is to use a margin-based nonconformity\nfunction to achieve a high rate of singleton predictions (OneC).\n\n'

In [376]:
#Outputs
#Não há confiança para nenhum dos labels. [0, 0]
#Há confiança de apenas um dos labels. [0, 1]; [1, 0]
#Há confiança em ambos os labels. [1, 1]

In [536]:
# Prediction sets on the test data at the selected confidence level; keep only
# column 1 of the result.
# NOTE(review): despite its name, y_prob presumably holds 0/1 set-membership
# flags for the second label rather than probabilities — confirm.
prediction_sets = clf.predict_set(X_test, confidence=confidence_level)
y_prob = prediction_sets[:, 1]

In [537]:
# Score the column-1 membership values against the true test labels.
accuracy_score(y_true=y_test, y_pred=y_prob)

0.9118