# Questão 1

In [1]:
from scipy import stats
import numpy as np

def t_corrigido_nadeau_bengio(data1, data2, X, n_folds_externos):
    """Corrected resampled paired t-test (Nadeau & Bengio) on two score vectors.

    The plain paired t-test underestimates the variance when the scores come
    from overlapping cross-validation training sets; here the usual ``1/n``
    factor is replaced by ``1/n + n_test/n_train``.

    Parameters
    ----------
    data1, data2 : array-like of float
        Per-run scores of the two models, same length and paired by position
        (element ``i`` of both must come from the same train/test split).
    X : sized
        The dataset; only ``len(X)`` is used, to estimate the split sizes.
    n_folds_externos : int
        Number of folds of the outer cross-validation.

    Returns
    -------
    (t_stat, p_valor)
        The corrected t statistic and its two-sided p-value, using a Student t
        distribution with ``len(data1) - 1`` degrees of freedom.

    Raises
    ------
    ValueError
        If ``data1`` and ``data2`` do not have the same shape.
    """
    scores_a = np.asarray(data1, dtype=float)
    scores_b = np.asarray(data2, dtype=float)
    if scores_a.shape != scores_b.shape:
        # Without this check a length-1 input would be silently broadcast.
        raise ValueError(
            "data1 and data2 must have the same shape, got "
            f"{scores_a.shape} and {scores_b.shape}"
        )

    N = len(X)  # total number of samples in the dataset
    n = len(scores_a)  # number of runs (e.g. 3 repeats x 10 folds = 30)

    # Estimated test/train sizes for each outer fold
    n_test = N // n_folds_externos
    n_train = N - n_test

    # t statistic with the variance correction
    diffs = scores_a - scores_b
    mean_diff = np.mean(diffs)
    std_diff = np.std(diffs, ddof=1)

    se_corrigido = std_diff * np.sqrt(1/n + n_test/n_train)
    t_stat = mean_diff / se_corrigido

    # `stats.t` is used (not a bare `t`) so the function only depends on the
    # imports of its own cell; the survival function avoids the precision loss
    # of `1 - cdf` for large |t|.
    p_valor = 2 * stats.t.sf(abs(t_stat), df=n - 1)

    return t_stat, p_valor

def classification_report(scores):
    """Print mean, standard deviation and a 95% interval for a set of scores.

    Parameters
    ----------
    scores : array-like of float
        Scores to summarise; lists and tuples are accepted as well as arrays.

    Prints two lines: the mean and standard deviation (NumPy default, i.e.
    population std with ``ddof=0``), then a 95% normal-approximation interval
    for the mean built from ``std / sqrt(len(scores))``. Returns ``None``.

    NOTE(review): the name matches ``sklearn.metrics.classification_report``;
    importing that function after this definition would shadow this helper.
    """
    # Coerce once so plain Python sequences work, not only ndarrays.
    scores = np.asarray(scores, dtype=float)
    media = scores.mean()
    desvio = scores.std()

    print(f'Media: {media:.2f}, Desvio Padrao: {desvio:.2f}')
    inf, sup = stats.norm.interval(0.95, loc=media, scale=desvio/np.sqrt(len(scores)))
    print(f'Intervalo de confiança (95%): [{inf:.2f}, {sup:.2f}]')

In [2]:
from scipy import stats
from scipy.stats import t
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import datasets

# Gaussian Naive Bayes on the breast-cancer dataset, evaluated with
# 6 repetitions of stratified 5-fold cross-validation (30 accuracy scores).
breast = datasets.load_breast_cancer()
breast_X = breast.data
breast_y = breast.target

gNB = GaussianNB()

# Fixed seed so the fold assignment, and therefore the printed scores,
# are the same on every Restart & Run All.
rkf = RepeatedStratifiedKFold(n_splits=5, n_repeats=6, random_state=0)

scores = cross_val_score(gNB, breast_X, breast_y, scoring='accuracy', cv=rkf)

print(scores)
classification_report(scores)

[0.92982456 0.92105263 0.93859649 0.95614035 0.95575221 0.96491228
 0.92105263 0.98245614 0.93859649 0.90265487 0.94736842 0.93859649
 0.92982456 0.92982456 0.95575221 0.94736842 0.92105263 0.95614035
 0.92105263 0.95575221 0.95614035 0.94736842 0.92982456 0.85087719
 0.99115044 0.95614035 0.97368421 0.88596491 0.94736842 0.92920354]
Media: 0.94, Desvio Padrao: 0.03
Intervalo de confiança (95%): [0.93, 0.95]


# Questão 2

In [3]:
from sklearn.neural_network import MLPClassifier

# MLP on the breast-cancer dataset, evaluated with 6 repetitions of
# stratified 5-fold cross-validation (30 accuracy scores).
breast = datasets.load_breast_cancer()
breast_X = breast.data
breast_y = breast.target

# Both the network initialisation and the fold assignment are random;
# seeding them makes the printed scores reproducible.
# NOTE(review): the features are fed to the MLP unscaled and with default
# hyper-parameters — confirm this is what the exercise asks for.
mlp = MLPClassifier(random_state=0)
rkf = RepeatedStratifiedKFold(n_splits=5, n_repeats=6, random_state=0)
scores = cross_val_score(mlp, breast_X, breast_y, scoring='accuracy', cv=rkf)

print(scores)
classification_report(scores)



[0.92105263 0.92105263 0.9122807  0.96491228 0.97345133 0.93859649
 0.9122807  0.92982456 0.89473684 0.9380531  0.95614035 0.93859649
 0.93859649 0.93859649 0.90265487 0.98245614 0.9122807  0.93859649
 0.9122807  0.95575221 0.92982456 0.92982456 0.92105263 0.92982456
 0.83185841 0.92105263 0.92982456 0.93859649 0.95614035 0.92920354]
Media: 0.93, Desvio Padrao: 0.03
Intervalo de confiança (95%): [0.92, 0.94]


# Questão 3

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import datasets

# Random forest on the breast-cancer dataset with nested cross-validation:
# the inner 4-fold grid search picks n_estimators, the outer 6x5 repeated
# stratified k-fold estimates the accuracy of the tuned model.
breast = datasets.load_breast_cancer()
breast_X = breast.data
breast_y = breast.target

scalar = StandardScaler()
# Seed both the outer splitter and the forest so the printed scores are
# reproducible on every Restart & Run All.
rkf = RepeatedStratifiedKFold(n_splits=5, n_repeats=6, random_state=0)
rF = RandomForestClassifier(random_state=0)

# Scaling lives inside the pipeline so it is refit on each training split.
pipeline = Pipeline([('transformer', scalar), ('estimator', rF)])
grade = {'estimator__n_estimators': [10, 20, 50, 100]}
gs = GridSearchCV(estimator=pipeline, param_grid=grade, scoring='accuracy', cv=4)

scores = cross_val_score(gs, breast_X, breast_y, scoring='accuracy', cv=rkf)

print(scores)
classification_report(scores)

[0.94736842 0.92105263 0.97368421 0.96491228 0.98230088 0.96491228
 0.99122807 0.92982456 0.96491228 0.96460177 0.93859649 0.97368421
 0.95614035 0.99122807 0.97345133 0.95614035 0.96491228 0.94736842
 0.94736842 0.98230088 0.96491228 0.95614035 0.95614035 0.94736842
 0.96460177 0.95614035 0.99122807 0.98245614 0.95614035 0.92035398]
Media: 0.96, Desvio Padrao: 0.02
Intervalo de confiança (95%): [0.95, 0.97]


# Questão 4

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import datasets

# Bagging vs. AdaBoost (both over Gaussian Naive Bayes) on the wine dataset,
# compared with the corrected paired t-test.
wine = datasets.load_wine()
wine_X = wine.data
wine_Y = wine.target

scalar = StandardScaler()

# The t-test below subtracts scores1 - scores2 element by element, so score i
# of both models must come from the same train/test split. One seeded splitter
# is therefore shared by both evaluations: with an integer random_state it
# yields the same folds every time it is iterated.
n_folds_externos = 10
rkf = RepeatedStratifiedKFold(n_splits=n_folds_externos, n_repeats=3, random_state=0)

# --- Bagging ---
parameters = {'bagging__n_estimators': [10, 25, 50]}
bg = BaggingClassifier(estimator=GaussianNB(), random_state=0)
pipe = Pipeline([('scaler', StandardScaler()), ('bagging', bg)])
clf = GridSearchCV(pipe, parameters, cv=4)

scores1 = cross_val_score(clf, wine_X, wine_Y, cv=rkf)
classification_report(scores1)

# --- AdaBoost ---
parameters = {'boosting__n_estimators': [10, 25, 50]}
adb = AdaBoostClassifier(estimator=GaussianNB(), random_state=0)
pipe = Pipeline([('scaler', StandardScaler()), ('boosting', adb)])
clf = GridSearchCV(pipe, parameters, cv=4)

scores2 = cross_val_score(clf, wine_X, wine_Y, cv=rkf)
classification_report(scores2)

print("Teste t corrigido")
# The fold count comes from the same variable that configured the splitter,
# so the test's train/test size estimate cannot drift from the actual CV.
s, p = t_corrigido_nadeau_bengio(scores1, scores2, wine_X, n_folds_externos)
print("t: %0.2f p-value: %0.2f\n" % (s, p))
if p < 0.05:
    print("➡️ Diferença significativa a 95% de confiança.")
else:
    print("➡️ Diferença NÃO significativa a 95% de confiança.")

Media: 0.98, Desvio Padrao: 0.03
Intervalo de confiança (95%): [0.96, 0.99]
Media: 0.97, Desvio Padrao: 0.04
Intervalo de confiança (95%): [0.96, 0.99]
Teste t corrigido
t: 0.10 p-value: 0.92

➡️ Diferença NÃO significativa a 95% de confiança.
