**Classificação da base de câncer de mama**

**Guilherme Henrique Pereira Serafini - 2021.1.08.048**


**Vinícius Eduardo de Souza Honório - 2021.1.08.024**

In [None]:
# importar os pacotes necessários
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from collections import Counter

# Load the data
# NOTE(review): read_csv treats the first row as a header by default. If this CSV
# has no header row, pass header=None so the first sample is not consumed as
# column names — confirm against the file.
data = pd.read_csv('https://raw.githubusercontent.com/megaVE/LearningVerification2/main/wdbc.csv')
columns = ['ID', 'Diagnosis'] + [f'Feature_{i}' for i in range(1, 31)]
data.columns = columns

# Drop rows containing missing values
data = data.dropna()

# Raw (unscaled) feature table: everything except the identifier and the target
features = data.drop(['ID', 'Diagnosis'], axis=1)

# Label-encode the target variable
y = LabelEncoder().fit_transform(data['Diagnosis'])

# Split BEFORE scaling, with a fixed seed so the split (and every number
# derived from it) is reproducible across kernel restarts.
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.3, random_state=42
)

# Standardize the numeric columns. The scaler is fit on the training rows only,
# so the mean/std of the test rows never leak into the training pipeline; the
# test rows are then transformed with the training statistics.
scaler = StandardScaler().fit(features_train)
X_train = scaler.transform(features_train)
X_test = scaler.transform(features_test)

# Full scaled matrix, kept under the original name for any code that still uses X.
X = scaler.transform(features)

# Build 4 training subsets, each a random 80% sample of the training data.
# Every subset gets its own seed: with a single shared random_state each call
# returns exactly the same rows, so the four "subsets" would be identical and
# the models trained on them would be redundant copies.
X_train_subconjuntos = []
y_train_subconjuntos = []
for subset_idx in range(4):
    # The discarded 20% parts are not needed; only the 80% sample is kept.
    X_subconjunto, _, y_subconjunto, _ = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42 + subset_idx
    )
    X_train_subconjuntos.append(X_subconjunto)
    y_train_subconjuntos.append(y_subconjunto)

###################################################################################################
# Sample 15 distinct random-forest hyperparameter combinations.
random_forest_params_variations = []
while len(random_forest_params_variations) < 15:
    # The draws are made in the order n_estimators, max_depth, min_samples_split.
    n_trees = np.random.choice([100, 300, 500])
    depth_limit = np.random.choice([None, 5, 10])
    split_threshold = np.random.choice([2, 5, 10])
    candidate = {
        'n_estimators': n_trees,
        'max_depth': depth_limit,
        'min_samples_split': split_threshold,
    }
    if candidate in random_forest_params_variations:
        continue  # already sampled; draw again
    random_forest_params_variations.append(candidate)

# For every combination, train one forest per training subset (indices 0..3).
# Resulting order: all four subset models of the first combination, then the
# four of the second, and so on (60 models in total).
random_forest_modelos = []
for params in random_forest_params_variations:
    for subset_idx in range(4):
        forest = RandomForestClassifier(**params)
        forest.fit(X_train_subconjuntos[subset_idx], y_train_subconjuntos[subset_idx])
        random_forest_modelos.append(forest)

# Sample 15 distinct XGBoost hyperparameter combinations.
xgb_params_variations = []
while len(xgb_params_variations) < 15:
    # Keys are filled in the order n_estimators, learning_rate, max_depth.
    sampled = {}
    sampled['n_estimators'] = np.random.choice([50, 100, 200])
    sampled['learning_rate'] = np.random.choice([0.1, 0.01, 0.001])
    sampled['max_depth'] = np.random.choice([3, 5, 7, 9])
    if sampled not in xgb_params_variations:
        xgb_params_variations.append(sampled)

# For every combination, train one classifier per training subset (indices 0..3),
# appended in subset order.
xgb_modelos = []
for params in xgb_params_variations:
    for subset_idx in range(4):
        booster = XGBClassifier(**params)
        booster.fit(X_train_subconjuntos[subset_idx], y_train_subconjuntos[subset_idx])
        xgb_modelos.append(booster)

# Sample 15 distinct SVM hyperparameter combinations.
# NOTE(review): per the scikit-learn docs, 'degree' is only used by the 'poly'
# kernel, so combinations that differ only in degree for 'linear'/'rbf' kernels
# describe the same model.
svm_params_variations = []
while len(svm_params_variations) < 15:
    # The draws are made in the order C, kernel, degree.
    penalty = np.random.choice([0.1, 1.0, 10.0])
    kernel_name = np.random.choice(['linear', 'rbf', 'poly'])
    poly_degree = np.random.choice([2, 3, 4])
    candidate = {'C': penalty, 'kernel': kernel_name, 'degree': poly_degree}
    if candidate in svm_params_variations:
        continue  # already sampled; draw again
    svm_params_variations.append(candidate)

# For every combination, train one SVC per training subset (indices 0..3),
# appended in subset order.
svm_modelos = []
for params in svm_params_variations:
    for subset_idx in range(4):
        classifier = SVC(**params)
        classifier.fit(X_train_subconjuntos[subset_idx], y_train_subconjuntos[subset_idx])
        svm_modelos.append(classifier)


# Sample 15 distinct decision-tree hyperparameter combinations.
decision_tree_params_variations = []
while len(decision_tree_params_variations) < 15:
    # Keys are filled in the order max_depth, min_samples_split, min_samples_leaf.
    sampled = {}
    sampled['max_depth'] = np.random.choice([None, 5, 10])
    sampled['min_samples_split'] = np.random.choice([2, 5, 10])
    sampled['min_samples_leaf'] = np.random.choice([1, 2, 4])
    if sampled not in decision_tree_params_variations:
        decision_tree_params_variations.append(sampled)

# For every combination, train one tree per training subset (indices 0..3),
# appended in subset order.
decision_tree_modelos = []
for params in decision_tree_params_variations:
    for subset_idx in range(4):
        tree = DecisionTreeClassifier(**params)
        tree.fit(X_train_subconjuntos[subset_idx], y_train_subconjuntos[subset_idx])
        decision_tree_modelos.append(tree)

# print(len(random_forest_modelos))
# print(len(xgb_modelos))
# print(len(svm_modelos))
# print(len(decision_tree_modelos))



# Pair every trained model with its display label. Keeping label and estimator
# together removes the offset arithmetic that previously mapped a global index
# back to a per-family list; that arithmetic selected a decision tree for every
# entry labelled 'SVM'.
candidatos = (
    [('Foresta aleatoria', m) for m in random_forest_modelos]
    + [('Xgb', m) for m in xgb_modelos]
    + [('SVM', m) for m in svm_modelos]
    + [('Arvore decisão', m) for m in decision_tree_modelos]
)

# Accuracy of each candidate on the test data, in the same order as `candidatos`.
# NOTE(review): the models are ranked on the same test set that is later used to
# score the ensemble, so that final score is optimistic; a separate validation
# split for this ranking would give an unbiased estimate.
model_accuracies = [
    accuracy_score(y_test, candidate_model.predict(X_test))
    for _, candidate_model in candidatos
]

# Indices of the 24 most accurate models (ties keep their original order).
top_indices = sorted(range(len(model_accuracies)), key=lambda i: model_accuracies[i], reverse=True)[:24]

# (label, accuracy) pairs for reporting, and the matching estimators for the ensemble.
modelos_melhores = [(candidatos[i][0], model_accuracies[i]) for i in top_indices]
modelos_lista = [candidatos[i][1] for i in top_indices]

# Print the selected models
print("Modelos selecionados:")
for model_name, accuracy in modelos_melhores:
    print(f"{model_name} : {accuracy:.4f}")






Modelos selecionados:
SVM : 0.9766
SVM : 0.9766
SVM : 0.9766
SVM : 0.9766
SVM : 0.9766
SVM : 0.9766
SVM : 0.9766
SVM : 0.9766
SVM : 0.9708
SVM : 0.9708
SVM : 0.9708
SVM : 0.9708
SVM : 0.9708
SVM : 0.9708
SVM : 0.9708
SVM : 0.9708
SVM : 0.9649
SVM : 0.9649
SVM : 0.9649
SVM : 0.9649
SVM : 0.9649
SVM : 0.9649
SVM : 0.9649
SVM : 0.9649


In [None]:
from scipy.stats import mode

# Majority-vote ensemble over the models in `modelos_lista`, evaluated on
# X_test against the true labels y_test.

# One row of predictions per model, one column per test sample. The matrix is
# float-typed, so the predicted class labels are stored as floats.
previsoes_modelos = np.array(
    [modelo.predict(X_test) for modelo in modelos_lista], dtype=float
).reshape(len(modelos_lista), len(X_test))

# The most frequent prediction in each column is the ensemble's vote;
# the result is reshaped to a 1-D array with one entry per test sample.
resultado_moda = mode(previsoes_modelos, axis=0, keepdims=True)
previsao_final = resultado_moda.mode.reshape(-1)

# Fraction of test samples where the ensemble vote matches the true label.
acuracia_ensemble = (previsao_final == y_test).mean()
print("Acurácia do ensemble usando a moda:", acuracia_ensemble)


Acurácia do ensemble usando a moda: 0.9707602339181286
