In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
import numpy as np

In [6]:
data = pd.read_csv('data.csv')
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# escalonamento dos dados
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


## Ajustando dados

In [18]:
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

## Decision Tree

In [19]:
decision_tree_model = DecisionTreeClassifier(max_depth=31, criterion='entropy', min_samples_split=7)

# Treine o modelo com o conjunto de treinamento completo
decision_tree_model.fit(X_train_imputed, y_train)

# Faça previsões no conjunto de teste
y_pred = decision_tree_model.predict(X_test_imputed)

# Calcule as métricas de desempenho
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='M')
recall = recall_score(y_test, y_pred, pos_label='M')
f1 = f1_score(y_test, y_pred, pos_label='M')

# Imprima as métricas de desempenho
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.956140350877193
Precision: 0.975
Recall: 0.9069767441860465
F1-score: 0.9397590361445783


## Ada Bossting

In [22]:
ada_boost_model = AdaBoostClassifier(base_estimator=decision_tree_model)

# Definir os hiperparâmetros a serem ajustados
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01, 0.5],
    'n_estimators': [50, 100, 200],
    'base_estimator__max_depth': [3, 5, 7]
}

# Criar o objeto GridSearchCV
grid_search = GridSearchCV(ada_boost_model, param_grid, cv=5)

# Treinar o modelo com o conjunto de treinamento completo
grid_search.fit(X_train_imputed, y_train)

# Obter os melhores hiperparâmetros encontrados
best_paramsAB = grid_search.best_params_

# Obter o modelo com os melhores hiperparâmetros
best_modelAB = grid_search.best_estimator_

# Fazer previsões no conjunto de teste com o modelo ajustado
y_pred = best_modelAB.predict(X_test_imputed)

# Calcular as métricas de desempenho
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='M')
recall = recall_score(y_test, y_pred, pos_label='M')
f1 = f1_score(y_test, y_pred, pos_label='M')

# Imprimir as métricas de desempenho
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)




Accuracy: 0.956140350877193
Precision: 0.975
Recall: 0.9069767441860465
F1-score: 0.9397590361445783


## Random Forest

In [24]:
random_forest_model = RandomForestClassifier()

# Definir os hiperparâmetros a serem ajustados
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7]
}

# Criar o objeto GridSearchCV
grid_search = GridSearchCV(random_forest_model, param_grid, cv=5)

# Treinar o modelo com o conjunto de treinamento completo
grid_search.fit(X_train_imputed, y_train)

# Obter os melhores hiperparâmetros encontrados
best_paramsRF = grid_search.best_params_

# Obter o modelo com os melhores hiperparâmetros
best_modelRF = grid_search.best_estimator_

# Fazer previsões no conjunto de teste com o modelo ajustado
y_pred = best_modelRF.predict(X_test_imputed)

# Calcular as métricas de desempenho
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='M')
recall = recall_score(y_test, y_pred, pos_label='M')
f1 = f1_score(y_test, y_pred, pos_label='M')

# Imprimir as métricas de desempenho
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.9649122807017544
Precision: 0.975609756097561
Recall: 0.9302325581395349
F1-score: 0.9523809523809524


## Retreiando AdaBoosting e RandomForest com os melhores parametros

In [25]:
# Retreinar o modelo AdaBoost com os melhores parâmetros
ada_boost_model = AdaBoostClassifier(learning_rate=best_paramsAB['learning_rate'], n_estimators=best_paramsAB['n_estimators'], base_estimator=decision_tree_model)
ada_boost_model.fit(X_train_imputed, y_train)

# Retreinar o modelo RandomForest com os melhores parâmetros
random_forest_model = RandomForestClassifier(n_estimators=best_paramsRF['n_estimators'], max_depth=best_paramsRF['max_depth'])
random_forest_model.fit(X_train_imputed, y_train)

# Fazer previsões no conjunto de teste com os modelos retreinados
y_pred_ada_boost = ada_boost_model.predict(X_test_imputed)
y_pred_random_forest = random_forest_model.predict(X_test_imputed)

# Calcular as métricas de desempenho para o modelo AdaBoost
accuracy_ada_boost = accuracy_score(y_test, y_pred_ada_boost)
precision_ada_boost = precision_score(y_test, y_pred_ada_boost, pos_label='M')
recall_ada_boost = recall_score(y_test, y_pred_ada_boost, pos_label='M')
f1_ada_boost = f1_score(y_test, y_pred_ada_boost, pos_label='M')

# Calcular as métricas de desempenho para o modelo RandomForest
accuracy_random_forest = accuracy_score(y_test, y_pred_random_forest)
precision_random_forest = precision_score(y_test, y_pred_random_forest, pos_label='M')
recall_random_forest = recall_score(y_test, y_pred_random_forest, pos_label='M')
f1_random_forest = f1_score(y_test, y_pred_random_forest, pos_label='M')

# Imprimir as métricas de desempenho para o modelo AdaBoost
print("AdaBoost - Accuracy:", accuracy_ada_boost)
print("AdaBoost - Precision:", precision_ada_boost)
print("AdaBoost - Recall:", recall_ada_boost)
print("AdaBoost - F1-score:", f1_ada_boost)

# Imprimir as métricas de desempenho para o modelo RandomForest
print("RandomForest - Accuracy:", accuracy_random_forest)
print("RandomForest - Precision:", precision_random_forest)
print("RandomForest - Recall:", recall_random_forest)
print("RandomForest - F1-score:", f1_random_forest)



AdaBoost - Accuracy: 0.9649122807017544
AdaBoost - Precision: 0.975609756097561
AdaBoost - Recall: 0.9302325581395349
AdaBoost - F1-score: 0.9523809523809524
RandomForest - Accuracy: 0.956140350877193
RandomForest - Precision: 0.9523809523809523
RandomForest - Recall: 0.9302325581395349
RandomForest - F1-score: 0.9411764705882352


##### O melhor modelo, com base nos resultados, é o AdaBoost. Ele obteve a maior acurácia e F1-score entre os três modelos avaliados.

In [31]:
feature_importances_ada_boost = ada_boost_model.feature_importances_
sorted_indices_ada_boost = np.argsort(feature_importances_ada_boost)[::-1]

# Exibir a importância das features para o modelo AdaBoost em ordem decrescente
for i in sorted_indices_ada_boost:
    print(f"Feature {i}: {feature_importances_ada_boost[i]}")


Feature 22: 0.346016925328152
Feature 30: 0.13236473644027558
Feature 8: 0.10655052095907552
Feature 15: 0.10464466212460476
Feature 21: 0.07355370481137886
Feature 16: 0.06869582043646873
Feature 7: 0.06353876829292422
Feature 24: 0.04490350763284068
Feature 23: 0.02778063787463962
Feature 6: 0.00519746192549214
Feature 19: 0.004268215827430876
Feature 28: 0.004219225705896217
Feature 2: 0.004170397211777626
Feature 11: 0.0038910121210735536
Feature 0: 0.0037644743798528973
Feature 18: 0.003157248289991537
Feature 27: 0.0016343686787088937
Feature 12: 0.0008518045819167528
Feature 29: 0.0005360412129518573
Feature 9: 7.57135354430796e-05
Feature 13: 6.04333586778475e-05
Feature 14: 3.267225356875205e-05
Feature 20: 3.157398449651679e-05
Feature 25: 2.946867724949768e-05
Feature 5: 1.3889597441307603e-05
Feature 17: 1.0421997735814434e-05
Feature 26: 5.211759621536172e-06
Feature 4: 5.97625094371499e-07
Feature 3: 4.833752191039775e-07
Feature 1: 1.299151592959688e-19
Feature 10: 0.0


## Retreinando modelo com 10 features mais importantes

In [30]:
feature_importancesAB = best_modelAB.feature_importances_
feature_importancesRF = best_modelRF.feature_importances_

sorted_indicesAB = np.argsort(feature_importancesAB)[::-1]
sorted_indicesRF = np.argsort(feature_importancesRF)[::-1]

top_feature_indicesAB = sorted_indicesAB[:10]
top_feature_indicesRF = sorted_indicesRF[:10]

X_train_top_featuresAB = np.take(X_train, top_feature_indicesAB, axis=1)
X_test_top_featuresAB = np.take(X_test, top_feature_indicesAB, axis=1)

X_train_top_featuresRF = np.take(X_train, top_feature_indicesRF, axis=1)
X_test_top_featuresRF = np.take(X_test, top_feature_indicesRF, axis=1)

best_model_top_featuresAB = best_modelAB.fit(X_train_top_featuresAB, y_train)
best_model_top_featuresRF = best_modelRF.fit(X_train_top_featuresRF, y_train)

y_pred_top_featuresAB = best_model_top_featuresAB.predict(X_test_top_featuresAB)
y_pred_top_featuresRF = best_model_top_featuresRF.predict(X_test_top_featuresRF)

accuracy_top_featuresAB = accuracy_score(y_test, y_pred_top_featuresAB)
precision_top_featuresAB = precision_score(y_test, y_pred_top_featuresAB, pos_label='M')
recall_top_featuresAB = recall_score(y_test, y_pred_top_featuresAB, pos_label='M')
f1_top_featuresAB = f1_score(y_test, y_pred_top_featuresAB, pos_label='M')

accuracy_top_featuresRF = accuracy_score(y_test, y_pred_top_featuresRF)
precision_top_featuresRF = precision_score(y_test, y_pred_top_featuresRF, pos_label='M')
recall_top_featuresRF = recall_score(y_test, y_pred_top_featuresRF, pos_label='M')
f1_top_featuresRF = f1_score(y_test, y_pred_top_featuresRF, pos_label='M')

print("AdaBoost - Accuracy:", accuracy_top_featuresAB)
print("AdaBoost - Precision:", precision_top_featuresAB)
print("AdaBoost - Recall:", recall_top_featuresAB)
print("AdaBoost - F1-score:", f1_top_featuresAB)

print("RandomForest - Accuracy:", accuracy_top_featuresRF)
print("RandomForest - Precision:", precision_top_featuresRF)
print("RandomForest - Recall:", recall_top_featuresRF)
print("RandomForest - F1-score:", f1_top_featuresRF)



AdaBoost - Accuracy: 0.9473684210526315
AdaBoost - Precision: 0.9743589743589743
AdaBoost - Recall: 0.8837209302325582
AdaBoost - F1-score: 0.9268292682926831
RandomForest - Accuracy: 0.9473684210526315
RandomForest - Precision: 0.9302325581395349
RandomForest - Recall: 0.9302325581395349
RandomForest - F1-score: 0.9302325581395349
