In [49]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.5/124.9 MB 15.7 MB/s eta 0:00:08
    --------------------------------------- 2.0/124.9 MB 25.2 MB/s eta 0:00:05
   - -------------------------------------- 5.4/124.9 MB 34.3 MB/s eta 0:00:04
   -- ------------------------------------- 7.6/124.9 MB 37.4 MB/s eta 0:00:04
   --- ------------------------------------ 11.1/124.9 MB 50.4 MB/s eta 0:00:03
   --- ------------------------------------ 12.5/124.9 MB 50.4 MB/s eta 0:00:03
   ---- ----------------------------------- 14.4/124.9 MB 50.4 MB/s eta 0:00:03
   ----- ---------------------------------- 15.7/124.9 MB 43.7 MB/s eta 0:00:03
   ----- ---------------------------------- 16.1/124.9 MB 43.5 MB/s eta 0:00:03
   ----- ---------------------------------- 16.1/124.9 MB 43.5 MB

In [77]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from ucimlrepo import fetch_ucirepo 

In [75]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer_wisconsin_original = fetch_ucirepo(id=15) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_original.data.features 
y = breast_cancer_wisconsin_original.data.targets 
  
# metadata 
print(breast_cancer_wisconsin_original.metadata) 
  
# variable information 
print(breast_cancer_wisconsin_original.variables) 


{'uci_id': 15, 'name': 'Breast Cancer Wisconsin (Original)', 'repository_url': 'https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original', 'data_url': 'https://archive.ics.uci.edu/static/public/15/data.csv', 'abstract': 'Original Wisconsin Breast Cancer Database', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 699, 'num_features': 9, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['Class'], 'index_col': ['Sample_code_number'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1990, 'last_updated': 'Sun Mar 10 2024', 'dataset_doi': '10.24432/C5HP4Z', 'creators': ['WIlliam Wolberg'], 'intro_paper': None, 'additional_info': {'summary': "Samples arrive periodically as Dr. Wolberg reports his clinical cases. The database therefore reflects this chronological grouping of the data. This grouping information appears immediately below, having been removed fro

In [79]:
breast_cancer_wisconsin_original = fetch_ucirepo(id=15)

In [81]:
X = breast_cancer_wisconsin_original.data.features
y = breast_cancer_wisconsin_original.data.targets

In [83]:
print("First few rows of dataset:")
print(X.head())

First few rows of dataset:
   Clump_thickness  Uniformity_of_cell_size  Uniformity_of_cell_shape  \
0                5                        1                         1   
1                5                        4                         4   
2                3                        1                         1   
3                6                        8                         8   
4                4                        1                         1   

   Marginal_adhesion  Single_epithelial_cell_size  Bare_nuclei  \
0                  1                            2          1.0   
1                  5                            7         10.0   
2                  1                            2          2.0   
3                  1                            3          4.0   
4                  3                            2          1.0   

   Bland_chromatin  Normal_nucleoli  Mitoses  
0                3                1        1  
1                3                2        

In [85]:
y = np.where(y == 2, 0, 1)

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [89]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [109]:
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors (k=5)": KNeighborsClassifier(n_neighbors=5),
    "Linear SVM": SVC(kernel="linear"),
    "Kernel SVM (RBF)": SVC(kernel="rbf"),
    "Naïve Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest (n_estimators=10)": RandomForestClassifier(n_estimators=10),
    "XGBoost": xgb.XGBClassifier(eval_metric="logloss", verbosity=0)
}

In [112]:
X_train = pd.DataFrame(X_train).dropna().values
X_test = pd.DataFrame(X_test).dropna().values
y_train = y_train[:X_train.shape[0]]  
y_test = y_test[:X_test.shape[0]]


In [114]:
for name, model in models.items():
    model.fit(X_train, y_train)  
    y_pred = model.predict(X_test) 

    conf_matrix = confusion_matrix(y_test, y_pred)

    TN, FP, FN, TP = conf_matrix.ravel()  
    
    manual_accuracy = (TP + TN) / (TP + TN + FP + FN)

    print(f"\n{name} Model:")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"True Positives (TP): {TP}")
    print(f"False Positives (FP): {FP}")
    print(f"False Negatives (FN): {FN}")
    print(f"True Negatives (TN): {TN}")
    print(f"Accuracy: {manual_accuracy:.4f}")



Logistic Regression Model:
Confusion Matrix:
[[104  10]
 [ 48   6]]
True Positives (TP): 6
False Positives (FP): 10
False Negatives (FN): 48
True Negatives (TN): 104
Accuracy: 0.6548

K-Nearest Neighbors (k=5) Model:
Confusion Matrix:
[[92 22]
 [48  6]]
True Positives (TP): 6
False Positives (FP): 22
False Negatives (FN): 48
True Negatives (TN): 92
Accuracy: 0.5833

Linear SVM Model:
Confusion Matrix:
[[114   0]
 [ 54   0]]
True Positives (TP): 0
False Positives (FP): 0
False Negatives (FN): 54
True Negatives (TN): 114
Accuracy: 0.6786

Kernel SVM (RBF) Model:
Confusion Matrix:
[[108   6]
 [ 49   5]]
True Positives (TP): 5
False Positives (FP): 6
False Negatives (FN): 49
True Negatives (TN): 108
Accuracy: 0.6726

Naïve Bayes Model:
Confusion Matrix:
[[85 29]
 [40 14]]
True Positives (TP): 14
False Positives (FP): 29
False Negatives (FN): 40
True Negatives (TN): 85
Accuracy: 0.5893

Decision Tree Model:
Confusion Matrix:
[[89 25]
 [37 17]]
True Positives (TP): 17
False Positives (FP): 