<a href="https://colab.research.google.com/github/Harrisson3/Breast-Cancer-Dataset/blob/main/BreastCancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Step 2: Load the dataset from GitHub
url = "https://raw.githubusercontent.com/Harrisson3/Breast-Cancer-Dataset/main/data.csv"
try:
    df = pd.read_csv(url)
    print("Dataset loaded successfully!")
except Exception as e:
    print(f"Error loading dataset: {e}")

# Display the first few rows of the dataset
print("\nFirst few rows of the dataset:")
print(df.head())

# Step 3: Prepare the data
# Assuming the target column is named 'Outcome' (binary classification: 0 or 1)
X = df.drop('Outcome', axis=1)  # Features
y = df['Outcome']  # Labels

# Step 4: Check the sample and target sizes
print("\nSample size (number of rows):", X.shape[0])
print("Number of features (columns):", X.shape[1])
print("Target size (number of labels):", y.shape[0])

# Step 5: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=11)

# Step 6: Check the training and testing set sizes
print("\nTraining set size (X_train):", X_train.shape[0])
print("Testing set size (X_test):", X_test.shape[0])

# Step 7: Create the Gaussian Naive Bayes model
nb = GaussianNB()

# Step 8: Train the model
nb.fit(X_train, y_train)

# Step 9: Predict using the trained model
y_pred = nb.predict(X_test)

# Step 10: Evaluate the model using the score method
accuracy = nb.score(X_test, y_test)
print("\nAccuracy of GaussianNB:", accuracy)

# Step 11: Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# Step 12: Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Step 13: Visualize the Confusion Matrix
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No Diabetes', 'Diabetes'],
            yticklabels=['No Diabetes', 'Diabetes'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for GaussianNB')
plt.show()

# Step 14: K-Fold Cross-Validation
kfold = KFold(n_splits=10, random_state=11, shuffle=True)
scores = cross_val_score(estimator=nb, X=X, y=y, cv=kfold)
print("\nK-Fold Cross-Validation Scores (GaussianNB):", scores)
print("Mean Cross-Validation Accuracy (GaussianNB):", scores.mean())

# Step 15: Running Multiple Models to Find the Best One
# Define the models to evaluate
estimators = {
    'GaussianNB': GaussianNB(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(solver='lbfgs', max_iter=10000),
    'SVC': SVC(gamma='scale')
}

# Evaluate each model using K-Fold Cross-Validation
print("\nEvaluating Multiple Models:")
for model_name, model in estimators.items():
    scores = cross_val_score(estimator=model, X=X, y=y, cv=kfold)
    print(f"{model_name} - Mean Cross-Validation Accuracy: {scores.mean():.4f}")

Dataset loaded successfully!

First few rows of the dataset:
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

 

KeyError: "['Outcome'] not found in axis"