In [None]:
# Import necessary libraries
import pandas as pd

# Read the breast-cancer CSV (path is relative to the notebook's working directory)
csv_path = "Breast Cancer Wisconsin.csv"
data = pd.read_csv(csv_path)

# Leave the preview as the final expression so the notebook renders it as a table
data.head()


In [None]:
# --- Structural overview: dimensions, then the dtype of every column ---
dataset_shape = data.shape
print("Shape of the dataset:", dataset_shape)
print("\nColumns and Data Types:")
column_types = data.dtypes
print(column_types)

# --- Data quality: number of null entries in each column ---
print("\nMissing Values:")
null_counts = data.isnull().sum()
print(null_counts)

# --- Label balance: how many rows carry each diagnosis value ---
print("\nTarget Distribution:")
diagnosis_counts = data["diagnosis"].value_counts()
print(diagnosis_counts)


In [None]:
# Prepare the frame for modelling. Every step is written so that re-running
# this cell on an already-cleaned `data` is a no-op rather than an error or
# silent corruption.

# Drop unnecessary columns. errors="ignore" tolerates "id" already being gone;
# the earlier in-place drop raised KeyError when the cell was executed twice.
data = data.drop(columns="id", errors="ignore")

# Drop columns that hold no values at all (no-op when there are none).
# NOTE(review): some exports of this dataset reportedly ship an empty trailing
# column — confirm against the missing-value counts printed earlier. Such a
# column carries no information, and SMOTE's input validation is expected to
# reject NaN in the feature matrix.
data = data.dropna(axis=1, how="all")

# Encode the target variable: M → 1, B → 0.
# Only map while the column still holds the raw string labels: mapping the
# already-encoded 1/0 values a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(data["diagnosis"]):
    data = data.assign(diagnosis=data["diagnosis"].map({"M": 1, "B": 0}))


In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter

# Separate the label column from the predictor columns
y = data["diagnosis"]
X = data.drop(columns="diagnosis")

# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Balance the classes with SMOTE on the training split only —
# the test split is left untouched
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Report the per-class counts on either side of the resampling step
for stage_label, stage_target in (
    ("Before SMOTE:", y_train),
    ("After SMOTE:", y_train_smote),
):
    print(stage_label, Counter(stage_target))


In [None]:
from xgboost import XGBClassifier

# Initialize and train the XGBoost model on the SMOTE-balanced training data.
#
# `use_label_encoder` is deliberately not passed: it has been deprecated since
# XGBoost 1.6 and is no longer a recognised parameter in 2.x, where supplying
# it only produces a "Parameters ... are not used" warning. The target was
# already mapped to integer 0/1 earlier in the notebook, so no label encoding
# is needed here.
xgb = XGBClassifier(eval_metric="logloss", random_state=42)

# Kept as the final expression so the notebook displays the fitted estimator
xgb.fit(X_train_smote, y_train_smote)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Score the held-out split with the fitted model
y_pred = xgb.predict(X_test)

# Confusion matrix of true vs. predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Per-class text report from scikit-learn
class_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", class_report)

# Overall accuracy on the test split
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", test_accuracy)
