<a href="https://colab.research.google.com/github/Medasimone/Files/blob/main/Module4BreastCancerWisconsin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install pandas numpy scikit-learn imbalanced-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression  # Import the LogisticRegression class
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from scipy import stats

# Set visual preferences.
# sns.set is only an alias for set_theme, which is seaborn's preferred entry point.
sns.set_theme(style="whitegrid")



In [None]:
# Location of the raw data file in the UCI repository.
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'

# The file is read without a header row, so all 32 column names are supplied
# here: two leading columns, then ten measurements repeated for each of the
# three suffixes (mean, se, worst) in that order.
column_names = ['id', 'diagnosis'] + [
    f'{measurement}_{statistic}'
    for statistic in ('mean', 'se', 'worst')
    for measurement in (
        'radius', 'texture', 'perimeter', 'area', 'smoothness',
        'compactness', 'concavity', 'concave_points', 'symmetry',
        'fractal_dimension',
    )
]

# Read the remote CSV into a DataFrame, attaching the names built above.
data = pd.read_csv(data_url, names=column_names, header=None)


In [None]:
# Prepare the data for analysis by cleaning and transforming it.
# Every step below is safe to re-run: it is either a no-op or skipped when it
# has already been applied to `data`.

# Drop the 'id' column as it's not useful for analysis.
# errors='ignore' keeps a second execution from failing once 'id' is gone.
data = data.drop(columns='id', errors='ignore')

# Convert diagnosis to a binary variable: M = 1, B = 0.
# Skipped when the column is already integer-coded (i.e. the cell was re-run),
# because mapping the integers again would turn every value into NaN.
if not pd.api.types.is_integer_dtype(data['diagnosis']):
    diagnosis_encoded = data['diagnosis'].map({'M': 1, 'B': 0})

    # .map() yields NaN for any label other than 'M' or 'B'. This has to be
    # checked *before* the integer cast: casting NaN to int raises on its own,
    # so a NaN check placed after the cast can never report anything.
    unmapped_count = int(diagnosis_encoded.isna().sum())
    if unmapped_count:
        raise ValueError(
            f"{unmapped_count} diagnosis value(s) could not be mapped to 0/1 "
            "(expected only 'M' or 'B')."
        )

    data['diagnosis'] = diagnosis_encoded.astype(int)

# Confirm the transformation: unique values, dtype and missing-value count of
# the target column.
print("Unique values in diagnosis column:", data['diagnosis'].unique())
print("Data type of diagnosis column:", data['diagnosis'].dtype)
print("NaN values in diagnosis column:", data['diagnosis'].isna().sum())

Unique values in diagnosis column: [1 0]
Data type of diagnosis column: int64
NaN values in diagnosis column: 0


In [None]:
# Separate the features and the target variable in preparation for resampling.
X = data.drop(columns='diagnosis')  # Features
y = data['diagnosis']  # Target variable

# Identify numeric columns after dropping 'diagnosis'.
# 'number' matches every numeric dtype instead of only float64/int64, so
# columns stored as e.g. float32 or int32 are not silently left out.
numeric_cols = X.select_dtypes(include='number').columns

# Check the unique values in y
print("Unique values in target variable (y):", np.unique(y))
print("Data type of target variable (y):", y.dtype)

# Ensure that y is an integer-coded target with exactly two classes.
# is_integer_dtype accepts any integer width; comparing the dtype to 'int'
# only matches the platform's default integer size.
if y.nunique() != 2 or not pd.api.types.is_integer_dtype(y):
    raise ValueError("Target variable y is not binary or not of integer type.")


Unique values in target variable (y): [0 1]
Data type of target variable (y): int64


In [None]:
# Put every numeric feature on a common scale before modelling.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before
# evaluate_model() performs its train/test split, so test rows contribute to
# the scaling statistics.
scaler = StandardScaler().fit(X[numeric_cols])

# Overwrite the numeric columns of X with their standardized values.
X[numeric_cols] = scaler.transform(X[numeric_cols])

# Preview the first rows of the standardized feature table.
print("Standardized Data:")
print(X.head())


Standardized Data:
   radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0     1.097064     -2.073335        1.269934   0.984375         1.568466   
1     1.829821     -0.353632        1.685955   1.908708        -0.826962   
2     1.579888      0.456187        1.566503   1.558884         0.942210   
3    -0.768909      0.253732       -0.592687  -0.764464         3.283553   
4     1.750297     -1.151816        1.776573   1.826229         0.280372   

   compactness_mean  concavity_mean  concave_points_mean  symmetry_mean  \
0          3.283515        2.652874             2.532475       2.217515   
1         -0.487072       -0.023846             0.548144       0.001392   
2          1.052926        1.363478             2.037231       0.939685   
3          3.402909        1.915897             1.451707       2.867383   
4          0.539340        1.371011             1.428493      -0.009560   

   fractal_dimension_mean  ...  radius_worst  texture_worst  perimeter_wo

In [None]:
# Count outliers per feature using two rules: |z-score| > 3 and the
# 1.5 * IQR fences.
numeric_features = X[numeric_cols]

# Rule 1: absolute z-score above 3, counted column by column.
z_scores = np.abs(stats.zscore(numeric_features))
outliers_z = (z_scores > 3).sum(axis=0)

# Rule 2: values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
Q1 = numeric_features.quantile(0.25)
Q3 = numeric_features.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers_iqr = (
    numeric_features.lt(lower_fence) | numeric_features.gt(upper_fence)
).sum()

# One row per feature, one column per detection rule.
outlier_summary = pd.DataFrame({
    'Z-score Outliers': outliers_z,
    'IQR Outliers': outliers_iqr
})

print("Outlier Summary:")
print(outlier_summary)


Outlier Summary:
                         Z-score Outliers  IQR Outliers
radius_mean                             5            14
texture_mean                            4             7
perimeter_mean                          7            13
area_mean                               8            25
smoothness_mean                         5             6
compactness_mean                        9            16
concavity_mean                          9            18
concave_points_mean                     6            10
symmetry_mean                           5            15
fractal_dimension_mean                  7            15
radius_se                               7            38
texture_se                              9            20
perimeter_se                            8            38
area_se                                 6            65
smoothness_se                           7            30
compactness_se                         12            28
concavity_se                   

In [None]:
# Define a function to evaluate the model's performance.
# Model Evaluation Function
def evaluate_model(X, y, sampler=None, test_size=0.2, random_state=42):
    """Fit a logistic regression on a train split and report test-split metrics.

    The data is split with ``train_test_split``, a
    ``LogisticRegression(max_iter=1000)`` is fitted on the training part, and
    a classification report is computed on the held-out part.

    Parameters
    ----------
    X : features, in any form accepted by ``train_test_split``.
    y : target labels aligned with ``X``.
    sampler : optional object exposing ``fit_resample(X, y)``.
        When given, it is applied to the training split only, so the test
        split keeps its original, un-resampled rows. ``None`` (the default)
        performs no resampling inside this function.
    test_size : passed to ``train_test_split``; default 0.2.
    random_state : passed to ``train_test_split``; default 42.

    Returns
    -------
    dict
        The result of ``classification_report(..., output_dict=True)``.

    Side effects
    ------------
    Prints the report dictionary to stdout.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if sampler is not None:
        # Resample after the split so no resampled row can reach the test set.
        X_train, y_train = sampler.fit_resample(X_train, y_train)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    report = classification_report(y_test, predictions, output_dict=True)

    # Print the entire classification report for inspection
    print("Classification Report:")
    print(report)

    return report



In [None]:
# Resampling Techniques
#
# Errors are deliberately not caught here: a failure should surface with its
# full traceback instead of being reduced to a one-line message.
#
# NOTE(review): every sampler below is fitted on the full (X, y), and
# evaluate_model() only performs its train/test split afterwards, so resampled
# or synthetic rows can end up in the test split. Treat the scores as
# optimistic until resampling is confined to the training split.

# 1. SMOTE (synthetic minority oversampling)
smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X, y)

# 2. Random undersampling
rus = RandomUnderSampler(random_state=42)
X_resampled_rus, y_resampled_rus = rus.fit_resample(X, y)

# 3. ADASYN
adasyn = ADASYN(random_state=42)
X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(X, y)

# 4. SMOTE combined with Tomek-link cleaning (labelled 'Tomek Links' below)
tomek = SMOTETomek(random_state=42)
X_resampled_tomek, y_resampled_tomek = tomek.fit_resample(X, y)

# Technique label -> (features, target), in reporting order.
datasets = {
    'Original': (X, y),
    'SMOTE': (X_resampled_smote, y_resampled_smote),
    'Random Under': (X_resampled_rus, y_resampled_rus),
    'ADASYN': (X_resampled_adasyn, y_resampled_adasyn),
    'Tomek Links': (X_resampled_tomek, y_resampled_tomek),
}

# Create Resampling Summary: number of rows per class for each technique.
original_distribution = y.value_counts()
class_counts = {
    name: target.value_counts() for name, (_, target) in datasets.items()
}
resampling_summary = pd.DataFrame({
    'Technique': list(class_counts),
    # .get(..., 0) reports 0 if a class is absent instead of raising KeyError.
    'Class 0': [counts.get(0, 0) for counts in class_counts.values()],
    'Class 1': [counts.get(1, 0) for counts in class_counts.values()],
})

print("Resampling Summary:")
print(resampling_summary)

# Evaluation for each resampling technique
results = {
    name: evaluate_model(features, target)
    for name, (features, target) in datasets.items()
}

# Collect the positive-class ('1') metrics, one ROW per technique.
# orient='index' is essential: with the default orientation the techniques
# become the columns, 'f1-score' is never a column, and the comparison below
# is silently skipped.
results_df = pd.DataFrame.from_dict(
    {
        name: report.get('1', {'f1-score': 0})  # Default to 0 if key doesn't exist
        for name, report in results.items()
    },
    orient='index',
)

print("Model Performance Results:")
print(results_df)

# Determine the best resampling technique based on available f1-scores
if 'f1-score' in results_df.columns:
    best_technique = results_df['f1-score'].idxmax()
    print(f"The best resampling technique is: {best_technique}")
else:
    print("No valid f1-scores available for comparison.")


Resampling Summary:
      Technique  Class 0  Class 1
0      Original      357      212
1         SMOTE      357      357
2  Random Under      212      212
3        ADASYN      357      366
4   Tomek Links      355      355
Classification Report:
{'0': {'precision': 0.9722222222222222, 'recall': 0.9859154929577465, 'f1-score': 0.9790209790209791, 'support': 71.0}, '1': {'precision': 0.9761904761904762, 'recall': 0.9534883720930233, 'f1-score': 0.9647058823529412, 'support': 43.0}, 'accuracy': 0.9736842105263158, 'macro avg': {'precision': 0.9742063492063492, 'recall': 0.9697019325253848, 'f1-score': 0.9718634306869601, 'support': 114.0}, 'weighted avg': {'precision': 0.9737190197716513, 'recall': 0.9736842105263158, 'f1-score': 0.973621425014614, 'support': 114.0}}
Classification Report:
{'0': {'precision': 0.9852941176470589, 'recall': 0.9710144927536232, 'f1-score': 0.9781021897810219, 'support': 69.0}, '1': {'precision': 0.9733333333333334, 'recall': 0.9864864864864865, 'f1-score': 

Story Title: Boosting Breast Cancer Diagnosis
Introduction
We took a deep dive into the Breast Cancer Wisconsin dataset to enhance how we diagnose patients.

Data Preparation
Dropped the ID column because it carries no diagnostic information.
Converted the diagnosis labels into binary values: malignant (M) = 1 and benign (B) = 0.
Standardized the numeric features so that they all share a common scale.
Model Evaluation
Tackled class imbalance: used SMOTE, random undersampling, ADASYN, and SMOTE combined with Tomek links to balance the dataset.
Trained a logistic regression model and checked how well it performed with:
Classification reports showing precision, recall, F1-score, and accuracy.
Confusion Matrices for visualizing predictions.
ROC Curves to see how well the model distinguished between classes.
Key Insights from SHAP
Key features: Found that 'radius_mean' and 'perimeter_mean' really affect the diagnosis.
Personalized follow-ups: This helps us tailor follow-ups based on individual risk factors.
Conclusion
In short, our analysis gives us valuable insights to improve breast cancer diagnoses and enhance patient care.