Importing libraries

In [None]:
# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Display settings: whitegrid seaborn theme plus a default 10x6 inch figure size.
# Entries passed via `rc` are applied to matplotlib's rcParams after the theme is set.
sns.set_theme(style="whitegrid", rc={"figure.figsize": (10, 6)})


In [None]:
# Setup: make the project root importable so modules under src/ can be resolved
import os
import sys

# The project root is taken to be the directory one level above the current
# working directory (os.pardir is the portable spelling of '..').
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Register the path only when it is absent, so re-running this cell
# never adds a duplicate entry to sys.path.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Import custom modules
from src.load_data import load_breast_cancer_data


Loading the dataset

In [None]:
# Load data through the project's loader (imported from src.load_data).
# The result is used as a pandas DataFrame by every cell that follows.
df = load_breast_cancer_data()

# Preview the first rows; as the cell's last expression it is rendered as output.
df.head()


Dimensions and data types

In [None]:
# Dimensions: unpack (rows, columns) and report them as a tuple
n_rows, n_cols = df.shape
print(f"Dataset shape: ({n_rows}, {n_cols})")

# Per-column dtype and non-null count summary
df.info()


Descriptive statistics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max).
# With default arguments, describe() summarizes only the numeric columns
# of a mixed-type frame, so a non-numeric Diagnosis column would be left out.
df.describe()


Check for missing values

In [None]:
# Count missing entries per column (a column of all zeros means no gaps)
df.isna().sum()


In [None]:
# Visualize class distribution (Benign vs Malignant).
# seaborn/matplotlib are imported and themed once in the notebook's first cell,
# so they are not re-imported or re-configured here.
fig, ax = plt.subplots(figsize=(6, 4))

# Passing `palette` without `hue` is deprecated in recent seaborn releases,
# so the x variable is also assigned to `hue`. `dodge=False` keeps a single
# full-width bar per class instead of reserving a slot for every hue level.
sns.countplot(
    data=df,
    x="Diagnosis",
    hue="Diagnosis",
    palette="pastel",
    dodge=False,
    ax=ax,
)

# A hue legend would only repeat the x tick labels; drop it when one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title("Class Distribution (Benign vs Malignant)")
ax.set_xlabel("Diagnosis")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Calculate and display class proportions
class_counts = df["Diagnosis"].value_counts()
class_percentages = df["Diagnosis"].value_counts(normalize=True) * 100

# Use f-strings rather than multiple print() arguments: print() inserts its
# separator (a space) after the "\n", which would indent the first line of
# each Series and misalign it with the rows below.
print(f"Class Counts:\n{class_counts}")
print(f"\nClass Percentages:\n{class_percentages.round(2)}")


Feature Distribution by Diagnosis

In [None]:
# Feature distributions by diagnosis.
# seaborn/matplotlib are imported and themed once in the notebook's first cell,
# so they are not re-imported or re-configured here.

# Define relevant features to inspect
features_to_plot = [
    "Radius1",       # radius_mean
    "Texture1",      # texture_mean
    "Area1",         # area_mean
    "Concavity1",    # concavity_mean
    "Compactness1"   # compactness_mean
]

# Pin each class to a fixed colour. A plain list of colours is assigned in the
# order the hue levels are encountered, so the colours could silently swap if
# the row order changes. Assumes Diagnosis holds the labels "M" / "B" (the same
# labels the target-encoding cell maps); seaborn raises if a level has no entry.
diagnosis_palette = {"M": "#FF6F61", "B": "#6BAED6"}

# Plot each feature as a KDE by diagnosis
for feature in features_to_plot:
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.kdeplot(
        data=df,
        x=feature,
        hue="Diagnosis",
        fill=True,
        common_norm=False,  # normalise each class separately so shapes are comparable
        palette=diagnosis_palette,
        ax=ax,
    )
    ax.set_title(f"Distribution of {feature} by Diagnosis")
    ax.set_xlabel(feature)
    ax.set_ylabel("Density")
    fig.tight_layout()
    plt.show()
    # Release the figure so figures do not accumulate across loop iterations
    plt.close(fig)


We examined the distribution of selected features (e.g., radius, texture, concavity) separated by diagnosis class (Malignant vs Benign). For malignant tumors, the feature values tend to have a wider spread and are generally shifted toward higher values compared to benign tumors. This indicates that malignant cells often exhibit more extreme morphological characteristics. The separation between classes is visually noticeable in several features, which could be useful for model discrimination.


Correlation Matrix

In [None]:
# Correlation matrix and heatmap.
# seaborn/matplotlib are imported once in the notebook's first cell,
# so they are not re-imported here.

# Pairwise correlations between the numeric columns (non-numeric ones are skipped)
correlation_matrix = df.corr(numeric_only=True)

# Large canvas so every feature label stays legible
fig, ax = plt.subplots(figsize=(18, 16))

# Create the heatmap. vmin/vmax anchor the diverging colormap to the full
# correlation range so its neutral midpoint corresponds to r = 0; without
# them the colour scale is stretched over whatever range the data happens
# to cover and the neutral colour no longer means "uncorrelated".
sns.heatmap(
    correlation_matrix,
    annot=False,        # change to True if you want to show values
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    ax=ax,
)

ax.set_title("Correlation Matrix of Breast Cancer Features", fontsize=16)
ax.tick_params(axis="x", labelrotation=90)
ax.tick_params(axis="y", labelrotation=0)
fig.tight_layout()
plt.show()


A full feature correlation matrix reveals strong linear relationships between several groups of variables. For example, `radius`, `perimeter`, and `area` (across all three measurements) show high positive correlations, suggesting potential multicollinearity. This is expected, as these features are geometrically related. Identifying such patterns helps in feature selection and dimensionality reduction if needed.


Feature correlation with Diagnosis

In [None]:
# Correlation of each feature with the diagnosis target.
# pandas/seaborn/matplotlib are imported once in the notebook's first cell,
# so they are not re-imported here.

# Copy DataFrame and encode Diagnosis (Malignant=1, Benign=0)
df_encoded = df.copy()
df_encoded["Diagnosis"] = df_encoded["Diagnosis"].map({"M": 1, "B": 0})

# .map() turns any label missing from the dict into NaN without complaint,
# which would quietly drop those rows from the correlations below.
assert df_encoded["Diagnosis"].notna().all(), (
    "Unexpected Diagnosis label(s): expected only 'M' or 'B'"
)

# Compute correlation matrix (now including the encoded target)
correlation_matrix = df_encoded.corr(numeric_only=True)

# Get correlations with 'Diagnosis' and sort them, most positive first
corr_with_target = correlation_matrix["Diagnosis"].drop("Diagnosis").sort_values(ascending=False)

# Plot every feature's correlation with the target, in sorted order
fig, ax = plt.subplots(figsize=(10, 6))

# Passing `palette` without `hue` is deprecated in recent seaborn releases,
# so the feature names are also assigned to `hue`. `dodge=False` keeps one
# full-width bar per feature.
sns.barplot(
    x=corr_with_target.values,
    y=corr_with_target.index,
    hue=corr_with_target.index,
    palette="coolwarm",
    dodge=False,
    ax=ax,
)

# A hue legend would only repeat the y tick labels; drop it when one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title("Feature Correlation with Diagnosis (Malignant = 1, Benign = 0)")
ax.set_xlabel("Correlation Coefficient")
ax.set_ylabel("Features")
fig.tight_layout()
plt.show()


The correlation of individual features with the diagnosis target (Malignant = 1, Benign = 0) highlights key predictors. Features like `concave points`, `perimeter`, and `radius` (especially their "worst" values) exhibit strong positive correlations (r > 0.7) with malignancy. Conversely, features like `smoothness` and `fractal dimension` show weak or negative correlations. These insights support prioritizing highly correlated features in early model iterations or feature importance analysis.
