In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
from ucimlrepo import fetch_ucirepo 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Download the dataset registered under id 17 in the UCI ML repository
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Feature table (pandas DataFrame)
X = breast_cancer_wisconsin_diagnostic.data.features

# Target table with Diagnosis encoded as an integer label
# (M = malignant -> 1, B = benign -> 0).
# assign() builds a new DataFrame, so the frame held by the fetched dataset
# object is never written to and no SettingWithCopyWarning can be raised.
y = breast_cancer_wisconsin_diagnostic.data.targets.assign(
    Diagnosis=lambda targets: targets['Diagnosis'].map({'M': 1, 'B': 0})
)

# Features and encoded target side by side in one DataFrame
df = pd.concat([X, y], axis="columns")

print(df.head())

In [None]:
# (rows, columns) of the combined frame
print(df.shape)
# Number of missing values in each column
df.isna().sum()

In [None]:
target_column = 'Diagnosis'

# Separate the label column from the predictor columns
y = df[target_column]
X = df.drop(target_column, axis="columns")

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Training features and labels in one frame, used for exploration on training rows only
train_data = pd.concat([x_train, y_train], axis="columns")

In [None]:

# Correlation of every numeric training column with the target
# (Diagnosis: M = malignant = 1, B = benign = 0), largest value first.
(
    train_data
    .corr(numeric_only=True)
    .loc[:, target_column]
    .sort_values(ascending=False)
)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training
# frame's numeric columns, i.e. the training features together with the target.
train_data.describe()

In [None]:
# Columns kept as model inputs
features = ['concave_points3']

# Restrict both splits to the chosen feature columns
x_train, x_test = (split[features] for split in (x_train, x_test))

In [None]:
# One histogram (with a KDE overlay) per selected feature, drawn from the training rows
for col in features:
    fig, ax = plt.subplots(figsize=(8, 4))  # fresh figure and axes for this feature
    sns.histplot(data=train_data, x=col, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
# show() is called once, after every figure has been created
plt.show()

In [None]:
for feature in features:
    # Two panels side by side: relation to the target (left), spread of the feature (right)
    fig, (scatter_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

    # Left panel: feature value against the target label
    train_data.plot.scatter(x=feature, y=target_column, alpha=0.4, ax=scatter_ax)
    scatter_ax.set_title(f'Scatter: {feature} vs {target_column}')

    # Right panel: box plot of the feature on its own
    train_data[feature].plot.box(ax=box_ax)
    box_ax.set_title(f'Box Plot: {feature}')

    fig.tight_layout()  # keep titles and tick labels from overlapping
    plt.show()

In [None]:
def remove_outliers(df, columns, factor=1.5):
    """
    Remove outliers using the IQR method for specified columns.

    Columns are processed in the order given. The quartiles of each column are
    computed on the rows that survived the filters of the columns before it,
    so with several columns the result can depend on their order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; a filtered frame is returned instead.
    columns : iterable of column labels
        Columns to filter on. An empty iterable returns ``df`` itself.
    factor : float, default 1.5
        How many IQRs beyond Q1 / Q3 the fences are placed. The default 1.5
        is the usual box-plot whisker rule; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows whose value in every listed column lies inside
        ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (bounds inclusive).
        Rows holding NaN in a listed column are dropped as well, because NaN
        never satisfies the range comparison.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    KeyError
        If a label in ``columns`` is not a column of ``df`` (raised by pandas).
    """
    if factor < 0:
        raise ValueError(f"factor must be non-negative, got {factor!r}")

    for col in columns:
        # Both quartiles in one call; unpacking a Series yields its values in order
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1  # Interquartile range

        # Fences outside of which a value counts as an outlier
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        # Boolean-mask indexing returns a new frame; `df` is only rebound
        # locally, so the caller's DataFrame stays untouched.
        df = df[df[col].between(lower_bound, upper_bound)]

    return df


# Drop IQR outliers in the selected feature columns, then report the
# frame shape before and after the filter (one shape per line).
train_data_clean = remove_outliers(train_data, features)
print(train_data.shape, train_data_clean.shape, sep="\n")

In [None]:
# Rebuild the training inputs and labels from the outlier-filtered frame,
# as NumPy arrays, so both keep exactly the same rows.
x_train = train_data_clean.loc[:, features].to_numpy()
y_train = train_data_clean.loc[:, target_column].to_numpy()
print(x_train.shape, y_train.shape, sep="\n")

In [None]:
# Learn the scaling parameters from the training inputs, then apply them
scaler = StandardScaler().fit(x_train)
x_norm = scaler.transform(x_train)

# Per-column peak-to-peak (max - min) range before and after scaling
raw_range = np.ptp(x_train, axis=0)
norm_range = np.ptp(x_norm, axis=0)
print(f"Peak to Peak range by column in Raw        X:{raw_range}")
print(f"Peak to Peak range by column in Normalized X:{norm_range}")

# From here on x_train holds the scaled values (kept as an independent copy of x_norm)
x_train = x_norm.copy()

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the
# training inputs and labels currently bound to x_train / y_train.
model = LogisticRegression()
model.fit(X=x_train, y=y_train)