# 🧼 Robust Preprocessing: Dirty Level 3 Breast Cancer Dataset

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MinMaxScaler


In [2]:
# Load the Level 3 dirty dataset
df = pd.read_csv("../data/breast_cancer_synthetic_3k_dirty_L3.csv", dtype=str)  # Read everything as string
print(f"Shape before cleanup: {df.shape}")
df.head()


Shape before cleanup: (3307, 31)


Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
0,0.4703468978929246,0.7255915997887855,0.3837778172086139,0.541612166732899,0.5730069540665756,0.4935486409535736,0.1643760078654825,0.5633432592324995,0.4337572464707586,0.4765450595262173,...,0.6501471206420156,0.5760290745439467,0.3222762552745122,0.4021919150795119,0.5140546005637269,0.4536155509833091,0.4819116800890861,0.5531158781694312,0.5867705668567242,Benign
1,0.6695648977072878,0.4392072605102756,0.3941786668369811,0.608503722848196,0.4943306799776134,0.7006390068270365,0.4695413344302305,0.4428759460793881,0.522651708882903,0.516114411005247,...,0.439941132808777,0.6864782755860055,0.7242802788351862,0.6259257472129063,0.4268218439918287,0.3953129674602478,0.4096966872307536,0.1957696391002293,0.6548813808322008,Benign
2,0.6631911952713736,0.6628266617107644,0.638128982705975,0.4618775333556016,0.6461110988342409,0.45927682462002,0.3414932994859551,,0.4372073989197934,0.4112406527813312,...,0.5027334481672825,0.4901995764267468,0.5822552837757374,0.6739127297140863,0.505416391762036,0.435800714092224,0.407686704831068,0.5351414940890212,0.482041957542043,Benign
3,0.6930959779574402,0.0,1.9844990940123293,0.0,0.4206284099398661,1.8768496723636992,,,0.0,0.0,...,2.102877167826429,2.3991784131995124,2.653501173354345,-0.5864482321616343,-0.4249288522599763,1.9191131354445647,0.0,0.0,0.0,Malignant
4,,0.0,1.809615440638546,0.0,0.5684695382911445,2.168267956912806,-0.4960366346200739,-0.5188244235107756,0.0,0.0,...,1.1679963396612454,1.873785626672625,2.872787523696101,-0.7937536161373842,-0.6142879632492773,1.637869770855248,0.0,0.0,0.0,Malignant


In [3]:
# Normalize target labels
label_map = {
    "1": 1, "Malignant": 1, "M": 1,
    "0": 0, "Zero": 0, "Benign": 0
}
df['diagnosis'] = df['diagnosis'].str.strip().map(label_map)


In [4]:
# Remove weird formats and convert to numeric
for col in df.columns:
    if col == "diagnosis":
        continue
    df[col] = df[col].str.replace(',', '.', regex=False)
    df[col] = df[col].str.replace('--', '', regex=False)
    df[col] = df[col].str.strip()
    df[col] = pd.to_numeric(df[col], errors='coerce')


In [5]:
# Drop duplicates
df = df.drop_duplicates()


In [6]:
# Impute missing values with median
df = df.fillna(df.median(numeric_only=True))


In [7]:
# Final check
print("Cleaned dataset preview:")
print(df.isnull().sum())
print(df['diagnosis'].value_counts())
df.head()


Cleaned dataset preview:
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
diagnosis                  0
dtype: int64
diagnosis
0    1817
1    1468
Name: count, dtype: int64


Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
0,0.470347,0.725592,0.383778,0.541612,0.573007,0.493549,0.164376,0.563343,0.433757,0.476545,...,0.650147,0.576029,0.322276,0.402192,0.514055,0.453616,0.481912,0.553116,0.586771,0
1,0.669565,0.439207,0.394179,0.608504,0.494331,0.700639,0.469541,0.442876,0.522652,0.516114,...,0.439941,0.686478,0.72428,0.625926,0.426822,0.395313,0.409697,0.19577,0.654881,0
2,0.663191,0.662827,0.638129,0.461878,0.646111,0.459277,0.341493,0.488127,0.437207,0.411241,...,0.502733,0.4902,0.582255,0.673913,0.505416,0.435801,0.407687,0.535141,0.482042,0
3,0.693096,0.0,1.984499,0.0,0.420628,1.87685,0.463585,0.488127,0.0,0.0,...,2.102877,2.399178,2.653501,-0.586448,-0.424929,1.919113,0.0,0.0,0.0,1
4,0.617036,0.0,1.809615,0.0,0.56847,2.168268,-0.496037,-0.518824,0.0,0.0,...,1.167996,1.873786,2.872788,-0.793754,-0.614288,1.63787,0.0,0.0,0.0,1


In [8]:
# Separate features and target
X = df.drop(columns=['diagnosis'])
y = df['diagnosis'].astype(int)

# Scale features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Combine
df_cleaned = pd.DataFrame(X_scaled, columns=X.columns)
df_cleaned['diagnosis'] = y.values

# Save
df_cleaned.to_csv("../data/breast_cancer_synthetic_3k_cleaned_from_L3.csv", index=False)
print("✅ Saved cleaned version as 'breast_cancer_synthetic_3k_cleaned_from_L3.csv'")


✅ Saved cleaned version as 'breast_cancer_synthetic_3k_cleaned_from_L3.csv'
