In [4]:
import pandas as pd

# Column layout of the raw file (per the UCI documentation): an id, the
# diagnosis label, then ten measurements each reported as mean, se and worst.
columns = ["id", "diagnosis"] + [
    f"{feature}_{stat}"
    for stat in ("mean", "se", "worst")
    for feature in (
        "radius", "texture", "perimeter", "area", "smoothness",
        "compactness", "concavity", "concave_points", "symmetry",
        "fractal_dimension",
    )
]

# Parse the raw file; header=None means its first line is read as data,
# so the column names are supplied explicitly.
df = pd.read_csv(
    "wdbc.data.txt",
    names=columns,
    header=None,
)

# Write a copy with a header row and without the index column.
df.to_csv("data.csv", index=False)

# Preview the first rows as the cell output.
df.head()



Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
# Dimensions of the loaded frame as (rows, columns).
(len(df.index), len(df.columns))

(569, 32)

In [8]:
import argparse
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)




In [10]:
# Reload the saved CSV and drop the identifier column. drop(columns=...)
# returns a new frame, so nothing is mutated in place.
df = pd.read_csv("data.csv").drop(columns="id")

# Encode the target: "M" -> 1, "B" -> 0.
# Series.map yields NaN for any value missing from the dict, which would
# otherwise flow silently into y; fail loudly here instead.
diagnosis_encoded = df["diagnosis"].map({"M": 1, "B": 0})
unmapped = diagnosis_encoded.isna()
if unmapped.any():
    bad_labels = sorted(df.loc[unmapped, "diagnosis"].astype(str).unique())
    raise ValueError(f"Unexpected diagnosis labels in data.csv: {bad_labels}")
df["diagnosis"] = diagnosis_encoded

# Feature matrix is every remaining column; target is the encoded label.
X = df.drop(columns="diagnosis")
y = df["diagnosis"]

df.head()


Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [11]:
# Hold out 20% of the rows as a test set. The split is stratified on y and
# uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [12]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [13]:
# Fit logistic regression on the scaled training features; fit() returns the
# estimator itself, so construction and fitting can be chained.
log_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Show the fitted estimator as the cell output.
log_model


In [14]:
# Predict labels for the scaled training and test splits, in that order.
train_pred_log, test_pred_log = (
    log_model.predict(features)
    for features in (X_train_scaled, X_test_scaled)
)


In [15]:
# Error rate on each split, computed as 1 - accuracy.
train_error_log, test_error_log = (
    1 - accuracy_score(y_true, y_pred)
    for y_true, y_pred in ((y_train, train_pred_log), (y_test, test_pred_log))
)

print("Logistic Regression")
for caption, value in (
    ("Train Error:", train_error_log),
    ("Test Error :", test_error_log),
    ("Generalization Gap:", test_error_log - train_error_log),
):
    print(caption, value)

# Test-set metrics, each called as scorer(y_test, test_pred_log).
for caption, scorer in (
    ("\nAccuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
):
    print(caption, scorer(y_test, test_pred_log))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, test_pred_log))


Logistic Regression
Train Error: 0.01318681318681314
Test Error : 0.03508771929824561
Generalization Gap: 0.021900906111432472

Accuracy : 0.9649122807017544
Precision: 0.975
Recall   : 0.9285714285714286
F1-score : 0.9512195121951219

Confusion Matrix:
[[71  1]
 [ 3 39]]


In [16]:
# Fit a decision tree on the unscaled training features; random_state is
# fixed so the fit is repeatable.
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Show the fitted estimator as the cell output.
tree_model



In [17]:
# Predict labels for the unscaled training and test splits, in that order.
train_pred_tree, test_pred_tree = (
    tree_model.predict(features)
    for features in (X_train, X_test)
)


In [18]:
# Error rate on each split, computed as 1 - accuracy.
train_error_tree, test_error_tree = (
    1 - accuracy_score(y_true, y_pred)
    for y_true, y_pred in ((y_train, train_pred_tree), (y_test, test_pred_tree))
)

print("\nDecision Tree")
for caption, value in (
    ("Train Error:", train_error_tree),
    ("Test Error :", test_error_tree),
    ("Generalization Gap:", test_error_tree - train_error_tree),
):
    print(caption, value)

# Test-set metrics, each called as scorer(y_test, test_pred_tree).
for caption, scorer in (
    ("\nAccuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
):
    print(caption, scorer(y_test, test_pred_tree))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, test_pred_tree))



Decision Tree
Train Error: 0.0
Test Error : 0.07017543859649122
Generalization Gap: 0.07017543859649122

Accuracy : 0.9298245614035088
Precision: 0.9047619047619048
Recall   : 0.9047619047619048
F1-score : 0.9047619047619048

Confusion Matrix:
[[68  4]
 [ 4 38]]
