In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

#Import and read dataset
csv_path = '~/Desktop/breast-cancer.csv'
df = pd.read_csv(csv_path)
print(f"Loaded CSV from {csv_path}")

Loaded CSV from ~/Desktop/breast-cancer.csv


In [None]:
#Exploratory analysis
print("First 12 rows:")
print(df.head(12))

print("\nLast 5 rows:")
print(df.tail(5))

print(f"\nDataset shape: {df.shape[0]} rows, {df.shape[1]} columns")

First 12 rows:
          ea diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0     842302         M        17.99         10.38          122.80     1001.0   
1     842517         M        20.57         17.77          132.90     1326.0   
2   84300903         M        19.69         21.25          130.00     1203.0   
3   84348301         M        11.42         20.38           77.58      386.1   
4   84358402         M        20.29         14.34          135.10     1297.0   
5     843786         M        12.45         15.70           82.57      477.1   
6     844359         M        18.25         19.98          119.60     1040.0   
7   84458202         M        13.71         20.83           90.20      577.9   
8     844981         M        13.00         21.82           87.50      519.8   
9   84501001         M        12.46         24.04           83.97      475.9   
10    845636         M        16.02         23.24          102.70      797.8   
11  84610002         M   

In [None]:
#Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
ea                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


In [None]:
#Prepare data for model
X = df.drop(columns=['id', 'diagnosis'], errors='ignore')
Y = df['diagnosis'].map({'B': 0, 'M': 1})

In [None]:
#Split into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

In [None]:
#Train Logistic Regression model
model = LogisticRegression(max_iter=10000)
model.fit(X_train, Y_train)
print("Model training completed.\n")

Model training completed.



In [None]:
#Predictions and evaluation
y_pred = model.predict(X_test)
print("First 10 Actual vs Predicted:")
for actual, pred in zip(Y_test.values[:10], y_pred[:10]):
    print(f"Actual: {actual}, Predicted: {pred}")

First 10 Actual vs Predicted:
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 1, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0


In [12]:
#Confusion Matrix
cm = confusion_matrix(Y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)


Confusion Matrix:
[[87  2]
 [ 5 49]]


In [None]:
#Precision, Recall, F1 Score
print(f"\nPrecision: {precision_score(Y_test, y_pred):.2f}")
print(f"Recall:    {recall_score(Y_test, y_pred):.2f}")
print(f"F1 Score:  {f1_score(Y_test, y_pred):.2f}\n")


Precision: 0.96
Recall:    0.91
F1 Score:  0.93



In [14]:
#Conclusions
learnings = [
    "Logistic Regression is suitable for binary classification tasks.",
    "Data preprocessing and handling of missing values are crucial.",
    "Model evaluation metrics such as precision, recall, and F1 score help assess performance beyond accuracy.",
    "Confusion matrix provides insight into types of classification errors."
]
print("Learnings and Conclusions:")
for point in learnings:
    print(f"- {point}")

Learnings and Conclusions:
- Logistic Regression is suitable for binary classification tasks.
- Data preprocessing and handling of missing values are crucial.
- Model evaluation metrics such as precision, recall, and F1 score help assess performance beyond accuracy.
- Confusion matrix provides insight into types of classification errors.


In [15]:
#Quick sanity test on a small subset
print("\n11. Quick sanity test on first 5 test samples:")
sample_X = X_test.iloc[:5]
sample_y = Y_test.iloc[:5].values
sample_pred = model.predict(sample_X)
print(f"Actual labels   : {sample_y}")
print(f"Predicted labels: {sample_pred}")


11. Quick sanity test on first 5 test samples:
Actual labels   : [0 1 1 0 0]
Predicted labels: [0 1 1 0 0]
