In [None]:
import pandas as pd # for data manipulation
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, LabelEncoder # for data preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score 

# Load the dataset from the CSV that sits one directory above the notebook.
df = pd.read_csv('../data.csv')

# First look at the raw frame: leading rows, column dtypes / non-null counts,
# summary statistics, and how many rows fall under each diagnosis label.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.describe())
print(df['diagnosis'].value_counts())

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_worst  smoothness

In [5]:
# Remove the 'id' column before modelling (it is excluded from the feature set).
# errors='ignore' makes the cell safe to re-run: once 'id' is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [6]:
# Drop the 'Unnamed: 32' column, but only when the loaded frame actually has it.
# NOTE(review): presumably an artifact column produced by the CSV export - confirm.
if 'Unnamed: 32' in df:
    df = df.drop('Unnamed: 32', axis=1)

In [7]:
# Sanity check after the column drops: frame dimensions on the first line,
# followed by the per-column count of missing values.
print(df.shape, df.isna().sum(), sep='\n')


(569, 31)
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


In [8]:
# Learn the set of diagnosis labels, then replace the column with integer codes.
# The fitted encoder is kept in `le` so the codes can be mapped back to the
# original labels later (le.classes_ / le.inverse_transform).
le = LabelEncoder().fit(df['diagnosis'])
df['diagnosis'] = le.transform(df['diagnosis'])

In [9]:
# Separate the encoded target column from the predictor columns.
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])

In [10]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [11]:
# Standardize the features.
# The scaler's statistics are learned from the training rows only and then
# applied unchanged to the held-out rows, so nothing from the test set
# influences the preprocessing.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [12]:
# Fit a logistic-regression classifier on the scaled training data.
# max_iter caps the number of optimizer iterations.
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class predictions (0 or 1) for the held-out rows.
y_val_pred_log = log_model.predict(X_test)
# predict_proba returns one column per class; column 1 is the probability
# assigned to the class encoded as 1.
y_val_prob_log = log_model.predict_proba(X_test)[:, 1]
# AUC-ROC is computed from the probabilities, not the hard labels: it measures
# how well the scores rank one class above the other.
auc_log = roc_auc_score(y_test, y_val_prob_log)

print("==== Logistic Regression Validation Metrics ====")
print(classification_report(y_test, y_val_pred_log))
print(f"AUC-ROC: {auc_log:.3f}")

==== Logistic Regression Validation Metrics ====
              precision    recall  f1-score   support

           0       0.96      0.99      0.97        72
           1       0.97      0.93      0.95        42

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

AUC-ROC: 0.996
