In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
import joblib


In [9]:
df = pd.read_csv("risk_factors_cervical_cancer.csv")
df.replace("?", np.nan, inplace=True)
df = df.apply(pd.to_numeric, errors='coerce')  # Convert all to numeric
df.head()



Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0
2,34,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,,,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,,,0,0,0,0,0,0,0,0


In [11]:
df.info()
df.describe()
df.isnull().sum().sort_values(ascending=False)
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Age                                 858 non-null    int64  
 1   Number of sexual partners           832 non-null    float64
 2   First sexual intercourse            851 non-null    float64
 3   Num of pregnancies                  802 non-null    float64
 4   Smokes                              845 non-null    float64
 5   Smokes (years)                      845 non-null    float64
 6   Smokes (packs/year)                 845 non-null    float64
 7   Hormonal Contraceptives             750 non-null    float64
 8   Hormonal Contraceptives (years)     750 non-null    float64
 9   IUD                                 741 non-null    float64
 10  IUD (years)                         741 non-null    float64
 11  STDs                                753 non-n

Index(['Age', 'Number of sexual partners', 'First sexual intercourse',
       'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)',
       'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD',
       'IUD (years)', 'STDs', 'STDs (number)', 'STDs:condylomatosis',
       'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis',
       'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis',
       'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
       'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV',
       'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis',
       'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis',
       'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller',
       'Citology', 'Biopsy'],
      dtype='object')

In [23]:
from sklearn.impute import SimpleImputer

imputer_median = SimpleImputer(strategy='median')
df['Num of pregnancies'] = imputer_median.fit_transform(df[['Num of pregnancies']])


In [24]:
imputer_mode = SimpleImputer(strategy='most_frequent')
df['STDs: Number of diagnosis'] = imputer_mode.fit_transform(df[['STDs: Number of diagnosis']])


In [25]:
imputer_all = SimpleImputer(strategy='mean')  # Or use KNN imputer later if needed
df[df.columns] = imputer_all.fit_transform(df)


In [26]:
#Feature engineering
df['Biopsy'] = df['Biopsy'].astype(int)
std_cols = [col for col in df.columns if 'STD' in col and 'Number' not in col and 'diagnosis' not in col]
df['Total_STDs'] = df[std_cols].sum(axis=1)
X = df.drop(columns=['Biopsy'])
y = df['Biopsy']


In [27]:
#train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


In [28]:
#preprocessing pipeline
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [29]:
#Model training Logistic regression
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
y_prob_lr = lr.predict_proba(X_test_scaled)[:, 1]


In [30]:
#Model training Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)
y_prob_rf = rf.predict_proba(X_test_scaled)[:, 1]


In [31]:
#Custom evaluation function
def custom_fbeta_score(y_true, y_pred, beta=2.5):
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    beta2 = beta ** 2
    return (1 + beta2) * (precision * recall) / (beta2 * precision + recall)
print("Logistic Regression F-beta:", custom_fbeta_score(y_test, y_pred_lr))
print("Random Forest F-beta:", custom_fbeta_score(y_test, y_pred_rf))


Logistic Regression F-beta: 0.7272727272727272
Random Forest F-beta: 0.38795986622073586


In [32]:
joblib.dump(rf, 'model.pkl')            # or 'lr' if better
joblib.dump(scaler, 'pipeline.pkl')     # saves StandardScaler
with open("metrics_log.txt", "w") as f:
    f.write(f"Logistic Regression F-beta: {custom_fbeta_score(y_test, y_pred_lr)}\n")
    f.write(f"Random Forest F-beta: {custom_fbeta_score(y_test, y_pred_rf)}\n")


### 🔍 Model Selection Justification

After evaluating both **Logistic Regression** and **Random Forest** using a **custom F-beta score** with **β = 2.5**, I selected **Logistic Regression** as the final model for deployment.

---

#### 🧮 Why F-beta with β = 2.5?

The F-beta score is defined as:

\[
F_{\beta} = \frac{(1 + \beta^2) \cdot (\text{precision} \cdot \text{recall})}{\beta^2 \cdot \text{precision} + \text{recall}}
\]

With **β = 2.5**, the metric emphasizes **recall** far more than precision. In the context of cervical cancer biopsy prediction, this is critical because:

> ❗ **False Negatives (missed biopsies) are far more dangerous than False Positives.**

---

#### 📊 Evaluation Results:

| Model              | F-beta Score (β = 2.5) |
|--------------------|------------------------|
| Logistic Regression| **0.727**              |
| Random Forest      | 0.388                  |

---

#### ✅ Why Logistic Regression Was Selected

- It achieved the **highest F-beta score**, meaning it strikes the best balance of precision and recall with an emphasis on recall.
- **Logistic Regression** performed better likely due to:
  - Its simplicity and inherent regularization making it **less prone to overfitting**.
  - It handled class imbalance better without extensive tuning.
- **Random Forest**, on the other hand, may have overfit the majority class or required more advanced hyperparameter tuning and resampling strategies (e.g., SMOTE, class weights).

---

#### 💾 Deployment Readiness

The selected **Logistic Regression model** has been:
- **Serialized and saved** as `model.pkl`
- Accompanied by the saved **preprocessing pipeline** as `pipeline.pkl`  
This ensures **reproducibility and consistency** during inference in the production API.

---
