### Libraies used

In [1]:
#libraries used 

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score



# Data Preporcessing Steps 

Adding data file

In [2]:
# Load the dataset
df = pd.read_csv("risk_factors_cervical_cancer.csv")  
# Reads CSV file into a pandas DataFrame called df 

# Display value counts for the target column 'Biopsy'
class_counts = df['Biopsy'].value_counts()  
# This counts how many times each value (0 or 1) appears in the 'Biopsy' column
# know how many "no cancer" and "cancer" cases 

class_percentages = df['Biopsy'].value_counts(normalize=True) * 100  
# This does the same thing but in percentages

print("Class Distribution (Count):")
print(class_counts)  
# This prints the actual count 

print("\nClass Distribution (Percentage):")
print(class_percentages)  
# This prints how much % of data is positive vs. negative class


Class Distribution (Count):
Biopsy
0    803
1     55
Name: count, dtype: int64

Class Distribution (Percentage):
Biopsy
0    93.589744
1     6.410256
Name: proportion, dtype: float64


To check the number of null values

In [3]:
df.replace('?', np.nan, inplace=True) 
df.isnull().sum()

Age                                     0
Number of sexual partners              26
First sexual intercourse                7
Num of pregnancies                     56
Smokes                                 13
Smokes (years)                         13
Smokes (packs/year)                    13
Hormonal Contraceptives               108
Hormonal Contraceptives (years)       108
IUD                                   117
IUD (years)                           117
STDs                                  105
STDs (number)                         105
STDs:condylomatosis                   105
STDs:cervical condylomatosis          105
STDs:vaginal condylomatosis           105
STDs:vulvo-perineal condylomatosis    105
STDs:syphilis                         105
STDs:pelvic inflammatory disease      105
STDs:genital herpes                   105
STDs:molluscum contagiosum            105
STDs:AIDS                             105
STDs:HIV                              105
STDs:Hepatitis B                  

The Features in the dataset

In [4]:
num_cols = [
    'Age', 'Number of sexual partners', 'First sexual intercourse', 
    'Num of pregnancies', 'Smokes (years)', 'Smokes (packs/year)',
    'Hormonal Contraceptives (years)', 'IUD (years)', 
    'STDs (number)', 'STDs: Number of diagnosis'
]

binary_cols = [
    'Smokes', 'Hormonal Contraceptives', 'IUD', 'STDs', 'STDs:condylomatosis',
    'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis', 'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
    'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV', 'STDs:Hepatitis B',
    'STDs:HPV', 'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 
    'Hinselmann', 'Schiller', 'Citology', 'Biopsy'
]

Checking outliers to decidide if i need to use mean or median for imputation in the numerical features

In [5]:
for col in num_cols:
    # Convert columns to numeric in the copy, coercing errors (NaN)
    df[col] = pd.to_numeric(df[col], errors='coerce')

    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
    print(f'{col}: {len(outliers)} outliers')

print(f'Shape of original df: {df.shape}')
print(f'Shape of cleaned df: {df.shape}')


Age: 8 outliers
Number of sexual partners: 68 outliers
First sexual intercourse: 41 outliers
Num of pregnancies: 10 outliers
Smokes (years): 123 outliers
Smokes (packs/year): 123 outliers
Hormonal Contraceptives (years): 68 outliers
IUD (years): 83 outliers
STDs (number): 79 outliers
STDs: Number of diagnosis: 71 outliers
Shape of original df: (858, 36)
Shape of cleaned df: (858, 36)


From the above code, there are outliers so I will use median to fill the missing values in the numerical fearures.

In [6]:
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # converts '?' to NaN
    df[col] = df[col].fillna(df[col].median()) # fill with median


Using mode to fill the catagorial values

In [7]:
for col in binary_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # convert '?' to NaN
    mode_val = df[col].mode()[0]  # get the first mode value (0 or 1)
    df[col] = df[col].fillna(mode_val)  # fill NaNs with mode



I found that 2 of the columans have lots of missing values. Checking the parcentage here to decide if i should drop or not.

STDs: Time since first diagnosis      787
STDs: Time since last diagnosis       787 (over like 90% missing. Dropping since it will mess with the data ).


In [8]:
df.drop(columns=['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis'], inplace=True)


Checking nulls again

In [9]:
df.isnull().sum()


Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0


Checking duplicated rows

In [10]:
dupes = df.duplicated().sum()
print(f"Found {dupes} duplicate rows!")

Found 28 duplicate rows!


In [11]:
df.drop_duplicates(inplace=True)
#droping the duplicated rows

In [12]:
dupes = df.duplicated().sum()
print(f"Found {dupes} duplicate rows!")

Found 0 duplicate rows!


Making sure the features have the correct data types. (not necessary in the decision tree but just for more cleaning)

In [13]:
print(df.dtypes)


Age                                     int64
Number of sexual partners             float64
First sexual intercourse              float64
Num of pregnancies                    float64
Smokes                                float64
Smokes (years)                        float64
Smokes (packs/year)                   float64
Hormonal Contraceptives               float64
Hormonal Contraceptives (years)       float64
IUD                                   float64
IUD (years)                           float64
STDs                                  float64
STDs (number)                         float64
STDs:condylomatosis                   float64
STDs:cervical condylomatosis          float64
STDs:vaginal condylomatosis           float64
STDs:vulvo-perineal condylomatosis    float64
STDs:syphilis                         float64
STDs:pelvic inflammatory disease      float64
STDs:genital herpes                   float64
STDs:molluscum contagiosum            float64
STDs:AIDS                         

In [14]:
df = df.astype(float)


In [15]:
print(df.dtypes)
#checking again

Age                                   float64
Number of sexual partners             float64
First sexual intercourse              float64
Num of pregnancies                    float64
Smokes                                float64
Smokes (years)                        float64
Smokes (packs/year)                   float64
Hormonal Contraceptives               float64
Hormonal Contraceptives (years)       float64
IUD                                   float64
IUD (years)                           float64
STDs                                  float64
STDs (number)                         float64
STDs:condylomatosis                   float64
STDs:cervical condylomatosis          float64
STDs:vaginal condylomatosis           float64
STDs:vulvo-perineal condylomatosis    float64
STDs:syphilis                         float64
STDs:pelvic inflammatory disease      float64
STDs:genital herpes                   float64
STDs:molluscum contagiosum            float64
STDs:AIDS                         

### Feature Selection

In [16]:
# Combine both lists into one
all_feature_cols = num_cols + binary_cols

# Select only those columns from full DataFrame
X = df[all_feature_cols].drop('Biopsy', axis=1)
y = df['Biopsy'] # target (indicates positive cervical biopsy (actual presence of cancer)

Since these columans gave UserWarning: Features [15 22] are constant, I am dropping them using variance treshold.

In [17]:
from sklearn.feature_selection import VarianceThreshold

# This removes features with zero variance
constant_filter = VarianceThreshold(threshold=0.0)
constant_filter.fit(X)
constant_cols = [column for column in X.columns if column not in X.columns[constant_filter.get_support()]]

print("Zero-variance features to drop:", constant_cols)
X.drop(columns=constant_cols, inplace=True)

Zero-variance features to drop: ['STDs:cervical condylomatosis', 'STDs:AIDS']


In [18]:
from sklearn.feature_selection import SelectKBest, f_classif

# Initialize SelectKBest
selector = SelectKBest(score_func=f_classif, k=10)

# Fit and transform
X_selected = selector.fit_transform(X, y)

# Get selected column names
mask = selector.get_support()
selected_features = X.columns[mask]

print("Top 10 Selected Features:")
print(selected_features)

Top 10 Selected Features:
Index(['STDs', 'STDs:genital herpes', 'STDs:HIV', 'Dx:Cancer', 'Dx:CIN',
       'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller', 'Citology'],
      dtype='object')


# Training the decision tree model 

- Targeting column
- Split with 8:2 ratio

In [19]:
# Right after feature selection:
X = df[selected_features]
y = df['Biopsy']

# Then do the train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

Using SMOTE (Synthetic Minority Oversampling Technique) is used to fix class imbalance in this set. 94% to 6% imbalance.. ONLY IN THE TRAINING

In [20]:
# Initialize SMOTE with random_state=42 for reproducible results
sm = SMOTE(random_state=42)

# Resample the training data to fix class imbalance
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

Begin training for the decision tree


In [21]:
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)

Testing the model 


In [22]:
y_pred = model.predict(X_test)

# Model Evaluation

In [23]:
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       155
         1.0       0.73      0.73      0.73        11

    accuracy                           0.96       166
   macro avg       0.85      0.85      0.85       166
weighted avg       0.96      0.96      0.96       166

Confusion Matrix:
 [[152   3]
 [  3   8]]


The model struggles with the minority (positive) class, missing too many positive cases and generating too many false alarms. This is critical in medical settings, where both missing cases and false alarms carry serious risks. Improving class balance or trying other techniques could help.

### Trying Threshold Tuning

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

param_grid = {
    'max_depth': [5, 7, 9],
    'min_samples_split': [2, 4, 6]
}

clf = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(clf, param_grid, scoring='f1', cv=5)
grid_search.fit(X_train, y_train)  # <-- Run this so grid_search exists!


# Get predicted probabilities for the positive class (1) from the best model after grid search
y_probs = grid_search.best_estimator_.predict_proba(X_test)[:, 1]

# Define a range of thresholds from 0.1 to 0.9 in steps of 0.05 to test
thresholds = np.arange(0.1, 0.91, 0.05)

# Initialize variables to track the best F1 score and the corresponding threshold
best_f1 = 0
best_threshold = 0.5

print("Trying different thresholds to find the best F1 score...")

# Loop over each threshold to calculate precision, recall, and F1 score
for thresh in thresholds:
    
    # Predict positive if probability >= current threshold, else negative
    y_pred_thresh = (y_probs >= thresh).astype(int)
    
    # Calculate precision, recall, and F1 score for predictions at this threshold
    precision = precision_score(y_test, y_pred_thresh)
    recall = recall_score(y_test, y_pred_thresh)
    f1 = f1_score(y_test, y_pred_thresh)
    
    print(f"Threshold: {thresh:.2f} | Precision: {precision:.2f} | Recall: {recall:.2f} | F1 Score: {f1:.2f}")
    
    # Update best F1 score and threshold if current F1 is better
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = thresh

print(f"\n Best threshold is {best_threshold:.2f} with F1 score: {best_f1:.2f} ")

# Use the best threshold to make final binary predictions on test data
y_pred_best = (y_probs >= best_threshold).astype(int)


Trying different thresholds to find the best F1 score...
Threshold: 0.10 | Precision: 0.75 | Recall: 0.82 | F1 Score: 0.78
Threshold: 0.15 | Precision: 0.75 | Recall: 0.82 | F1 Score: 0.78
Threshold: 0.20 | Precision: 0.75 | Recall: 0.82 | F1 Score: 0.78
Threshold: 0.25 | Precision: 0.75 | Recall: 0.82 | F1 Score: 0.78
Threshold: 0.30 | Precision: 0.82 | Recall: 0.82 | F1 Score: 0.82
Threshold: 0.35 | Precision: 0.82 | Recall: 0.82 | F1 Score: 0.82
Threshold: 0.40 | Precision: 0.82 | Recall: 0.82 | F1 Score: 0.82
Threshold: 0.45 | Precision: 0.88 | Recall: 0.64 | F1 Score: 0.74
Threshold: 0.50 | Precision: 0.83 | Recall: 0.45 | F1 Score: 0.59
Threshold: 0.55 | Precision: 0.83 | Recall: 0.45 | F1 Score: 0.59
Threshold: 0.60 | Precision: 0.83 | Recall: 0.45 | F1 Score: 0.59
Threshold: 0.65 | Precision: 0.80 | Recall: 0.36 | F1 Score: 0.50
Threshold: 0.70 | Precision: 1.00 | Recall: 0.18 | F1 Score: 0.31
Threshold: 0.75 | Precision: 1.00 | Recall: 0.18 | F1 Score: 0.31
Threshold: 0.80 | P

In [25]:
# Get the predicted probabilities for the positive class (1) from best tuned model
y_probs = grid_search.best_estimator_.predict_proba(X_test)[:, 1]

# Use the best threshold found earlier to convert probabilities into binary predictions
y_pred_best = (y_probs >= best_threshold).astype(int)

print(classification_report(y_test, y_pred_best))
print(confusion_matrix(y_test, y_pred_best))


              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       155
         1.0       0.82      0.82      0.82        11

    accuracy                           0.98       166
   macro avg       0.90      0.90      0.90       166
weighted avg       0.98      0.98      0.98       166

[[153   2]
 [  2   9]]


The precision, recall and f1-score for 1 went up.

# **K-Fold Cross Validation**

In [26]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np

pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('clf', DecisionTreeClassifier(random_state=42))
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

f1_scores = cross_val_score(pipeline, X_selected, y, cv=skf, scoring='f1')
print("Decision Tree F1 Scores with SMOTE:", f1_scores)
print("Mean F1 Score with SMOTE: {:.2f}".format(np.mean(f1_scores)))


Decision Tree F1 Scores with SMOTE: [0.76190476 0.5625     0.63636364 0.72727273 0.74074074]
Mean F1 Score with SMOTE: 0.69


In [27]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid (note the double underscores __ for pipeline steps)
param_grid = {
    'clf__max_depth': [6, 8, 10, 12, 15],
    'clf__min_samples_split': [2, 5, 10, 15],
    'clf__min_samples_leaf': [1, 2, 4, 6]  # new hyperparam to prevent overfitting
}


# Create GridSearchCV with pipeline, using f1 scoring and Stratified K-Fold CV
grid_search = GridSearchCV(pipeline, param_grid, scoring='f1', cv=skf)

# Fit grid search on full dataset features and labels
grid_search.fit(X_selected, y)

# Print best hyperparameters and best F1 score found during grid search
print("Best Params:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)


Best Params: {'clf__max_depth': 6, 'clf__min_samples_leaf': 6, 'clf__min_samples_split': 2}
Best F1 Score: 0.7178095238095239


- **First cell:**  
  build the pipeline with SMOTE and Decision Tree, then use cross-validation to check the model’s F1 scores.  
  This gives a quick idea of how well the model performs with balanced data.

- **Second cell:**  
  run a Grid Search on the same pipeline to **tune the hyperparameters** (like max depth, min samples split, etc.) to improve the model’s F1 score.  
  This takes longer but helps find the best model settings.

The best parameters found for the Decision Tree are:

Maximum depth of the tree: 6

Minimum samples required at a leaf node: 6

Minimum samples needed to split a node: 2 

With these settings, the model achieved the best F1 score of about 0.72. This means the model is pretty good at balancing precision and recall, making reliable predictions without too many mistakes.


In [28]:
import joblib
import os

# Create folders if they do not exist
os.makedirs("../backend/model_store/dt", exist_ok=True)

# Save best tuned Decision Tree model
joblib.dump(grid_search.best_estimator_,
            "../backend/model_store/dt/model.pkl")

# Save feature order
joblib.dump(list(selected_features),
            "../backend/model_store/dt/features.pkl")

print("Decision Tree model artifacts saved successfully.")


Decision Tree model artifacts saved successfully.
