In [2]:
import pandas as pd
import numpy as np

# Data Preporessing steps

In [3]:
# Load the dataset
df = pd.read_csv("risk_factors_cervical_cancer.csv")  
# Reads CSV file into a pandas DataFrame called df 


### To check the imbalance of the data.


In [4]:
# Display value counts for the target column 'Biopsy'
class_counts = df['Biopsy'].value_counts()  
# This counts how many times each value (0 or 1) appears in the 'Biopsy' column
# know how many "no cancer" and "cancer" cases 

class_percentages = df['Biopsy'].value_counts(normalize=True) * 100  
# This does the same thing but in percentages
# `normalize=True` means “don’t just count—show what fraction each class is”
# Multiplying by 100 turns it into pretty percentage numbers 

print("Class Distribution (Count):")
print(class_counts)  
# This prints the actual count 

print("\nClass Distribution (Percentage):")
print(class_percentages)  
# This prints how much % of data is positive vs. negative class

Class Distribution (Count):
Biopsy
0    803
1     55
Name: count, dtype: int64

Class Distribution (Percentage):
Biopsy
0    93.589744
1     6.410256
Name: proportion, dtype: float64


### Data cleaning Step 1. Check the null, if yes replace with np.nan.

In [5]:
df.replace('?', np.nan, inplace=True)

df.isnull().sum()


Age                                     0
Number of sexual partners              26
First sexual intercourse                7
Num of pregnancies                     56
Smokes                                 13
Smokes (years)                         13
Smokes (packs/year)                    13
Hormonal Contraceptives               108
Hormonal Contraceptives (years)       108
IUD                                   117
IUD (years)                           117
STDs                                  105
STDs (number)                         105
STDs:condylomatosis                   105
STDs:cervical condylomatosis          105
STDs:vaginal condylomatosis           105
STDs:vulvo-perineal condylomatosis    105
STDs:syphilis                         105
STDs:pelvic inflammatory disease      105
STDs:genital herpes                   105
STDs:molluscum contagiosum            105
STDs:AIDS                             105
STDs:HIV                              105
STDs:Hepatitis B                  

### Seting the differnt column types for data imputation.

In [6]:
num_cols = [
    'Age', 'Number of sexual partners', 'First sexual intercourse', 
    'Num of pregnancies', 'Smokes (years)', 'Smokes (packs/year)',
    'Hormonal Contraceptives (years)', 'IUD (years)', 
    'STDs (number)', 'STDs: Number of diagnosis'
]

binary_cols = [
    'Smokes', 'Hormonal Contraceptives', 'IUD', 'STDs', 'STDs:condylomatosis',
    'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis', 'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
    'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV', 'STDs:Hepatitis B',
    'STDs:HPV', 'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 
    'Hinselmann', 'Schiller', 'Citology', 'Biopsy'
]


### Checking rather if the data has outliers or not. If they have ourliers, using median. Since it is medical data, I decied to keep the outliers since it can be rare cases.

In [7]:
# Make a copy so you keep original safe
df_clean = df.copy()

for col in num_cols:
    # Convert columns to numeric in the copy, coercing errors (NaN)
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1

    outliers = df_clean[(df_clean[col] < Q1 - 1.5 * IQR) | (df_clean[col] > Q3 + 1.5 * IQR)]
    print(f'{col}: {len(outliers)} outliers')

print(f'Shape of original df: {df.shape}')
print(f'Shape of cleaned df: {df_clean.shape}')

Age: 8 outliers
Number of sexual partners: 68 outliers
First sexual intercourse: 41 outliers
Num of pregnancies: 10 outliers
Smokes (years): 123 outliers
Smokes (packs/year): 123 outliers
Hormonal Contraceptives (years): 68 outliers
IUD (years): 83 outliers
STDs (number): 79 outliers
STDs: Number of diagnosis: 71 outliers
Shape of original df: (858, 36)
Shape of cleaned df: (858, 36)


Since there are ouliers, I am using median.

In [8]:
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # converts '?' to NaN
    df[col] = df[col].fillna(df[col].median()) # fill with median

Using mode to fill catagorial nulls

In [9]:
for col in binary_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # convert '?' to NaN
    mode_val = df[col].mode()[0]  # get the first mode value (0 or 1)
    df[col] = df[col].fillna(mode_val)  # fill NaNs with mode

I found that 2 of the columans have lots of missing values. Checking the parcentage here to decide if i should drop or not.

STDs: Time since first diagnosis      787
STDs: Time since last diagnosis       787 (over like 90% missing. Dropping since it will mess with the data)

In [10]:
df.drop(columns=['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis'], inplace=True)

Checking nulls again

In [11]:
df.isnull().sum()


Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0


## Data cleaning step 2: Checking for duplicated data - drop them if found

In [12]:
dupes = df.duplicated().sum()
print(f"Found {dupes} duplicate rows!")

df.drop_duplicates(inplace=True)
#droping the duplicated rows

dupes = df.duplicated().sum()
print(f"Found {dupes} duplicate rows!")

Found 28 duplicate rows!
Found 0 duplicate rows!


### Changing all the featuer types to float to avoid errors

-no need for encoding since there are no string values.

In [13]:
df = df.astype(float)
print(df.dtypes)

Age                                   float64
Number of sexual partners             float64
First sexual intercourse              float64
Num of pregnancies                    float64
Smokes                                float64
Smokes (years)                        float64
Smokes (packs/year)                   float64
Hormonal Contraceptives               float64
Hormonal Contraceptives (years)       float64
IUD                                   float64
IUD (years)                           float64
STDs                                  float64
STDs (number)                         float64
STDs:condylomatosis                   float64
STDs:cervical condylomatosis          float64
STDs:vaginal condylomatosis           float64
STDs:vulvo-perineal condylomatosis    float64
STDs:syphilis                         float64
STDs:pelvic inflammatory disease      float64
STDs:genital herpes                   float64
STDs:molluscum contagiosum            float64
STDs:AIDS                         

### Performing feature selection

In [14]:
# Combine both lists into one
all_feature_cols = num_cols + binary_cols

# Select only those columns from full DataFrame
X = df[all_feature_cols].drop('Biopsy', axis=1)
y = df['Biopsy'] # target (indicates positive cervical biopsy (actual presence of cancer))

Since these columans gave UserWarning: Features [15 22] are constant, I am dropping them. 

In [15]:
from sklearn.feature_selection import VarianceThreshold

# This removes features with zero variance
constant_filter = VarianceThreshold(threshold=0.0)
constant_filter.fit(X)
constant_cols = [column for column in X.columns if column not in X.columns[constant_filter.get_support()]]

print("Zero-variance features to drop:", constant_cols)
X.drop(columns=constant_cols, inplace=True)


Zero-variance features to drop: ['STDs:cervical condylomatosis', 'STDs:AIDS']


In [16]:
from sklearn.feature_selection import SelectKBest, f_classif

# Initialize SelectKBest
selector = SelectKBest(score_func=f_classif, k=10)

# Fit and transform
X_selected = selector.fit_transform(X, y)

# Get selected column names
mask = selector.get_support()
selected_features = X.columns[mask]

print("Top 10 Selected Features:")
print(selected_features)

Top 10 Selected Features:
Index(['STDs', 'STDs:genital herpes', 'STDs:HIV', 'Dx:Cancer', 'Dx:CIN',
       'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller', 'Citology'],
      dtype='object')


 ## Using SVM for model traing..

Step 1. Spliting Data


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y)  # keep class balance


Step 2. Adding feature scaling since it is sensitive to it.

In [18]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Step 3. Starting the training with svm 

In [19]:
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

# Apply SMOTE to training data
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train_scaled, y_train)

# Train the SVM model (only once!)
svm_model = SVC(kernel='linear', C=1.0, gamma='scale', class_weight='balanced')
svm_model.fit(X_train_resampled, y_train_resampled)


Step 4. Evaluating of the model

In [20]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = svm_model.predict(X_test_scaled)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[151   4]
 [  1  10]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.97      0.98       155
         1.0       0.71      0.91      0.80        11

    accuracy                           0.97       166
   macro avg       0.85      0.94      0.89       166
weighted avg       0.97      0.97      0.97       166



I made some changed to get the precisionand recall as high as I can but due to the highly unbalanced data, I just stoped at 0.71 precision.

Things I did for the unbalanced dataset

- SMOTE
- added weight
- chnaged rbf to linear 

Overall it performs okay but would be better if the data was balanced.

### K-Fold cross validation (Important for the unbalanced data I have)

In [21]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('svm', SVC(kernel='linear', class_weight='balanced'))
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) #5 foldsss!!!

# Accuracy score for now (we’ll get f1 next)
scores = cross_val_score(pipeline, X_selected, y, cv=skf, scoring='accuracy')

print("Fold Accuracies:", scores)
print("Mean Accuracy: {:.2f}%".format(np.mean(scores) * 100))

f1_scores = cross_val_score(pipeline, X_selected, y, cv=skf, scoring='f1')
print("F1 Scores:", f1_scores)
print("Best F1 Score: {:.2f}".format(np.max(f1_scores)))

Fold Accuracies: [0.96987952 0.92168675 0.96385542 0.95783133 0.96385542]
Mean Accuracy: 95.54%
F1 Scores: [0.7826087  0.60606061 0.76923077 0.72       0.78571429]
Best F1 Score: 0.79


**The model was evaluated using cross-validation with five folds.** 

The fold accuracies ranged from approximately 92.17% to 96.99%, resulting in a mean accuracy of 95.54%. 
This indicates the model performs consistently well across different data splits.

The F1 scores for the folds varied between 0.61 and 0.79, with a mean F1 score of 0.73. 
The F1 score reflects a balance between precision and recall, showing that the model maintains a good trade-off between correctly identifying positive cases and minimizing false positives.

Overall, the SVM model demonstrates strong accuracy and reasonable balance in classification performance.

In [None]:
import joblib
import os

# Create folders if they do not exist
os.makedirs("../backend/model_store/svm", exist_ok=True)

# Save trained SVM model
joblib.dump(svm_model, "../backend/model_store/svm/model.pkl")

# Save scaler (SVM NEEDS this)
joblib.dump(scaler, "../backend/model_store/svm/scaler.pkl")

# Save feature order
joblib.dump(list(selected_features), "../backend/model_store/svm/features.pkl")

print("SVM model artifacts saved successfully.")


SVM model artifacts saved successfully.


: 