<a href="https://colab.research.google.com/github/Nellie87/Cervical-Cancer-Treatment-Recommendation/blob/master/Cervical_serious.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library and file part


Libraries

In [2]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from google.colab import drive
from google.colab import files
import gc
import os

In [3]:
# Step 1: Install required libraries
!pip install openpyxl dask



Mount file

In [5]:
# Step 2: Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Step 3: Verify file existence
data_path = '/content/drive/MyDrive/Hackathon/Cervical.xlsx'
if not os.path.exists(data_path):
    raise FileNotFoundError(f"File not found at {data_path}. Please check the path or upload the file.")

Mounted at /content/drive


# Dataset cleaning

Some errors in the dataset, hence need for cleaning included:
1. Removing extra whitespaces in HPV Test Result
2. Typos such as 'COLOSCOPY BIOSY' instead of 'COLPOSCOPY BIOPSY'
3. Converts numerical columns (Age, Sexual Partners, First Sexual Activity Age) to integers for consistency.
4. Removing outliers in the age
5. Biopsy Typo

Regular expression module — it’s what lets us do pattern-based text matching and replacement instead of just exact string matches.

In [9]:
import re

In [10]:
# Step 4: Load Excel file
try:
    data = pd.read_excel(data_path, engine='openpyxl')
except Exception as e:
    print(f"Error loading Excel file: {e}")
    print("If this is a CSV, try pd.read_csv() with appropriate encoding (e.g., encoding='latin1').")
    raise

# Step 5: Inspect and drop empty or irrelevant columns (e.g., Unnamed: 12)
print("Columns before dropping:", data.columns)
# Check contents of Unnamed: 12
if 'Unnamed: 12' in data.columns:
    print("Unique values in Unnamed: 12:", data['Unnamed: 12'].unique())
    # Drop if mostly empty or irrelevant
    if data['Unnamed: 12'].isna().sum() > 0.9 * len(data) or data['Unnamed: 12'].astype(str).str.strip().eq('').sum() > 0.9 * len(data):
        data = data.drop(columns=['Unnamed: 12'])
        print("Dropped Unnamed: 12 as it’s mostly empty or irrelevant.")

Columns before dropping: Index(['Patient ID', 'Age', 'Sexual Partners', 'First Sexual Activity Age',
       'HPV Test Result', 'Pap Smear Result', 'Smoking Status', 'STDs History',
       'Region', 'Insrance Covered', 'Screening Type Last',
       'Recommended Action', 'Unnamed: 12'],
      dtype='object')
Unique values in Unnamed: 12: [nan ' ']
Dropped Unnamed: 12 as it’s mostly empty or irrelevant.


In [12]:
# Step 1: Load CSV with Dask first (important!)
try:
    data = dd.read_csv(
        csv_path,
        dtype={'Unnamed: 12': 'object'}  # Avoid dtype issues
    ).compute()
except Exception as e:
    print(f"Error loading CSV with Dask: {e}")
    raise

# Step 2: Define known typo corrections with regex patterns
typo_patterns = {
    r'\bCOLOSCOPY\b': 'COLPOSCOPY',
    r'\bCOLPOSOCPY\b': 'COLPOSCOPY',
    r'\bCOLPOSCPY\b': 'COLPOSCOPY',
    r'\bCOLPOSCOPY\b': 'COLPOSCOPY',  # already correct, ensures case matching
    r'\bBIOSY\b': 'BIOPSY',
    r'\bBIOSPY\b': 'BIOPSY',
    r'\bBIOPCY\b': 'BIOPSY',
    r'\bANUAL\b': 'ANNUAL',
    r'\bYEARS\b': 'YEARS',  # keep correct form
    r'\b3YEARS\b': '3 YEARS',
    r'\bFORCOLPOSCOPY\b': 'FOR COLPOSCOPY',
}

def clean_recommended_action(text):
    if pd.isna(text):
        return text
    text = text.upper().strip()
    for pattern, correct in typo_patterns.items():
        text = re.sub(pattern, correct, text)
    return text

# Step 3: Apply cleaning AFTER loading data
if 'Recommended Action' in data.columns:
    data['Recommended Action'] = data['Recommended Action'].astype(str).replace({'NAN': np.nan})
    data['Recommended Action'] = data['Recommended Action'].apply(clean_recommended_action)

# Debug preview
print("Unique Recommended Actions after smart correction:")
print(data['Recommended Action'].unique())

# Step 4: Optimize data types
data['Age'] = data['Age'].astype('int32', errors='ignore')
data['Sexual Partners'] = data['Sexual Partners'].astype('int32', errors='ignore')
data['First Sexual Activity Age'] = data['First Sexual Activity Age'].astype('int32', errors='ignore')
for col in ['HPV Test Result', 'Pap Smear Result', 'Smoking Status', 'STDs History', 'Insurance Covered', 'Region', 'Screening Type Last', 'Recommended Action']:
    if col in data.columns:
        data[col] = data[col].astype('category')

print("Memory usage after type optimization:", data.memory_usage(deep=True).sum() / 1024**2, "MB")

# Step 5: Clean "HPV Test Result"
if 'HPV Test Result' in data.columns:
    data['HPV Test Result'] = data['HPV Test Result'].astype(str).str.strip().str.upper()
    data['HPV Test Result'] = data['HPV Test Result'].replace('"POSITIVE\\n"', 'POSITIVE')
    data['HPV Test Result'] = data['HPV Test Result'].replace({'POSITIVE': 'POSITIVE', 'NEGATIVE': 'NEGATIVE', 'NAN': np.nan})
    print("Unique HPV Test Results:", data['HPV Test Result'].unique())

# Step 6: Handle unrealistic values in "First Sexual Activity Age"
if 'First Sexual Activity Age' in data.columns:
    unrealistic_low = data['First Sexual Activity Age'] < 10
    unrealistic_high = data['First Sexual Activity Age'] > 50
    print(f"Unrealistic low ages (<10): {unrealistic_low.sum()} rows")
    print(f"Unrealistic high ages (>50): {unrealistic_high.sum()} rows")
    median_age = data[(data['First Sexual Activity Age'] >= 10) & (data['First Sexual Activity Age'] <= 50)]['First Sexual Activity Age'].median()
    data.loc[unrealistic_low | unrealistic_high, 'First Sexual Activity Age'] = median_age
    print(f"Imputed unrealistic ages with median: {median_age}")

# Step 7: Handle missing values
missing_counts = data.isnull().sum()
print("Missing values per column:\n", missing_counts)
for col in ['Age', 'Sexual Partners', 'First Sexual Activity Age']:
    if col in data.columns and missing_counts[col] > 0:
        data[col] = data[col].fillna(data[col].median())
for col in ['HPV Test Result', 'Pap Smear Result', 'Smoking Status', 'STDs History', 'Insurance Covered', 'Region', 'Screening Type Last']:
    if col in data.columns and missing_counts[col] > 0:
        data[col] = data[col].fillna(data[col].mode().iloc[0])
if 'Recommended Action' in data.columns and missing_counts['Recommended Action'] > 0:
    data = data.dropna(subset=['Recommended Action'])

# Step 8: Standardize categorical columns
binary_cols = ['Pap Smear Result', 'Smoking Status', 'STDs History', 'Insurance Covered']
for col in binary_cols:
    if col in data.columns:
        data[col] = data[col].astype(str).str.strip().str.upper()
        if not data[col].isin(['Y', 'N', 'nan']).all():
            print(f"Warning: Non-standard values in {col}: {data[col].unique()}")
if 'Region' in data.columns:
    data['Region'] = data['Region'].astype(str).str.strip().str.title()
if 'Screening Type Last' in data.columns:
    data['Screening Type Last'] = data['Screening Type Last'].astype(str).str.strip().str.upper()
print("Unique Regions:", data['Region'].unique() if 'Region' in data.columns else "Region column not found")
print("Unique Screening Types:", data['Screening Type Last'].unique() if 'Screening Type Last' in data.columns else "Screening Type Last column not found")

# Step 9: Drop unnecessary columns
data = data.drop(columns=['Patient ID', 'Unnamed: 12'], errors='ignore')
print("Columns after dropping:", data.columns)

# Step 10: Save cleaned dataset
cleaned_path = '/content/drive/MyDrive/Hackathon/cleaned_cervical_cancer_dataset.csv'
data.to_csv(cleaned_path, index=False)
print(f"Cleaned dataset saved to: {cleaned_path}")

# Step 11: Download file
files.download(cleaned_path)

# Step 12: Free memory
del data
gc.collect()

Unique Recommended Actions after smart correction:
['REPEAT PAP SMEAR IN 3 YEARS AND FOR HPV VACCINE'
 'FOR HPV VACCINE AND SEXUAL EDUCATION'
 'FOR HPV VACCINE, LIFESTYLE AND SEXUAL EDUCATION'
 'FOR COLPOSCOPY CYTOLOGY AND BIOPSY' 'REPEAT PAP SMEAR IN 3 YEARS'
 'FOR COLPOSCOPY BIOPSY, CYTOLOGY+/- TAH'
 'FOR BIOPSY AND CYTOLOGY WITH TAH NOT RECOMMENDED'
 'FOR COLPOSCOPY BIOPSY, CYTOLOGY'
 'FOR ANNUAL FOLLOW UP AND PAP SMEAR IN 3 YEARS' 'FOR PAP SMEAR'
 'FOR COLPOSCOPY BIOPSY, CYTOLOGY +/- TAH' 'FOR LASER THERAPY'
 'FOR HPV VACCINATION AND SEXUAL EDUCATION'
 'FOR COLPOSCOPY, CYTOLOGY THEN LASER THERAPY'
 'FOR COLPOSCOPY BIOPSY, CYTOLOGY WITH TAH NOT RECOMMENDED'
 'FOR REPEAT HPV TESTING ANNUALLY AND PAP SMEAR IN 3 YEARS'
 'FOR COLPOSCOPY BIOPSY, CYTOLOGY +/-TAH']
Memory usage after type optimization: 0.007657051086425781 MB
Unique HPV Test Results: ['NEGATIVE' 'POSITIVE' 'NEGAGTIVE']
Unrealistic low ages (<10): 2 rows
Unrealistic high ages (>50): 1 rows
Imputed unrealistic ages with medi

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

227