MAI643 - Artificial Intelligence in Medicine

Project Assignment 1 - Spring Semester 2024

Student Name:    
Christina Ioanna Saroglaki   
Jianlin Ye 

UCY Email:     
saroglaki.christina-ioanna@ucy.ac.cy    
jye00001@ucy.ac.cy 

### Import Libraries

In [None]:
import pandas as pd 
import numpy as np

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import StratifiedShuffleSplit

# Print NumPy floats with the "{:6.5g}" format (minimum width 6, up to 5 significant digits)
np.set_printoptions(formatter={'float':"{:6.5g}".format})

### Import dataset

In [None]:
# Load the risk-factor table and discard the three alternative screening
# outcomes in one step, so that "Biopsy" is the only target variable kept
unused_targets = ["Hinselmann", "Schiller", "Citology"]
risk_factor_df = pd.read_csv("risk_factors_cervical_cancer.csv").drop(columns=unused_targets)

Transform all values into the correct numeric type; entries that cannot be parsed as numbers are converted to NaN.

In [None]:
# Run pd.to_numeric over every column; errors="coerce" turns unparseable entries into NaN
risk_factor_df = risk_factor_df.apply(pd.to_numeric, errors = "coerce")

Split dataset to features and target variables.

In [None]:
# "Biopsy" is the only target column left once "Hinselmann", "Schiller" and
# "Citology" have been dropped, so it is selected by name. Slicing off the last
# four columns (a leftover from the four-target layout) would place at least
# three non-target columns in dep_df and remove them from the features.
# dep_df stays a (single-column) DataFrame, as before.
feature_df = risk_factor_df.drop(columns=["Biopsy"])
dep_df = risk_factor_df[["Biopsy"]]

### Handling missing values

During the preliminary analysis, we identified that the features “STDs: Time since first diagnosis” and “STDs: Time since last diagnosis” consisted of about 92% NaN values. Because of this high percentage of missing values, it is impractical to either eliminate those observations or fill the missing data with the mean of the existing data. Consequently, these features were excluded from the dataset used for the development of the models.

In [None]:
# Remove the two STD "time since diagnosis" columns, which the preliminary analysis found to be mostly NaN
risk_factor_df = risk_factor_df.drop(columns=["STDs: Time since first diagnosis", "STDs: Time since last diagnosis"])

For the remaining columns, we handled the missing values depending on the column. In more detail, if the column contained only a few distinct values (three or fewer, e.g. binary 0/1 columns), the rows containing a missing value were deleted. Otherwise, the missing value was replaced with the median of the column.

In [None]:
print("--------------------------- Handling Missing Values ---------------------------")
print("----------------------------------- BEFORE -----------------------------------")
print("Number of rows before filling missing values: ", len(risk_factor_df))

# Display the number of missing values before filling
print("\nNumber of missing values per column before filling:")
print(risk_factor_df.isnull().sum())

# Fill missing values depending on the column
for col in risk_factor_df.columns:
    # If the column has more than 3 unique (non-NaN) values, fill with the
    # median of the column
    if risk_factor_df[col].nunique() > 3:
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on the selected column: that chained in-place call triggers a
        # FutureWarning on recent pandas and does not modify the dataframe
        # under Copy-on-Write.
        risk_factor_df[col] = risk_factor_df[col].fillna(risk_factor_df[col].median())

# The NaNs that remain belong to columns with 3 or fewer unique values;
# drop every row that still contains one and renumber the index
risk_factor_df = risk_factor_df.dropna()
risk_factor_df.reset_index(drop=True, inplace=True)

In [None]:
# Report the dataset size once imputation and row removal are done
print("\n----------------------------------- AFTER -----------------------------------")
print("Number of rows after filling missing values: ", risk_factor_df.shape[0])

# Per-column NaN tally after filling
print("\nNumber of missing values per column after filling:")
print(risk_factor_df.isna().sum())

### Duplicate Rows

Removing duplicate rows from the dataset.

In [None]:
print("----------------------------------- Duplicate Rows -----------------------------------")
# Boolean mask flagging every row that repeats an earlier row
duplicate_rows = risk_factor_df.duplicated()

# Summing the mask counts the flagged rows
num_duplicates = duplicate_rows.sum()

if num_duplicates > 0:
    print(f"Found {num_duplicates} duplicate rows in the dataset.\n")

    # Index labels of the rows flagged as duplicates
    print("Duplicate rows indexes: {}\n".format(duplicate_rows.index[duplicate_rows.to_numpy()].values))

    # Report the row count, drop the repeats in place, then report again
    print("----------------------------- Removing Duplicates ----------------------------")
    print("----------------------------------- BEFORE -----------------------------------")
    print("Number of rows before removing duplicates: ", risk_factor_df.shape[0])

    risk_factor_df.drop_duplicates(inplace=True)
    risk_factor_df.reset_index(drop=True, inplace=True)

    print("\n----------------------------------- AFTER -----------------------------------")
    print("Number of rows after removing duplicates: ", risk_factor_df.shape[0])
else:
    print("No duplicate rows found in the dataset.")


#### Remove Outliers

The Interquartile Range (IQR) approach is one of the most commonly used and most trusted methods for finding outliers in the research field. We utilised the IQR to identify and remove outliers: a value lying outside the fences Q1 − 1.5·IQR and Q3 + 1.5·IQR is removed when it occurs only once in its column.

#### Define Unique Features

In [None]:
# Collect the distinct values observed in every column of a dataframe
def find_unique_values_df(feat: pd.DataFrame):
    """Map each column label of *feat* to the array of its unique values."""
    uniques_by_column = {}
    for column_name in feat.columns:
        uniques_by_column[column_name] = feat[column_name].unique()
    return uniques_by_column

print("----------------------------------- Unique Values -----------------------------------")    
# Distinct values per column of the cleaned dataframe
unique_vals = find_unique_values_df(risk_factor_df)

# One section per column: name, its distinct values, its dtype, then a blank line
for col, col_unique_vals in unique_vals.items():
    print(f"{col}:", col_unique_vals, risk_factor_df[col].dtypes, "", sep="\n")


In [None]:
def find_outliers(col, indices):
    """Return the values of ``risk_factor_df[col]`` that occur once among *indices*.

    *indices* are positional row numbers into the module-level
    ``risk_factor_df``. The values found at those positions are counted and
    the ones seen fewer than twice are returned, largest first.
    """
    candidates = risk_factor_df[col].iloc[indices]
    distinct, occurrences = np.unique(candidates, return_counts=True)

    # np.unique sorts ascending; the result is reported in descending order
    singletons = distinct[occurrences < 2]
    return singletons[::-1]

def delete_outliers(col, to_delete):
    """Drop, in place, every row of ``risk_factor_df`` whose *col* value is in *to_delete*.

    Nothing happens when *to_delete* is empty; otherwise the matching rows are
    removed from the module-level dataframe and its index is renumbered.
    """
    if to_delete.size == 0:
        return

    matches = risk_factor_df[col].isin(to_delete)
    doomed_labels = risk_factor_df.index[matches.to_numpy()].tolist()

    # Remove the rows, then renumber the index
    risk_factor_df.drop(doomed_labels, inplace=True)
    risk_factor_df.reset_index(drop=True, inplace=True)

# Columns holding more than two distinct values are screened for outliers
non_binary_cols = [name for name in unique_vals if len(unique_vals[name]) > 2]

for col in non_binary_cols:

    # IQR cannot be applied to columns with median 0
    if risk_factor_df[col].median() == 0:
        continue

    # Histogram with a marginal box plot of the column's current values
    out_dist = px.histogram(
        risk_factor_df,
        x=col,
        marginal="box",
        color_discrete_sequence=px.colors.sequential.thermal,
    )
    out_dist.update_layout(bargap=0.2, width=700)
    out_dist.show()

    # Quartiles and the 1.5 * IQR fences
    Q1, Q3 = np.percentile(risk_factor_df[col], [25, 75])
    IQR = Q3 - Q1
    lower = Q1 - (1.5 * IQR)
    upper = Q3 + (1.5 * IQR)

    print(col)
    print("median: {}, upper fence: {}, lower fence: {}".format(risk_factor_df[col].median(), upper, lower))

    # Values above the upper fence that occur only once are removed first
    upper_to_delete = find_outliers(col, np.nonzero((risk_factor_df[col] > upper).to_numpy())[0])
    delete_outliers(col, upper_to_delete)

    # Row positions are recomputed on the already-reduced dataframe before
    # the same rule is applied below the lower fence
    lower_to_delete = find_outliers(col, np.nonzero((risk_factor_df[col] < lower).to_numpy())[0])
    delete_outliers(col, lower_to_delete)


In [None]:
# Summarise the shape of the cleaned dataset (columns first, then rows)
n_rows, n_cols = risk_factor_df.shape
print("\nFinal dataset size: {} cols, {} rows".format(n_cols, n_rows))