MAI643 - Artificial Intelligence in Medicine

Project Assignment 1 - Spring Semester 2024

Student Name:    
Christina Ioanna Saroglaki   
Jianlin Ye 

UCY Email:     
saroglaki.christina-ioanna@ucy.ac.cy    
jye00001@ucy.ac.cy 

### Import Libraries

In [None]:
import pandas as pd 
import numpy as np

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Print NumPy floats in a 6-character-wide field with up to 5 significant digits.
np.set_printoptions(formatter={'float':"{:6.5g}".format})

# Overview

According to its authors, the chosen dataset focuses on indicators associated with the diagnosis of cervical cancer and encompasses features such as demographic information, habits, and medical records. The data was gathered at the "Hospital Universitario de Caracas" in Venezuela from a total of 858 patients.

K. Fernandes, J. S. Cardoso, and J. Fernandes, “Cervical cancer (Risk Factors),” UCI Machine 
Learning Repository, 2017.

In [None]:
# Load the dataset, treating "?" cells as missing values (NaN) while parsing.
risk_factor_df = pd.read_csv(
    "risk_factors_cervical_cancer.csv",
    na_values=["?"],
)

# Summarise column names, non-null counts and dtypes.
print("----------------------------------- Information -----------------------------------")
risk_factor_df.info()

## Preliminary analysis of the dataset

To gain a better understanding of the dataset, we conducted a preliminary analysis.
### Missing Values

First, we needed to determine how many missing values the dataset contained, as well as which features had the most missing values.

In [None]:
print("----------------------------------- Missing Values -----------------------------------")
# Per-column NaN counts plus the dataset-wide totals
missing_info = risk_factor_df.isnull().sum()
total_nan = missing_info.sum()
total_entries = risk_factor_df.size

if total_nan:
    print("\nNaN values found in the dataset.")

    print(f"\nTotal NaN values in dataset: {total_nan}/{total_entries}")

    # Columns ordered from most to fewest missing values
    nan_columns = missing_info.sort_values(ascending=False)

    print("\nTop 15 columns with missing values:\n")
    for rank, (col, count) in enumerate(nan_columns.head(15).items(), start=1):
        print(f"{rank:2}. {col:35} : {count}")
else:
    print("\nNo NaN values in the dataset.")

In [None]:
# Plot the share of missing versus populated cells across the whole dataset
total_figure = px.pie(
    names=["NaN values", "Valid Values"],
    values=[total_nan, total_entries - total_nan],
    title="Total NaN Values Distribution",
    color_discrete_sequence=px.colors.sequential.Aggrnyl,
    width=550,
    height=350,
)

# Centre the title and use the same margin on every side
total_figure.update_layout(
    title_x=0.5,
    margin={"l": 50, "r": 50, "t": 50, "b": 50},
)

total_figure.show()

In [None]:
# Rows containing NaN values
total_rows = len(risk_factor_df)
# Number of rows that have at least one missing cell
nan_rows = int(risk_factor_df.isna().any(axis=1).sum())
print("\nTotal Rows containing NaN values in dataset: {}/{}".format(nan_rows, total_rows))

# The two slices must partition the rows: rows with at least one NaN versus
# fully populated rows. Using total_rows for the second slice would count the
# NaN rows twice and understate their share.
rows_fig = go.Figure(data=[go.Pie(
    labels=["Has NaN Values", "Is Filled"],
    values=[nan_rows, total_rows - nan_rows],
    marker_colors=[px.colors.sequential.Agsunset[0], px.colors.sequential.Agsunset[1]])])

rows_fig.update_layout(
    title="NaN Containing Rows Distribution",
    margin=dict(l=50, r=50, t=50, b=50),
    title_x=0.5,
    width=550, height=350
)

rows_fig.show()

We identified that approximately 92% of the values in the features “STDs: Time since first diagnosis” and “STDs: Time since last diagnosis” were NaN. With so much data missing, it was impractical either to eliminate the affected observations or to fill the missing values with the column mean. Consequently, these features were excluded from the dataset.

In [None]:
# Remove the two "time since diagnosis" columns from the dataframe in place.
risk_factor_df.drop(
    columns=[
        "STDs: Time since first diagnosis",
        "STDs: Time since last diagnosis",
    ],
    inplace=True,
)

To ensure the optimal performance of future models, we also set a missing value threshold of 10 per row. Any rows that exceeded this threshold were eliminated from the dataset because we determined they were missing significant information.

In [None]:
# Count the rows that still hold at least one NaN value
nan_rows = int(risk_factor_df.isna().any(axis=1).sum())
print(f"\nTotal Rows containing NaN values in dataset: {nan_rows}/{total_rows}")

# Index labels of the rows holding more than 10 NaN values
rows_to_del = risk_factor_df.index[risk_factor_df.isna().sum(axis=1) > 10]

print(f"\nRows containing >10 NaN values: {len(rows_to_del)}/{total_rows}")

# Drop those rows and renumber the remaining ones from 0
risk_factor_df.drop(index=rows_to_del, inplace=True)
risk_factor_df.reset_index(drop=True, inplace=True)

In [None]:
# Plot two pies side by side: rows with/without NaN values, and, among the
# rows with NaN values, those above versus within the 10-NaN threshold.
color_1 = [px.colors.sequential.Agsunset[0], px.colors.sequential.Agsunset[1]]
color_2 = [px.colors.sequential.Agsunset[2], px.colors.sequential.Agsunset[3]]


row_figure = make_subplots(1, 2, specs=[[{"type":"domain"}, {"type":"domain"}]],
    subplot_titles=["Contain NaN Values", "Contain >10 NaN Values"])

row_figure.add_trace(go.Pie(labels=["Has NaN Values","Is Filled"],
    values=[nan_rows, total_rows - nan_rows],
    marker_colors=color_1,
    pull=[0.1, 0]), 1, 1)

# The second slice holds the NaN-containing rows that were NOT flagged for
# deletion, i.e. rows with 1 to 10 NaN values (rows with exactly 10 included),
# so it is labelled "1-10 NaN" rather than "<10 NaN".
row_figure.add_trace(go.Pie(labels=[">10 NaN", "1-10 NaN"],
    values=[len(rows_to_del), nan_rows - len(rows_to_del)],
    marker_colors=color_2), 1, 2)

row_figure.update_layout(title_text="Rows Containing NaN Values",
    width=650, height=400,
    title_x=0.5)

row_figure.show()

For the remaining columns, we handled the missing values according to the type of column. If a column had more than three distinct values, its missing values were replaced with the median of the column. Otherwise (e.g., columns containing binary values 0/1), the rows containing a missing value were deleted.

In [None]:
print("--------------------------- Handling Missing Values ---------------------------")
print("----------------------------------- BEFORE -----------------------------------")
print("Number of rows before filling missing values: ", len(risk_factor_df))

# Display the number of missing values before filling
print("\nNumber of missing values per column before filling:")
print(risk_factor_df.isnull().sum())

# Fill missing values depending on the column
for col in risk_factor_df.columns:
    # If the column has more than 3 unique values, fill with the median of the column
    if risk_factor_df[col].nunique() > 3:
        # Assign the result back instead of calling fillna(inplace=True) on the
        # selected column: the in-place form is chained assignment, which emits
        # a FutureWarning in pandas 2.x and leaves the dataframe unchanged when
        # Copy-on-Write is enabled.
        risk_factor_df[col] = risk_factor_df[col].fillna(risk_factor_df[col].median())

# Drop the rows that still contain NaN values (columns with at most 3 unique
# values were not imputed above) and renumber the remaining rows from 0
risk_factor_df = risk_factor_df.dropna()
risk_factor_df.reset_index(drop=True, inplace=True)

In [None]:
print("\n----------------------------------- AFTER -----------------------------------")
print("Number of rows after filling missing values: ", risk_factor_df.shape[0])

# Per-column NaN counts once imputation and row removal are done
print("\nNumber of missing values per column after filling:")
print(risk_factor_df.isna().sum())

### Duplicate Rows

Following the missing value analysis, we examined if the dataset contained any duplicate rows and removed them from the dataset.

In [None]:
print("----------------------------------- Duplicate Rows -----------------------------------")
# Boolean mask flagging every row that repeats an earlier row, and its total
duplicate_rows = risk_factor_df.duplicated()
num_duplicates = duplicate_rows.sum()

if num_duplicates:
    print(f"Found {num_duplicates} duplicate rows in the dataset.\n")

    # Index labels of the rows flagged as duplicates
    duplicate_index = risk_factor_df[duplicate_rows].index.values
    print(f"Duplicate rows indexes: {duplicate_index}\n")

    # Report the row count on both sides of the removal
    print("----------------------------- Removing Duplicates ----------------------------")
    print("----------------------------------- BEFORE -----------------------------------")
    print("Number of rows before removing duplicates: ", risk_factor_df.shape[0])

    # Keep the first occurrence of each row and renumber the rows from 0
    risk_factor_df.drop_duplicates(inplace=True)
    risk_factor_df.reset_index(drop=True, inplace=True)

    print("\n----------------------------------- AFTER -----------------------------------")
    print("Number of rows after removing duplicates: ", risk_factor_df.shape[0])
else:
    print("No duplicate rows found in the dataset.")


This concluded the first phase of the preliminary analysis. After managing all the missing values and duplicate rows, the dataset had 34 features and 708 observations.

In [None]:
# Report the dataframe's current dimensions (columns first, then rows)
n_rows, n_cols = risk_factor_df.shape
print(f"\nFinal dataset size: {n_cols} cols, {n_rows} rows")

## Understanding features

Once the first part of the analysis was completed, we moved on to exploring the features and some statistical properties of the dataset. This would allow us to identify possible connections between the features as well as possible imbalances.

#### Unique Values

In [None]:
def find_unique_values_df(feat: pd.DataFrame):
    """Map each column name of ``feat`` to the array of its distinct values.

    Args:
        feat: Dataframe whose columns are inspected.

    Returns:
        A dict keyed by column name, in column order; each value is the result
        of ``Series.unique()`` for that column.
    """
    uniques = {}
    for name in feat.columns:
        uniques[name] = feat[name].unique()
    return uniques

print("----------------------------------- Unique Values -----------------------------------")
# Distinct values of every column, keyed by column name
unique_vals = find_unique_values_df(risk_factor_df)

# For each column print its name, its distinct values, its dtype and a blank line
for col, col_unique_vals in unique_vals.items():
    print(f"{col}:", col_unique_vals, risk_factor_df[col].dtypes, "", sep="\n")


### Target Values Distribution

First, we analyzed the dataset's balance. As shown in the graph, the dataset has a large imbalance across all four target variables. This imbalance complicates model training and evaluation, and it should be handled during the preprocessing step.

In [None]:
def getCount(col, value):
    """Return how many rows of ``risk_factor_df`` hold ``value`` in column ``col``.

    Args:
        col: Name of a column in the module-level ``risk_factor_df``.
        value: Value whose occurrences are counted.

    Returns:
        The number of matching rows; 0 when ``value`` never appears in the
        column (indexing ``value_counts()`` would raise ``KeyError`` there).
    """
    return (risk_factor_df[col] == value).sum()

# Build one row per exam holding the number of 0 and 1 results
exams = ["Hinselmann", "Schiller", "Citology", "Biopsy"]
classes_df = pd.DataFrame(
    [[exam, getCount(exam, 0), getCount(exam, 1)] for exam in exams],
    columns=["Exam", "Healthy", "Cervical Cancer"],
)

# Grouped bars: the two result counts side by side for every exam
balance_fig = px.histogram(
    classes_df,
    x="Exam",
    y=["Healthy", "Cervical Cancer"],
    barmode="group",
    text_auto=True,
    title="Class Distribution",
    labels={"value": "Occurrences", "variable": "Result"},
    color_discrete_sequence=px.colors.qualitative.Bold,
    width=600,
)

# Centre the title
balance_fig.update_layout(title_x=0.5)

balance_fig.show()

### Statistical Properties

Moving on to the statistical properties of the dataset, we calculated the mean and standard deviation for each column. Columns with a standard deviation of 0 were omitted from the dataset because they did not add significant variability to the data since they contained the same value for all observations.

In [None]:
# Column-wise mean and standard deviation
mean_df, std_df = risk_factor_df.mean(), risk_factor_df.std()

# A standard deviation of 0 means the column holds a single repeated value
constant_columns = std_df[std_df == 0].index.values
print(f"Columns containing 1 value: {constant_columns}\n")


In [None]:
# Remove these two columns in place (per the notes above, the columns whose
# standard deviation is 0)
constant_cols = ["STDs:cervical condylomatosis", "STDs:AIDS"]
risk_factor_df.drop(columns=constant_cols, inplace=True)

In [None]:
# Recompute the per-column statistics on the current dataframe
mean_df = risk_factor_df.mean()
std_df = risk_factor_df.std()

# Three-column table: feature name, mean and standard deviation
table_header = dict(values=["Feature", "Mean", "Standard Deviation"])
table_cells = dict(
    values=[list(risk_factor_df.columns), mean_df.values, std_df.values],
    align=['left', 'center'],
    format=["", ".2"],
)

statistic_fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])

statistic_fig.show()

#### Remove Outliers

The interquartile range (IQR) approach is one of the most commonly used and most trusted methods for finding outliers in the research field. We used the IQR to compute the upper and lower fences of each non-binary column and removed, as outliers, the values outside these fences that occur only once.

In [None]:
def find_outliers(col, indices):
    """Return the values at ``indices`` of column ``col`` that occur only once.

    Args:
        col: Name of a column in the module-level ``risk_factor_df``.
        indices: Positional row indices (as used by ``iloc``) of the candidate
            observations.

    Returns:
        Array of the distinct candidate values that appear exactly once among
        the candidates, ordered from largest to smallest.
    """
    candidates = risk_factor_df[col].iloc[indices]
    values, occurrences = np.unique(candidates, return_counts=True)

    # np.unique sorts ascending; keep the single-occurrence values and flip them
    singletons = values[occurrences < 2]
    return singletons[::-1]

def delete_outliers(col, to_delete):
    """Drop, in place, the rows of ``risk_factor_df`` whose ``col`` value is in ``to_delete``.

    Args:
        col: Name of a column in the module-level ``risk_factor_df``.
        to_delete: Array of values to remove; nothing happens when it is empty.

    After a removal the dataframe's index is renumbered from 0.
    """
    if to_delete.size == 0:
        return

    # Index labels of every row holding one of the values to remove
    matches = risk_factor_df[col].isin(to_delete)
    doomed = risk_factor_df.index[matches].tolist()

    risk_factor_df.drop(index=doomed, inplace=True)
    risk_factor_df.reset_index(drop=True, inplace=True)

# Columns that had more than two distinct values when unique_vals was computed
non_binary_cols = [name for name, values in unique_vals.items() if len(values) > 2]

for col in non_binary_cols:
    col_median = risk_factor_df[col].median()

    # IQR cannot be applied to columns with median 0
    if col_median == 0:
        continue

    # Histogram of the column with a marginal box plot
    out_dist = px.histogram(
        risk_factor_df,
        x=col,
        marginal="box",
        color_discrete_sequence=px.colors.sequential.thermal,
    )
    out_dist.update_layout(bargap=0.2, width=700)
    out_dist.show()

    # Fences sit 1.5 * IQR beyond the first and third quartiles
    Q1, Q3 = np.percentile(risk_factor_df[col], [25, 75])
    IQR = Q3 - Q1
    upper = Q3 + (1.5 * IQR)
    lower = Q1 - (1.5 * IQR)

    print(col)
    print(f"median: {col_median}, upper fence: {upper}, lower fence: {lower}")

    # Delete values above the upper fence that occur only once
    above_fence = np.flatnonzero(risk_factor_df[col] > upper)
    delete_outliers(col, find_outliers(col, above_fence))

    # Delete values below the lower fence that occur only once; positions are
    # recomputed because the rows may have been renumbered by the deletion above
    below_fence = np.flatnonzero(risk_factor_df[col] < lower)
    delete_outliers(col, find_outliers(col, below_fence))


In [None]:
# Report the dataframe's current dimensions (columns first, then rows)
n_rows, n_cols = risk_factor_df.shape
print(f"\nFinal dataset size: {n_cols} cols, {n_rows} rows")

#### Correlation with label

Lastly, we computed the absolute correlation between each of the features and each of the four target variables.

In [None]:
def find_corr(target, col):
    """Return the correlation between two columns of ``risk_factor_df``.

    Args:
        target: Name of the first column.
        col: Name of the second column.

    Returns:
        The result of ``Series.corr`` with its default settings.
    """
    target_values = risk_factor_df[target]
    feature_values = risk_factor_df[col]
    return target_values.corr(feature_values)

# Target columns whose correlation with every feature is computed
target_variables = ["Hinselmann", "Schiller", "Citology", "Biopsy"]

# Select the feature columns by name instead of slicing off the last four
# columns, so the result does not depend on where the targets sit in the frame
feature_df = risk_factor_df.drop(columns=target_variables)

# For each target: absolute correlation of every feature with that target,
# sorted in ascending order and stored as {feature name: correlation}
correlations = {
    target: feature_df.corrwith(risk_factor_df[target]).abs().sort_values().to_dict()
    for target in target_variables
}
    
# One horizontal bar chart per target, one bar per feature
for target, feature_corr in correlations.items():
    target_df = pd.DataFrame.from_dict(
        feature_corr,
        orient="index",
        columns=["Correlation"],
    )

    target_fig = px.bar(
        target_df,
        x="Correlation",
        orientation='h',
        title=f"Features & {target} Correlations",
        labels={"index": "Features"},
        width=900,
        height=700,
    )

    # Centre the title
    target_fig.update_layout(title_x=0.5)

    target_fig.show()