MAI643 - Artificial Intelligence in Medicine

Project Assignment 1 - Spring Semester 2024

Student Name:    
Christina Ioanna Saroglaki   
Jianlin Ye 

UCY Email:     
saroglaki.christina-ioanna@ucy.ac.cy    
jye00001@ucy.ac.cy 

### Import Libraries

In [1]:
import pandas as pd 
import numpy as np

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import StratifiedShuffleSplit

# Render NumPy floats in a fixed 6-character field using general format
# with 5 significant digits, so printed arrays line up in columns.
np.set_printoptions(formatter={"float": lambda value: f"{value:6.5g}"})

### Import dataset

In [2]:
# Load the raw risk-factor table and immediately discard the three screening
# result columns, so that "Biopsy" is the only one of them kept as the target.
risk_factor_df = (
    pd.read_csv("risk_factors_cervical_cancer.csv")
    .drop(columns=["Hinselmann", "Schiller", "Citology"])
)

Transform all the values into the correct numeric type.

In [3]:
# Convert every column to a numeric dtype; any entry that cannot be parsed
# as a number is coerced to NaN instead of raising.
risk_factor_df = risk_factor_df.apply(
    lambda column: pd.to_numeric(column, errors="coerce")
)

Split the dataset into features and target variables.

In [4]:
# Positional split: the trailing columns are treated as dependent variables
# and everything before them as features.
# NOTE(review): three screening columns were dropped when the file was loaded,
# so confirm that the last four columns are still the intended targets.
n_dependent_cols = 4
feature_df = risk_factor_df.iloc[:, :-n_dependent_cols]
dep_df = risk_factor_df.iloc[:, -n_dependent_cols:]

### Handling missing values

During the preliminary analysis, we identified that about 92% of the values in the features “STDs: Time since first diagnosis” and “STDs: Time since last diagnosis” were NaN. Because of this high percentage of missing values, it is impractical to either eliminate those observations or fill the missing data with the mean of the existing data. Consequently, these features were excluded from the dataset for the development of the models.

In [5]:
# The two "time since diagnosis" columns are removed from the working frame.
sparse_std_time_cols = [
    "STDs: Time since first diagnosis",
    "STDs: Time since last diagnosis",
]
risk_factor_df = risk_factor_df.drop(sparse_std_time_cols, axis=1)

For the remaining columns, we managed the missing values depending on the column. In more detail, if the column contained at most three distinct values (e.g. binary 0/1 columns), the rows containing the missing values were deleted. Otherwise, the missing value was replaced with the median of the column.

In [7]:
print("--------------------------- Handling Missing Values ---------------------------")
print("----------------------------------- BEFORE -----------------------------------")
print("Number of rows before filling missing values: ", len(risk_factor_df))

# Display the number of missing values before filling
print("\nNumber of missing values per column before filling:")
print(risk_factor_df.isnull().sum())

# Fill missing values depending on the column
for col in risk_factor_df.columns:
    # Columns with more than 3 distinct values get their NaNs replaced by the
    # column median; all other columns are left untouched here.
    if risk_factor_df[col].nunique() > 3:
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on the column selection: that chained in-place form emits a
        # FutureWarning on recent pandas and does not modify the frame at all
        # once Copy-on-Write is enabled.
        risk_factor_df[col] = risk_factor_df[col].fillna(risk_factor_df[col].median())

# Any row that still holds a NaN (i.e. in a column that was not filled above)
# is dropped, and the index is renumbered from 0.
risk_factor_df = risk_factor_df.dropna().reset_index(drop=True)

--------------------------- Handling Missing Values ---------------------------
----------------------------------- BEFORE -----------------------------------
Number of rows before filling missing values:  726

Number of missing values per column before filling:
Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic infla

In [8]:
# Summarise the frame once the missing-value handling has run.
rows_after_fill = len(risk_factor_df)
missing_after_fill = risk_factor_df.isna().sum()

print("\n----------------------------------- AFTER -----------------------------------")
print("Number of rows after filling missing values: ", rows_after_fill)

# Per-column count of values that are still missing
print("\nNumber of missing values per column after filling:")
print(missing_after_fill)


----------------------------------- AFTER -----------------------------------
Number of rows after filling missing values:  726

Number of missing values per column after filling:
Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum cont

### Duplicate Rows

Removing duplicate rows from the dataset.

In [9]:
print("----------------------------------- Duplicate Rows -----------------------------------")
# Boolean mask marking every row that repeats an earlier row
duplicate_rows = risk_factor_df.duplicated()

# How many rows the mask flags
num_duplicates = duplicate_rows.sum()

if num_duplicates != 0:
    print(f"Found {num_duplicates} duplicate rows in the dataset.\n")

    # Index labels of the flagged rows
    print("Duplicate rows indexes: {}\n".format(risk_factor_df.index[duplicate_rows].values))

    print("----------------------------- Removing Duplicates ----------------------------")
    print("----------------------------------- BEFORE -----------------------------------")
    print("Number of rows before removing duplicates: ", len(risk_factor_df))

    # Keep only first occurrences and renumber the index from 0
    risk_factor_df = risk_factor_df.loc[~duplicate_rows].reset_index(drop=True)

    print("\n----------------------------------- AFTER -----------------------------------")
    print("Number of rows after removing duplicates: ", len(risk_factor_df))
else:
    print("No duplicate rows found in the dataset.")


----------------------------------- Duplicate Rows -----------------------------------
Found 21 duplicate rows in the dataset.

Duplicate rows indexes: [ 63 222 296 332 340 344 360 364 368 370 377 380 387 405 410 441 447 480
 484 536 607]

----------------------------- Removing Duplicates ----------------------------
----------------------------------- BEFORE -----------------------------------
Number of rows before removing duplicates:  726

----------------------------------- AFTER -----------------------------------
Number of rows after removing duplicates:  705


#### Remove Outliers

The IQR (Inter Quartile Range) approach is one of the most commonly used and trusted methods for finding outliers in research. We utilised the IQR to identify and remove outliers.

#### Define Unique Features

In [10]:
def find_unique_values_df(feat: pd.DataFrame):
    """Collect the distinct values of every column in ``feat``.

    Args:
        feat: Data frame whose columns are inspected.

    Returns:
        A dict keyed by column label, in column order, whose values are the
        arrays returned by ``Series.unique()`` for that column.
    """
    unique_by_column = {}
    for column_label in feat:
        unique_by_column[column_label] = feat[column_label].unique()
    return unique_by_column

print("----------------------------------- Unique Values -----------------------------------")
# Distinct values per column of the cleaned frame
unique_vals = find_unique_values_df(risk_factor_df)

# For each column: its name, its distinct values, its dtype, then a blank line
for col in unique_vals:
    print(f"{col}:", unique_vals[col], risk_factor_df[col].dtypes, "", sep="\n")


----------------------------------- Unique Values -----------------------------------
Age:
[18 15 34 52 46 42 51 26 45 44 27 43 40 41 39 37 38 36 35 33 31 32 30 23
 28 29 25 21 24 22 20 48 19 17 16 14 59 79 84 47 13 70 50 49]
int64

Number of sexual partners:
[     4      1      5      3      2      6      8      7     28      9]
float64

First sexual intercourse:
[    15     14     17     16     21     23     26     20     25     18
     27     19     24     32     29     11     13     22     28     10
     12]
float64

Num of pregnancies:
[     1      4      2      6      3      5      8      7      0     11
     10]
float64

Smokes:
[     0      1]
float64

Smokes (years):
[     0     37     34      3  1.267     12     18      7     19     21
     13     16     15      8      4     10     22     14    0.5     11
      2      6      5      1     32      9     24     28     20   0.16]
float64

Smokes (packs/year):
[     0     37    3.4   0.04 0.5132    2.4      6      9    1.6     19


In [11]:
def find_outliers(col, indices, df=None, min_count=2):
    """Return the values of ``col`` that occur rarely among the selected rows.

    Args:
        col: Label of the column to inspect.
        indices: Positional row indices (as used by ``.iloc``) of the
            candidate observations, e.g. the rows lying beyond an IQR fence.
        df: Data frame to read from. Defaults to the module-level
            ``risk_factor_df`` so existing two-argument calls are unchanged.
        min_count: Values seen fewer than this many times among the selected
            rows are reported. The default of 2 reports values seen once.

    Returns:
        A NumPy array of the rare values, sorted in descending order.
        It is empty when nothing qualifies or when ``indices`` is empty.
    """
    frame = risk_factor_df if df is None else df
    obs = frame[col].iloc[indices]

    # np.unique returns the distinct values in ascending order with their counts
    unique_items, counts = np.unique(obs, return_counts=True)

    rare_values = unique_items[counts < min_count]
    # Preserve the descending order that callers have always received
    return rare_values[::-1]

def delete_outliers(col, to_delete, df=None):
    """Drop, in place, every row whose ``col`` value is listed in ``to_delete``.

    Args:
        col: Label of the column to match against.
        to_delete: Values to remove. Any list-like (or a scalar) is accepted;
            an empty collection makes the call a no-op.
        df: Data frame to modify. Defaults to the module-level
            ``risk_factor_df`` so existing two-argument calls are unchanged.

    Returns:
        None. The frame is mutated and its index is renumbered from 0.
    """
    frame = risk_factor_df if df is None else df

    # Normalise so that lists and scalars work as well as ndarrays
    to_delete = np.atleast_1d(to_delete)
    if to_delete.size == 0:
        return

    rows_to_del = frame.index[frame[col].isin(to_delete)]

    # Remove the matching rows, then renumber so positions and labels agree
    frame.drop(rows_to_del, inplace=True)
    frame.reset_index(drop=True, inplace=True)

# Columns with more than two distinct values are treated as non-binary
non_binary_cols = [name for name in unique_vals if len(unique_vals[name]) > 2]

for col in non_binary_cols:
    col_median = risk_factor_df[col].median()

    # Columns whose median is 0 are skipped: the IQR rule is not applied to them
    if col_median == 0:
        continue

    # Histogram with a marginal box plot of the current distribution
    out_dist = px.histogram(
        risk_factor_df,
        x=col,
        marginal="box",
        color_discrete_sequence=px.colors.sequential.thermal,
    )
    out_dist.update_layout(bargap=0.2, width=700)
    out_dist.show()

    # Tukey fences: 1.5 * IQR beyond the first and third quartiles
    Q1, Q3 = np.percentile(risk_factor_df[col], [25, 75])
    IQR = Q3 - Q1
    upper = Q3 + (1.5 * IQR)
    lower = Q1 - (1.5 * IQR)

    print(col)
    print("median: {}, upper fence: {}, lower fence: {}".format(col_median, upper, lower))

    # Above the upper fence: delete the values that occur only once there
    above_fence = np.flatnonzero(risk_factor_df[col] > upper)
    upper_to_delete = find_outliers(col, above_fence)
    delete_outliers(col, upper_to_delete)

    # Below the lower fence: same rule, evaluated on the frame as it stands
    # after the upper-fence deletions
    below_fence = np.flatnonzero(risk_factor_df[col] < lower)
    lower_to_delete = find_outliers(col, below_fence)
    delete_outliers(col, lower_to_delete)


Age
median: 26.0, upper fence: 51.0, lower fence: 3.0


Number of sexual partners
median: 2.0, upper fence: 4.5, lower fence: 0.5


First sexual intercourse
median: 17.0, upper fence: 22.5, lower fence: 10.5


Num of pregnancies
median: 2.0, upper fence: 6.0, lower fence: -2.0


Hormonal Contraceptives (years)
median: 0.5, upper fence: 7.5, lower fence: -4.5


In [13]:
# Report the shape of the cleaned frame (columns first, then rows)
n_rows, n_cols = risk_factor_df.shape
print("\nFinal dataset size: {} cols, {} rows".format(n_cols, n_rows))


Final dataset size: 31 cols, 694 rows


#### Dimensionality Reduction (PCA)

In [14]:
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the feature matrix.
# NOTE(review): feature_df was sliced in an earlier cell, before the column
# drops, missing-value handling, de-duplication and outlier removal were
# applied to risk_factor_df. Confirm that PCA is meant to run on those
# uncleaned features.
scaler = StandardScaler()
feature_std = scaler.fit_transform(feature_df)

# Replace remaining NaNs with the per-column mean of the standardized matrix
imputer = SimpleImputer(strategy='mean')
feature_std_imputed = imputer.fit_transform(feature_std)

# Full PCA (no component limit) on the imputed matrix
pca = PCA()
principal_components = pca.fit_transform(feature_std_imputed)

# Share of variance explained by each principal component
explained_variance_ratio = pca.explained_variance_ratio_

# Cumulative explained variance against the number of components kept
component_numbers = np.arange(len(explained_variance_ratio)) + 1
cumulative_variance = explained_variance_ratio.cumsum()
variance_trace = go.Scatter(
    x=component_numbers,
    y=cumulative_variance,
    mode='lines+markers',
    name='Explained Variance Ratio',
)

fig = go.Figure(data=[variance_trace])
fig.update_layout(
    title='Cumulative Explained Variance Ratio',
    xaxis_title='Number of Principal Components',
    yaxis_title='Cumulative Explained Variance Ratio',
    showlegend=True,
)

fig.show()


#### Handling Imbalanced Data

In [18]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.over_sampling import RandomOverSampler

In [19]:
# Class distribution of "Dx:Cancer" before resampling
dx_cancer = px.histogram(risk_factor_df, y="Dx:Cancer")
dx_cancer.update_layout(bargap=0.2, title="Imbalanced Classes")
dx_cancer.show()

In [None]:
# Target vector: an independent copy of the "Dx:Cancer" column
y = risk_factor_df["Dx:Cancer"].copy()

# Feature matrix: everything except the target and the derived "age_cat" column.
# NOTE(review): no visible cell creates "age_cat" before this point. Confirm it
# exists, otherwise this drop raises KeyError.
X = risk_factor_df.drop(columns=["Dx:Cancer", "age_cat"])

In [None]:
# Oversample the minority class with ADASYN (fixed seed for reproducibility)
adasyn = ADASYN(random_state=42)

# The feature matrix is bound to the name ``X`` (capital). The lowercase ``x``
# used previously is never defined and raised NameError.
x_adasyn, y_adasyn = adasyn.fit_resample(X, y)

# Re-attach the resampled target to the resampled features
risk_factor_df = x_adasyn.join(y_adasyn)

In [None]:
# Derive the "age_cat" column by applying the age_cat helper to every "Age" value.
# NOTE(review): age_cat is not defined in the visible part of this notebook.
# Confirm it is defined in an earlier cell before this one runs.
risk_factor_df["age_cat"] = risk_factor_df["Age"].apply(age_cat)

In [None]:
# Class distribution of "Dx:Cancer" after resampling
dx_cancer = px.histogram(risk_factor_df, y="Dx:Cancer")
dx_cancer.update_layout(bargap=0.2, title="Balanced Classes")
dx_cancer.show()