In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load the metadata

In [None]:
# Read the metadata spreadsheet, skipping its first row, and preview the result.
metadata = pd.read_excel(
    './data/metadata.xlsx',
    skiprows=1,
)
metadata.head()

# 2. Select the relevant columns

In [None]:
# Columns kept for the analysis: the case identifier columns, the five scored
# findings, and the final Swede score.
selected_columns = [
    'Case Number',
    'CaseID',
    'Aceto uptake',
    'Margins',
    'Vessels',
    'Lesion size',
    'Iodine uptake',
    'SwedeFinal',
]

# Take an explicit copy: later cells write into `selected_metadata`, and
# without .copy() pandas (< 3.0) still tracks it as derived from `metadata`
# and emits SettingWithCopyWarning on those writes.
selected_metadata = metadata[selected_columns].copy()
selected_metadata.head()

# 3. Exploring metadata

In [None]:
# Count the missing entries in each selected column.
selected_metadata.isna().sum()

In [None]:
# Fill missing categorical values with each column's most frequent label.
categorical_columns = ['Aceto uptake', 'Margins', 'Vessels', 'Lesion size', 'Iodine uptake']

fill_values = {}
for col in categorical_columns:
    col_modes = selected_metadata[col].mode()
    # A column with no observed values has no mode; leave it untouched so the
    # null count below still reports it instead of failing on an empty result.
    if not col_modes.empty:
        fill_values[col] = col_modes.iloc[0]

# Rebind to the filled frame rather than filling in place, so the fill never
# writes into a frame that pandas may still track as derived from `metadata`.
selected_metadata = selected_metadata.fillna(fill_values)

selected_metadata.isnull().sum()

In [None]:
# Define mappings according to SwedeScore interpretation table.
# Each dictionary translates a column's text label into its numeric score.
aceto_mapping = {
    'Nil or transparent': 0,
    'Thin, milky': 1,
    'Distinct, stearin': 2
}

margins_mapping = {
    'Nil or diffuse': 0,
    'Sharp but irregular, jagged, satellites': 1,
    'Sharp and even, difference in level': 2
}

# Note the order: 'Fine, regular' scores 0 and 'Absent' scores 1.
vessels_mapping = {
    'Absent': 1,
    'Fine, regular': 0,
    'Coarse or atypical vessels': 2
}

lesion_size_mapping = {
    '< 5 mm': 0,
    '5-15 mm or 2 quadrants': 1,
    '>15 mm, 3-4 quadrants, or endocervically undefined': 2
}

# 'Unknown' is mapped to NaN explicitly so it is treated as a missing value.
iodine_mapping = {
    'Brown': 0,
    'Faintly or patchy yellow': 1,
    'Distinctly yellow': 2,
    'Unknown': np.nan,
}

# Column name -> mapping used to convert that column.
score_mappings = {
    'Aceto uptake': aceto_mapping,
    'Margins': margins_mapping,
    'Vessels': vessels_mapping,
    'Lesion size': lesion_size_mapping,
    'Iodine uptake': iodine_mapping,
}

# Apply mappings to convert categorical variables to numerical.
# Any label that is not a key of its mapping becomes NaN. For the same reason
# this cell must run only once per fresh load: values that are already numeric
# are not keys of the mappings and would be turned into NaN.
# `assign` returns a new frame, so nothing is written into a frame that pandas
# may still track as derived from `metadata`.
selected_metadata = selected_metadata.assign(
    **{
        column: selected_metadata[column].map(mapping)
        for column, mapping in score_mappings.items()
    }
)

In [None]:
## Fill missing 'Iodine uptake' with the mode
iodine_modes = selected_metadata['Iodine uptake'].mode()
# If every value is missing there is no mode to fill with; leave the column
# as-is rather than failing on an empty result.
if not iodine_modes.empty:
    mode_iodine = iodine_modes.iloc[0]
    # Rebind to the filled frame instead of filling in place.
    selected_metadata = selected_metadata.fillna({'Iodine uptake': mode_iodine})

In [None]:
# Preview the first five rows of the working frame.
selected_metadata.head(n=5)

In [None]:
# Flag rows whose SwedeFinal is below 0 or above 10.
swede_final = selected_metadata['SwedeFinal']
out_of_range = swede_final.lt(0) | swede_final.gt(10)
invalid_indices = selected_metadata.loc[out_of_range].index
print(f"Number of invalid SwedeFinal entries: {len(invalid_indices)}")

In [None]:
# Bar chart of SwedeFinal counts; bars follow the value_counts() order of the
# loaded metadata (most frequent score first).
class_counts = metadata['SwedeFinal'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=selected_metadata, x='SwedeFinal', order=class_counts.index, ax=ax)
ax.set_title('Distribution of SwedeFinal Values (Before Fixing)')
ax.set_xlabel('SwedeFinal Score')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Columns whose numeric scores add up to the SwedeFinal score.
score_columns = [
    'Aceto uptake',
    'Margins',
    'Vessels',
    'Lesion size',
    'Iodine uptake',
]

# For the rows flagged as invalid, replace SwedeFinal with the row-wise sum of
# the component scores.
recalculated_scores = selected_metadata.loc[invalid_indices, score_columns].sum(axis=1)
selected_metadata.loc[invalid_indices, 'SwedeFinal'] = recalculated_scores

In [None]:
# Re-run the range check to confirm the recalculation left no score below 0
# or above 10.
swede_final_after = selected_metadata['SwedeFinal']
out_of_range_after = swede_final_after.lt(0) | swede_final_after.gt(10)
invalid_indices_after = selected_metadata.loc[out_of_range_after].index
print(f"Number of invalid SwedeFinal entries after recalculation: {len(invalid_indices_after)}")

In [None]:
# Count from the corrected frame, not the raw `metadata`: the bar order must
# cover the recalculated scores and drop the invalid values that were replaced.
class_counts = selected_metadata['SwedeFinal'].value_counts()
# Plot histogram
plt.figure(figsize=(10, 6))
sns.countplot(data=selected_metadata, x='SwedeFinal', order=class_counts.index)
# This plot follows the recalculation, so label it accordingly.
plt.title('Distribution of SwedeFinal Values (After Fixing)')
plt.xlabel('SwedeFinal Score')
plt.ylabel('Count')
plt.show()

In [None]:
# Persist the cleaned frame (index included) as CSV for later use.
output_path = './data/source.csv'
selected_metadata.to_csv(output_path)