In [1]:
import os
import copy
import math

import pandas as pd
from tqdm.notebook import tqdm
from ydata_profiling import ProfileReport
import plotly.express as px

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Problem Exploration

Doctors have the complicated task of identifying disease(s) a patient may have from information they can collect. This information can come in the form of:

- Reported symptoms from patients (from a clinical interview).
- Clinical history.
- Conducting a physical exam.
- Conducting diagnostic tests:
  - Biopsy.
  - Colonoscopy.
  - CT scan.
  - Electrocardiogram (ECG).
  - ...
- Consulting with other clinicians.

## Problem as a ML Optimisation Objective

**Primary Goal:** Given a binary (multi-hot) vector of symptoms, find the most likely disease the patient has.

Future Goal: Given a binary (multi-hot) vector of symptoms, produce a set of diseases (possibly of size 1) that the patient is likely to have. This may require additional data (especially for evaluation).

## What does Success Look Like?

Today's CAD (Computer Aided Diagnosis) systems have been shown to achieve up to a 90% hit rate (sensitivity = TP / (TP + FN)), i.e. they correctly identify up to 90% of the patients who actually have the disease.

Models that target a single disease tend to perform better and can reach the high 90s (%) on most metrics (accuracy, sensitivity/recall, F1-score, etc.).

Models that have been used include:
- KNN
- ANN
- Decision Trees and Random Forests
- Genetic Algorithms
- Naive Bayes

Given the more limited dataset and compute power, if I can achieve sensitivity >= 80% I will consider that a success.

References:
- [Computer-aided diagnosis Wikipedia](https://en.wikipedia.org/wiki/Computer-aided_diagnosis#:~:text=Today's%20CAD%20systems%20cannot%20detect,a%20False%20Positive%20(FP))
- [Computer-aided diagnosis systems: a comparative study of classical machine learning versus deep learning-based approaches](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10205571/)

## Assumptions

- The patient has a disease in our set of diseases.
- All patients have a disease. There is no "no disease" prognosis.
- The patient is only experiencing symptoms in our set of symptoms.
- Making a diagnosis accurately only requires knowing if symptom(s) exist (binary exists or not) and not a metric of its extent/degree.

In [2]:
# Load the raw training split and discard the "Unnamed: 133" column.
train_df = pd.read_csv("./dataset/original/Training.csv").drop(columns="Unnamed: 133")

# Every column except the last one (the prognosis label) becomes a boolean flag.
train_df = train_df.astype({name: "bool" for name in train_df.columns[:-1]})

train_df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,True,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Fungal infection
1,False,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Fungal infection
2,True,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Fungal infection
3,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Fungal infection
4,True,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Fungal infection


In [3]:
# Row count of the training split.
train_df.shape[0]

4920

In [4]:
# Load the raw test split.
test_df = pd.read_csv("./dataset/original/Testing.csv")

# Every column except the last one (the prognosis label) becomes a boolean flag.
test_df = test_df.astype({name: "bool" for name in test_df.columns[:-1]})

test_df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,True,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Fungal infection
1,False,False,False,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Allergy
2,False,False,False,False,False,False,False,True,True,True,...,False,False,False,False,False,False,False,False,False,GERD
3,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Chronic cholestasis
4,True,True,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,Drug Reaction


In [5]:
# Row count of the test split.
test_df.shape[0]

42

In [6]:
# Distinct prognosis labels per split (train first, then test).
for split_df in (train_df, test_df):
    print("Number of unique diseases:", split_df['prognosis'].nunique())

# Labels that occur in exactly one of the two splits; an empty set means they match.
print(
    "Non common prognosis:",
    set(train_df['prognosis']).symmetric_difference(test_df['prognosis']),
)

Number of unique diseases: 41
Number of unique diseases: 41
Non common prognosis: set()


In [7]:
# Tally how many training rows carry each prognosis label.
prognosis_count = train_df.loc[:, 'prognosis'].value_counts()
prognosis_count

prognosis
Fungal infection                           120
Hepatitis C                                120
Hepatitis E                                120
Alcoholic hepatitis                        120
Tuberculosis                               120
Common Cold                                120
Pneumonia                                  120
Dimorphic hemmorhoids(piles)               120
Heart attack                               120
Varicose veins                             120
Hypothyroidism                             120
Hyperthyroidism                            120
Hypoglycemia                               120
Osteoarthristis                            120
Arthritis                                  120
(vertigo) Paroymsal  Positional Vertigo    120
Acne                                       120
Urinary tract infection                    120
Psoriasis                                  120
Hepatitis D                                120
Hepatitis B                                120
All

In [8]:
# The classes are balanced when every prognosis has the same row count,
# i.e. the counts collapse to a single distinct value.
print("All prognosises are equally distributed:", prognosis_count.nunique() == 1)

All prognosises are equally distributed: True


In [9]:
# Per-disease hold-out size: 20% of the training rows (after discounting the rows
# already in the test split), spread evenly over the diseases and rounded up.
# The counts are derived from the loaded frames rather than hard-coded, so this
# cell must run before the cells that move rows from train_df into test_df.
n_diseases = train_df['prognosis'].nunique()
print(
    "Number of instances to move to test set per disease:",
    math.ceil(((len(train_df) - len(test_df)) * 0.2) / n_diseases),
)

Number of instances to move to test set per disease: 24


In [10]:
# Draw 24 rows per prognosis from the training split, using a fixed seed so the
# same rows are picked on every run.
sample_dfs = [
    train_df[train_df["prognosis"] == label].sample(24, random_state=42)
    for label in train_df['prognosis'].unique()
]

# Index labels of every sampled row, in sampling order.
indexes_to_remove = [idx for part in sample_dfs for idx in part.index.to_list()]

In [11]:
# Append the sampled training rows to the test split (original index labels are kept).
test_df = pd.concat([test_df, *sample_dfs])

In [12]:
# Remove the sampled rows from the training split. The result is reassigned
# instead of using inplace=True, so the step reads as an explicit
# "new frame from old frame" transformation.
train_df = train_df.drop(index=indexes_to_remove)

In [13]:
# Persist the training split. The output folder is created first so that a
# fresh checkout without ./dataset/clean does not fail on write.
os.makedirs("./dataset/clean", exist_ok=True)
train_df.to_csv("./dataset/clean/train.csv", index=False)

In [14]:
# Persist the test split. The output folder is created first so that a
# fresh checkout without ./dataset/clean does not fail on write.
os.makedirs("./dataset/clean", exist_ok=True)
test_df.to_csv("./dataset/clean/test.csv", index=False)

In [15]:
# Build the profiling report only once; an existing HTML file acts as the cache.
report_path = "symptoms-prognosis-datase-report.html"

if not os.path.exists(report_path):
    profile = ProfileReport(
        train_df,
        title="Symptoms Prognosis Dataset Report",
        explorative=True,
    )
    profile.to_file(report_path)

# Model Building

## Feature Engineering

- Using a LLM to categorise symptoms into:
  - Diagnostic test required for symptoms (self-reporting, diagnostic test: ECG, CT Scan etc)
- Correlation of one symptom with another to try to find unidentified symptoms
  - This would be useful to know to predict the success of a Naive Bayes model

In [16]:
# Verify that train and test expose the same columns in the same order.
# Nothing is printed when the two column lists match.

# A differing column count is reported up front: positional indexing would
# raise IndexError if test had fewer columns, and extra test columns would
# otherwise go unnoticed.
if len(train_df.columns) != len(test_df.columns):
    print("Column count differs:", len(train_df.columns), len(test_df.columns))

# Report every position at which the two column names disagree.
for train_col, test_col in zip(train_df.columns, test_df.columns):
    if train_col != test_col:
        print(train_col, test_col)

In [17]:
class TwoKeyDict:
    """Mapping addressed by an unordered pair of keys.

    ``(key1, key2)`` and ``(key2, key1)`` refer to the same entry. Entries are
    kept in ``self.dictionary`` under whichever ordering was used when the pair
    was first added.
    """

    def __init__(self) -> None:
        # Plain dict keyed by 2-tuples; at most one ordering of a pair is stored.
        self.dictionary = {}

    def get_value(self, key1, key2):
        """Return the value stored for the pair in either order, or ``None`` if absent."""
        if (key1, key2) in self.dictionary:
            return self.dictionary[(key1, key2)]

        if (key2, key1) in self.dictionary:
            return self.dictionary[(key2, key1)]

        return None

    def add_key(self, key1, key2, value):
        """Insert or overwrite the value for the pair, keeping one entry per pair."""
        if (key2, key1) in self.dictionary:
            # The pair already exists under the reversed ordering: update that
            # entry and stop, otherwise a duplicate (key1, key2) entry would be
            # created alongside it.
            self.dictionary[(key2, key1)] = value
            return

        self.dictionary[(key1, key2)] = value

In [18]:
# Pairwise symptom agreement, cached on disk in var_corr_df.csv.
# NOTE: despite the "corr" column name, the score is not a correlation
# coefficient. It is the fraction of training rows in which the two columns
# hold the same value (both True or both False).
var_corr_df = None

if not os.path.exists("var_corr_df.csv"):
    cols = train_df.columns[:-1]  # every column except the trailing prognosis label

    # Matrix of shape (rows, len(cols)); the feature columns were cast to bool
    # when the data was loaded, so the entries are 0/1. Two matrix products
    # count, for every column pair at once, the rows where both entries are 1
    # and the rows where both are 0. This replaces a row-by-row Python loop
    # over all column pairs.
    present = train_df[cols].to_numpy(dtype="int64")
    absent = 1 - present
    agreement = (present.T @ present + absent.T @ absent) / len(train_df)

    # Emit each unordered pair once (j < k), first column varying slowest.
    var_corr_df = pd.DataFrame(
        [
            [cols[j], cols[k], agreement[j, k]]
            for j in range(len(cols))
            for k in range(j + 1, len(cols))
        ],
        columns=["var1", "var2", "corr"],
    )
    var_corr_df.to_csv("var_corr_df.csv", index=False)
else:
    var_corr_df = pd.read_csv("var_corr_df.csv")

  0%|          | 0/3936 [00:00<?, ?it/s]

In [19]:
# Distribution of the pairwise scores stored in the "corr" column. The title
# and axis label let the figure be read on its own when the notebook is skimmed.
fig = px.histogram(
    var_corr_df,
    x="corr",
    nbins=100,
    title="Distribution of pairwise symptom agreement",
    labels={"corr": "Fraction of rows where the two symptoms agree"},
)
fig.show()