# Thyroid Disease Data

In [1]:
import pandas as pd
# Opt in to pandas' future "no silent downcasting" behaviour (see the pandas options docs).
pd.set_option('future.no_silent_downcasting', True)

# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("thyroidDF.csv")

## Data feature engineering

### Initial columns

These are all the columns present in the dataset:

1. age - age of the patient (int)
2. sex - sex the patient identifies as (str)
3. on_thyroxine - whether patient is on thyroxine (bool)
4. query_on_thyroxine - *whether patient is on thyroxine (bool)
5. on_antithyroid_meds - whether patient is on antithyroid meds (bool)
6. sick - whether patient is sick (bool)
7. pregnant - whether patient is pregnant (bool)
8. thyroid_surgery - whether patient has undergone thyroid surgery (bool)
9. I131_treatment - whether patient is undergoing I131 treatment (bool)
10. query_hypothyroid - whether patient believes they have hypothyroid (bool)
11. query_hyperthyroid - whether patient believes they have hyperthyroid (bool)
12. lithium - whether patient * lithium (bool)
13. goitre - whether patient has goitre (bool)
14. tumor - whether patient has tumor (bool)
15. hypopituitary - whether patient * hypopituitary gland (bool)
16. psych - whether patient * psych (bool)
17. TSH_measured - whether TSH was measured in the blood (bool)
18. TSH - TSH level in blood from lab work (float)
19. T3_measured - whether T3 was measured in the blood (bool)
20. T3 - T3 level in blood from lab work (float)
21. TT4_measured - whether TT4 was measured in the blood (bool)
22. TT4 - TT4 level in blood from lab work (float)
23. T4U_measured - whether T4U was measured in the blood (bool)
24. T4U - T4U level in blood from lab work (float)
25. FTI_measured - whether FTI was measured in the blood (bool)
26. FTI - FTI level in blood from lab work (float)
27. TBG_measured - whether TBG was measured in the blood (bool)
28. TBG - TBG level in blood from lab work (float)
29. referral_source - (str)
30. target - thyroid-related medical diagnosis code(s), see the list below (str)
31. patient_id - unique id of the patient (str)


### Targets from dataset

These are the initial targets defined in df:
1. hyperthyroid conditions:
- A   hyperthyroid
- B   T3 toxic
- C   toxic goitre
- D   secondary toxic
  
2. hypothyroid conditions:
- E   hypothyroid
- F   primary hypothyroid
- G   compensated hypothyroid
- H   secondary hypothyroid

3. binding protein:
- I   increased binding protein
- J   decreased binding protein

4. general health:
- K   concurrent non-thyroidal illness

5. replacement therapy:
- L   consistent with replacement therapy
- M   underreplaced
- N   overreplaced

6. antithyroid treatment:
- O   antithyroid drugs
- P   I131 treatment
- Q   surgery

7. miscellaneous:
- R   discordant assay results
- S   elevated TBG
- T   elevated thyroid hormones

Since there is little data, we focus on distinguishing patients with <b>hyperthyroid</b> conditions, <b>hypothyroid</b> conditions, or <b>none of these</b>.

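The original diagnosis codes therefore need to be converted to these new <b>targets</b>. Rows whose diagnosis falls into none of the three groups are dropped.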

In [2]:
# Diagnosis codes that make up each of the three study labels.
negative_targets = ["-"]
hyperthyroid_targets = ["A", "B", "C", "D"]
hypothyroid_targets = ["E", "F", "G", "H"]

def convert_to_new_target(old_target):
    """Collapse a raw diagnosis string into one of the three study labels.

    The groups are tried in a fixed order (negative, hyperthyroid,
    hypothyroid); the first group with any code occurring as a substring of
    ``old_target`` decides the label.

    Args:
        old_target: raw diagnosis string from the dataset (e.g. "-", "A", "GK").

    Returns:
        "negative", "hyperthyroid" or "hypothyroid"; ``None`` when no code of
        any group occurs in ``old_target``.
    """
    label_groups = (
        ("negative", negative_targets),
        ("hyperthyroid", hyperthyroid_targets),
        ("hypothyroid", hypothyroid_targets),
    )
    for label, codes in label_groups:
        for code in codes:
            if code in old_target:
                return label
    return None

# Relabel every row, then discard the rows whose diagnosis mapped to None
# (i.e. belongs to none of the three study labels).
df = (
    df
    .assign(target=df["target"].map(convert_to_new_target))
    .dropna(subset=["target"])
)

In [3]:
# Class distribution of the collapsed target labels.
df["target"].value_counts()

target
negative        6771
hypothyroid      667
hyperthyroid     241
Name: count, dtype: int64

### Removing redundant columns
Columns that hold artificial values, and the `*_measured` flags, whose information is already represented by an empty value in the associated measurement column.

In [4]:
# Columns to discard: the *_measured flags, the referral source and the
# patient identifier.
rem_columns = [
    'TSH_measured',
    'T3_measured',
    'TT4_measured',
    'T4U_measured',
    'FTI_measured',
    'TBG_measured',
    'referral_source',
    'patient_id',
]
df = df.drop(columns=rem_columns)

In [5]:
# List the columns that remain in df.
df.columns

Index(['age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_meds', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH', 'T3', 'TT4', 'T4U',
       'FTI', 'TBG', 'target'],
      dtype='object')

### Data description

In [6]:
# Summary statistics of the numeric columns, transposed so each column is a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,7679.0,77.640839,1293.909497,1.0,37.0,55.0,67.0,65526.0
TSH,6955.0,5.500684,25.978304,0.005,0.55,1.4,2.7,530.0
T3,5470.0,2.010773,0.818738,0.05,1.6,1.9,2.3,18.0
TT4,7325.0,105.497565,33.125317,2.0,87.0,103.0,121.0,430.0
T4U,6998.0,0.967297,0.164388,0.19,0.87,0.96,1.06,2.12
FTI,7005.0,110.941312,37.167537,1.4,93.0,108.0,125.0,839.0
TBG,259.0,22.955019,6.088392,0.1,20.0,23.0,27.0,45.0


Even without deep analysis we can see there is a problem with the age column, since its max is 65526.

In [7]:
# Show the seven largest ages (with their label) to inspect the outliers.
df[["age", "target"]].sort_values("age").tail(7)

Unnamed: 0,age,target
7355,97,negative
790,97,negative
7356,97,negative
2976,455,negative
5710,65511,negative
6392,65512,negative
8105,65526,negative


In [8]:
# Keep only rows with an age below 100; the larger values seen above
# (455 and the 655xx entries) are not plausible ages.
# .copy() turns the filtered result into an independent frame so that the
# in-place edits and column assignments in later cells do not raise
# SettingWithCopyWarning.
df = df[df['age'] < 100].copy()

In [9]:
# Re-check the numeric summary after filtering out the implausible ages.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,7675.0,52.013029,18.654684,1.0,37.0,55.0,67.0,97.0
TSH,6951.0,5.503416,25.985525,0.005,0.55,1.4,2.7,530.0
T3,5467.0,2.010633,0.818893,0.05,1.6,1.9,2.3,18.0
TT4,7321.0,105.490324,33.132392,2.0,87.0,103.0,121.0,430.0
T4U,6994.0,0.967268,0.16441,0.19,0.87,0.96,1.06,2.12
FTI,7001.0,110.937565,37.176408,1.4,93.0,108.0,125.0,839.0
TBG,259.0,22.955019,6.088392,0.1,20.0,23.0,27.0,45.0


In [10]:
# Summary (count / unique / top / freq) of the object-dtype columns.
df.describe(include='O').T

Unnamed: 0,count,unique,top,freq
sex,7421,2,F,5006
on_thyroxine,7675,2,f,6825
query_on_thyroxine,7675,2,f,7552
on_antithyroid_meds,7675,2,f,7583
sick,7675,2,f,7388
pregnant,7675,2,f,7635
thyroid_surgery,7675,2,f,7569
I131_treatment,7675,2,f,7534
query_hypothyroid,7675,2,f,7155
query_hyperthyroid,7675,2,f,7115


In [11]:
# Percentage of missing values for every column that contains at least one null.
null_cols = df.columns[df.isna().any()]
missing_pct = df[null_cols].isna().sum() / len(df) * 100
nulls_df = missing_pct.to_frame(name='Missing Values %')
nulls_df

Unnamed: 0,Missing Values %
sex,3.309446
TSH,9.433225
T3,28.76873
TT4,4.612378
T4U,8.872964
FTI,8.781759
TBG,96.625407


Drop TBG since almost the whole column is missing (about 96.6% of the rows).

In [12]:
# Remove the TBG column from df.
df = df.drop(columns=["TBG"])

In [13]:
# Display df after dropping TBG.
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,target
0,29,F,f,f,f,f,f,f,f,t,...,f,f,f,f,0.3,,,,,negative
1,29,F,f,f,f,f,f,f,f,f,...,f,f,f,f,1.6,1.9,128.0,,,negative
2,41,F,f,f,f,f,f,f,f,f,...,f,f,f,f,,,,,,negative
3,36,F,f,f,f,f,f,f,f,f,...,f,f,f,f,,,,,,negative
5,60,F,f,f,f,f,f,f,f,f,...,f,f,f,f,,,,,,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9166,70,F,f,f,f,f,f,f,f,f,...,f,f,f,f,,,88.0,0.74,119.0,negative
9167,56,M,f,f,f,f,f,f,f,f,...,f,f,f,f,,,64.0,0.83,77.0,negative
9168,22,M,f,f,f,f,f,f,f,f,...,f,f,f,f,,,91.0,0.92,99.0,negative
9170,47,F,f,f,f,f,f,f,f,f,...,f,f,f,f,,,75.0,0.85,88.0,negative


In [14]:
# Encode the t/f flags and the M/F sex values as integers in a single pass.
df.replace({'f': 0, 't': 1, 'M': 0, 'F': 1}, inplace=True)

# Integer code for each of the three target labels.
target_map = {"negative": 0, "hyperthyroid": 1, "hypothyroid": 2}

df["target"] = df["target"].map(target_map)

In [15]:
# Missing-value percentages per column, recomputed after the numeric encoding.
null_cols = df.columns[df.isna().any()]
missing_pct = df[null_cols].isna().sum() / len(df) * 100
nulls_df = missing_pct.to_frame(name='Missing Values %')
nulls_df

Unnamed: 0,Missing Values %
sex,3.309446
TSH,9.433225
T3,28.76873
TT4,4.612378
T4U,8.872964
FTI,8.781759


In [16]:
# Display df after the numeric encoding.
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,target
0,29,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0.3,,,,,0
1,29,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1.6,1.9,128.0,,,0
2,41,1,0,0,0,0,0,0,0,0,...,0,0,0,0,,,,,,0
3,36,1,0,0,0,0,0,0,0,0,...,0,0,0,0,,,,,,0
5,60,1,0,0,0,0,0,0,0,0,...,0,0,0,0,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9166,70,1,0,0,0,0,0,0,0,0,...,0,0,0,0,,,88.0,0.74,119.0,0
9167,56,0,0,0,0,0,0,0,0,0,...,0,0,0,0,,,64.0,0.83,77.0,0
9168,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,,,91.0,0.92,99.0,0
9170,47,1,0,0,0,0,0,0,0,0,...,0,0,0,0,,,75.0,0.85,88.0,0


In [19]:
# Missing-value percentages per column (same report as the earlier cells).
null_cols = df.columns[df.isna().any()]
missing_pct = df[null_cols].isna().sum() / len(df) * 100
nulls_df = missing_pct.to_frame(name='Missing Values %')
nulls_df

Unnamed: 0,Missing Values %
sex,3.309446
TSH,9.433225
T3,28.76873
TT4,4.612378
T4U,8.872964
FTI,8.781759


In [41]:
from functools import lru_cache 

def get_impute_function(df, col, target):
    match col:
        case ["age", "TSH", "T3", "TT4", "T4U", "FTI", "target"]:
            median_for_target = df.loc[df['target'] == target][col].median()
            return lambda: median_for_target
        case _:
            return lambda: -12312

def imputed_df(df):
    """Fill the missing values of ``df`` in place, one column at a time.

    For every column containing nulls, each missing cell is replaced by the
    value produced by ``get_impute_function(df, column, target)()``, where
    ``target`` is the target class of that row. Cells that already hold a
    value are left untouched; rows whose ``target`` is itself missing are
    not imputed.

    Args:
        df: DataFrame with a ``target`` column; it is modified in place.

    Returns:
        The same DataFrame object, after imputation.
    """
    null_cols = df.columns[df.isnull().any()]
    for null_col in null_cols:
        print(f"Imputing column: {null_col}")

        missing = df[null_col].isna()
        # Only the target classes that actually have a gap in this column.
        for target in df.loc[missing, "target"].dropna().unique():
            # Pass the column *name* and call the returned closure to get the value.
            fill_value = get_impute_function(df, null_col, target)()
            df.loc[missing & (df["target"] == target), null_col] = fill_value
    return df

# Spot-check: impute value for the TSH column within target class 1 (hyperthyroid in target_map).
get_impute_function(df, "TSH", 1)()

-12312