In [1]:
import pandas as pd
import numpy as np
import numba as nb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
column_names = pd.read_csv("allhypo-edit.names.txt")

In [3]:
list(column_names)

['age',
 ' sex',
 ' on thyroxine',
 ' query on thyroxine',
 ' on antithyroid medication',
 ' sick',
 ' pregnant',
 ' thyroid surgery',
 ' I131 treatment',
 ' query hypothyroid',
 ' query hyperthyroid',
 ' lithium',
 ' goitre',
 ' tumor',
 ' hypopituitary',
 ' psych',
 ' TSH measured',
 ' TSH',
 ' T3 measured',
 ' T3',
 ' TT4 measured',
 ' TT4',
 ' T4U measured',
 ' T4U',
 ' FTI measured',
 ' FTI',
 ' TBG measured',
 ' TBG',
 ' referral source',
 ' classes']

In [4]:
names = ['age',
 'sex',
 'on thyroxine',
 'query on thyroxine',
 'on antithyroid medication',
 'sick',
 'pregnant',
 'thyroid surgery',
 'I131 treatment',
 'query hypothyroid',
 'query hyperthyroid',
 'lithium',
 'goitre',
 'tumor',
 'hypopituitary',
 'psych',
 'TSH measured',
 'TSH',
 'T3 measured',
 'T3',
 'TT4 measured',
 'TT4',
 'T4U measured',
 'T4U',
 'FTI measured',
 'FTI',
 'TBG measured',
 'TBG',
 'referral source',
 'classes']

In [5]:
data = pd.read_csv("allhypo.data.txt", header=None, names=names)

In [6]:
data.columns = names

In [7]:
data.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,classes
0,41,F,f,f,f,f,f,f,f,f,...,t,125,t,1.14,t,109,f,?,SVHC,negative.|3733
1,23,F,f,f,f,f,f,f,f,f,...,t,102,f,?,f,?,f,?,other,negative.|1442
2,46,M,f,f,f,f,f,f,f,f,...,t,109,t,0.91,t,120,f,?,other,negative.|2965
3,70,F,t,f,f,f,f,f,f,f,...,t,175,f,?,f,?,f,?,other,negative.|806
4,70,F,f,f,f,f,f,f,f,f,...,t,61,t,0.87,t,70,f,?,SVI,negative.|2807


In [8]:
hypotest = pd.read_csv("allhypo.test.txt",header=None, names=names)

In [9]:
hypotest.columns = names

In [10]:
hypotest.head(5)

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,classes
0,35,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,f,?,other,negative.|219
1,63,M,f,f,f,f,f,f,f,f,...,t,108,t,0.96,t,113,f,?,SVI,negative.|2059
2,25,F,f,f,f,f,f,f,f,f,...,t,61,t,0.82,t,75,f,?,SVHD,negative.|399
3,53,F,f,f,f,f,f,f,f,t,...,t,145,t,1.03,t,141,f,?,other,negative.|1911
4,92,F,f,f,f,f,f,f,f,f,...,t,120,t,0.84,t,143,f,?,SVI,negative.|487


Split "classes" on ".|" to get class label 

In [11]:
 data_class = data["classes"].str.split('.\|', 1, expand=True).rename(columns={0:'classes', 1:'id?'})

In [12]:
data_class.head()

Unnamed: 0,classes,id?
0,negative,3733
1,negative,1442
2,negative,2965
3,negative,806
4,negative,2807


In [13]:
data['classes'] = data_class['classes']

In [14]:
data.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,classes
0,41,F,f,f,f,f,f,f,f,f,...,t,125,t,1.14,t,109,f,?,SVHC,negative
1,23,F,f,f,f,f,f,f,f,f,...,t,102,f,?,f,?,f,?,other,negative
2,46,M,f,f,f,f,f,f,f,f,...,t,109,t,0.91,t,120,f,?,other,negative
3,70,F,t,f,f,f,f,f,f,f,...,t,175,f,?,f,?,f,?,other,negative
4,70,F,f,f,f,f,f,f,f,f,...,t,61,t,0.87,t,70,f,?,SVI,negative


In [15]:
data.to_csv("data.csv", encoding='utf-8', index=False)

In [16]:
# split "classes" on ".|" to get class label on test data too
test_class = hypotest["classes"].str.split('.\|', 1, expand=True).rename(columns={0:'classes', 1:'id?'})

In [17]:
test_class.head()

Unnamed: 0,classes,id?
0,negative,219
1,negative,2059
2,negative,399
3,negative,1911
4,negative,487


In [18]:
hypotest['classes'] = test_class['classes']

In [19]:
hypotest.head(3)

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,classes
0,35,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,f,?,other,negative
1,63,M,f,f,f,f,f,f,f,f,...,t,108,t,0.96,t,113,f,?,SVI,negative
2,25,F,f,f,f,f,f,f,f,f,...,t,61,t,0.82,t,75,f,?,SVHD,negative


In [20]:
hypotest.to_csv("data_test.csv", encoding='utf-8', index=False)

In [21]:
# 'TBG' contains no value in all rows
data['TBG'].describe()

count     2800
unique       1
top          ?
freq      2800
Name: TBG, dtype: object

In [22]:
# 'TBG_measured' contains f in all rows
data['TBG measured'].describe()

count     2800
unique       1
top          f
freq      2800
Name: TBG measured, dtype: object

In [23]:
# drop 'TBG' and TBG_measured' columns. 
data.drop(labels=['TBG', 'TBG measured'], axis=1, index=None, columns=None, level=None, inplace=True, errors='raise')

In [24]:
data.head(2)

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,referral source,classes
0,41,F,f,f,f,f,f,f,f,f,...,t,2.5,t,125,t,1.14,t,109,SVHC,negative
1,23,F,f,f,f,f,f,f,f,f,...,t,2.0,t,102,f,?,f,?,other,negative


In [25]:
# 'FTI' column has 295 missing value
data['FTI'].describe()

count     2800
unique     210
top          ?
freq       295
Name: FTI, dtype: object

About 10% "FTI" data are missing. Fill the missing data with mean value. Other numeric columns also have relatively small % of missing values

In [26]:
# create dictionary for replacing missing value as np.nan, "f" as 0, "t" as 1, "F" as 0, "M" as 1
repl = {"?": np.nan, "f":0, "t": 1, "F": 0, "M":1}

In [27]:
# replacing values as mentioned above
data.replace(to_replace=repl, value=None, inplace=True, limit=None, regex=False, method='pad')

In [28]:
data.head(9).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8
age,41,23,46,70,70,18,59,80,66
sex,0,0,1,0,0,0,0,0,0
on thyroxine,0,0,0,1,0,1,0,0,0
query on thyroxine,0,0,0,0,0,0,0,0,0
on antithyroid medication,0,0,0,0,0,0,0,0,0
sick,0,0,0,0,0,0,0,0,0
pregnant,0,0,0,0,0,0,0,0,0
thyroid surgery,0,0,0,0,0,0,0,0,0
I131 treatment,0,0,0,0,0,0,0,0,0
query hypothyroid,0,0,0,0,0,0,0,0,0


In [29]:
rename_class_labels = {'negative': 0, 'primary hypothyroid': 1,
      'compensated hypothyroid':2, 'secondary hypothyroid': 3 }

In [30]:
data['classes'].replace(to_replace=rename_class_labels, inplace=True)

In [31]:
# change numeric data type from string to float
tofloat_columns = ["age", "sex", "TSH", "T3", "TT4", "T4U", "FTI"]
data[tofloat_columns] = data[tofloat_columns].astype(np.float32)

In [32]:
fillna_columns = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']

In [33]:
fillna_ave_nums = np.around(data[fillna_columns].mean(), 3)

In [34]:
fillna_ave_nums

age     51.844002
TSH      4.672000
T3       2.025000
TT4    109.071999
T4U      0.998000
FTI    110.788002
dtype: float32

In [35]:
for i in fillna_columns: 
    data.loc[np.isnan(data[i]), i] = fillna_ave_nums[i]

In [36]:
data.loc[np.isnan(data['T3']), "T3"] = fillna_ave_nums['T3']

In [37]:
data.head(10)

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,referral source,classes
0,41.0,0.0,0,0,0,0,0,0,0,0,...,1,2.5,1,125.0,1,1.14,1,109.0,SVHC,0
1,23.0,0.0,0,0,0,0,0,0,0,0,...,1,2.0,1,102.0,0,0.998,0,110.788002,other,0
2,46.0,1.0,0,0,0,0,0,0,0,0,...,0,2.025,1,109.0,1,0.91,1,120.0,other,0
3,70.0,0.0,1,0,0,0,0,0,0,0,...,1,1.9,1,175.0,0,0.998,0,110.788002,other,0
4,70.0,0.0,0,0,0,0,0,0,0,0,...,1,1.2,1,61.0,1,0.87,1,70.0,SVI,0
5,18.0,0.0,1,0,0,0,0,0,0,0,...,0,2.025,1,183.0,1,1.3,1,141.0,other,0
6,59.0,0.0,0,0,0,0,0,0,0,0,...,0,2.025,1,72.0,1,0.92,1,78.0,other,0
7,80.0,0.0,0,0,0,0,0,0,0,0,...,1,0.6,1,80.0,1,0.7,1,115.0,SVI,0
8,66.0,0.0,0,0,0,0,0,0,0,0,...,1,2.2,1,123.0,1,0.93,1,132.0,SVI,0
9,68.0,1.0,0,0,0,0,0,0,0,0,...,1,1.6,1,83.0,1,0.89,1,93.0,SVI,0


In [39]:
hypotest.drop(labels=['TBG', 'TBG measured'], axis=1, index=None, columns=None, level=None, inplace=True, errors='raise')

In [40]:
hypotest.replace(to_replace=repl, value=None, inplace=True, limit=None, regex=False, method='pad')

In [41]:
hypotest[fillna_columns] = hypotest[fillna_columns].astype(np.float32)

In [42]:
for i in fillna_columns: 
    hypotest.loc[np.isnan(hypotest[i]), i] = fillna_ave_nums[i]

In [43]:
hypotest.head().transpose()

Unnamed: 0,0,1,2,3,4
age,35,63,25,53,92
sex,0,1,0,0,0
on thyroxine,0,0,0,0,0
query on thyroxine,0,0,0,0,0
on antithyroid medication,0,0,0,0,0
sick,0,0,0,0,0
pregnant,0,0,0,0,0
thyroid surgery,0,0,0,0,0
I131 treatment,0,0,0,0,0
query hypothyroid,0,0,0,1,0


In [44]:
data['referral source'].unique()

array(['SVHC', 'other', 'SVI', 'STMW', 'SVHD'], dtype=object)

In [45]:
hypotest['referral source'].unique()

array(['other', 'SVI', 'SVHD', 'SVHC', 'STMW'], dtype=object)

In [46]:
ref_source = {'other': 0, 'SVI': 1, 'SVHD': 2, 'SVHC': 3, 'STMW': 4}

In [47]:
data['referral source'].replace(to_replace=ref_source, value=None, inplace=True, limit=None, regex=False, method='pad')

In [48]:
hypotest['referral source'].replace(to_replace=ref_source, value=None, inplace=True, limit=None, regex=False, method='pad')

In [49]:
data.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,referral source,classes
0,41.0,0.0,0,0,0,0,0,0,0,0,...,1,2.5,1,125.0,1,1.14,1,109.0,3,0
1,23.0,0.0,0,0,0,0,0,0,0,0,...,1,2.0,1,102.0,0,0.998,0,110.788002,0,0
2,46.0,1.0,0,0,0,0,0,0,0,0,...,0,2.025,1,109.0,1,0.91,1,120.0,0,0
3,70.0,0.0,1,0,0,0,0,0,0,0,...,1,1.9,1,175.0,0,0.998,0,110.788002,0,0
4,70.0,0.0,0,0,0,0,0,0,0,0,...,1,1.2,1,61.0,1,0.87,1,70.0,1,0


In [50]:
data.to_csv("data_clean.csv", encoding='utf-8', index=False)

In [51]:
hypotest.to_csv("data_test_clean.csv", encoding='utf-8', index=False)

In [52]:
pd.read_csv("data_clean.csv", encoding='utf-8').head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,referral source,classes
0,41.0,0.0,0,0,0,0,0,0,0,0,...,1,2.5,1,125.0,1,1.14,1,109.0,3,0
1,23.0,0.0,0,0,0,0,0,0,0,0,...,1,2.0,1,102.0,0,0.998,0,110.788,0,0
2,46.0,1.0,0,0,0,0,0,0,0,0,...,0,2.025,1,109.0,1,0.91,1,120.0,0,0
3,70.0,0.0,1,0,0,0,0,0,0,0,...,1,1.9,1,175.0,0,0.998,0,110.788,0,0
4,70.0,0.0,0,0,0,0,0,0,0,0,...,1,1.2,1,61.0,1,0.87,1,70.0,1,0


In [53]:
pd.read_csv("data_test_clean.csv", encoding='utf-8').head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,referral source,classes
0,35.0,0.0,0,0,0,0,0,0,0,0,...,0,2.025,0,109.072,0,0.998,0,110.788,0,negative
1,63.0,1.0,0,0,0,0,0,0,0,0,...,1,2.5,1,108.0,1,0.96,1,113.0,1,negative
2,25.0,0.0,0,0,0,0,0,0,0,0,...,1,2.4,1,61.0,1,0.82,1,75.0,2,negative
3,53.0,0.0,0,0,0,0,0,0,0,1,...,1,2.1,1,145.0,1,1.03,1,141.0,0,negative
4,92.0,0.0,0,0,0,0,0,0,0,0,...,1,1.3,1,120.0,1,0.84,1,143.0,1,negative
