In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from  xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
from scipy import stats
from sklearn import metrics
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Introduction
### In this notebook I want to study thyroid problems and I want to train and prepare some classifiers that could recognize any kind of thyroid problem in a patient.<br>To do this, I will use six datasets I got from -> https://archive.ics.uci.edu/ml/datasets/Thyroid+Disease:
* allhyperTestEDIT and allhyperTrainEDIT contain the classes 'hyperthyroid', 'T3 toxic', 'goitre', 'secondary toxic' and 'negative'
* allhypoDATA and allhypoTEST present classes 'hypothyroid','primary hypothyroid','compensated hypothyroid','secondary hypothyroid' and 'negative'
* hypothyroid present classes 'hypothyroid' and 'negative'
* sick-euthyroid present classes 'sick-euthyroid' and 'negative'
* thyroid0387 present classes hyperthyroid conditions (A, B, C, D), hypothyroid conditions (E, F, G, H), binding protein (I, J), general health (K), replacement therapy (L, M, N), discordant results (R) 
* ann-test and ann-train present classes normal (not hypothyroid), hyperfunction and subnormal functioning

### <br> You can find the full documentation at the link above. I want to build a single dataset by merging the six above, keeping only three classes: hypothyroid, hyperthyroid and negative. Once this work is done, I will go on with the data preprocessing and then I will train and test the classifiers.
# Part 1: Data Integration
### I have to integrate these six different datasets. I will start from the 'all' series because they share the same schema. 
# Read the four "all*" files in one pass; the paths are listed in the same
# order as the names they are unpacked into.
all_series_paths = (
    "../input/hypothyroid-multi-dataset/allhyperTestEDIT.CSV",
    "../input/hypothyroid-multi-dataset/allhyperTrainEDIT.CSV",
    "../input/hypothyroid-multi-dataset/allhypoTEST.csv",
    "../input/hypothyroid-multi-dataset/allhypoDATA.CSV",
)
allHyperTest, allHyperTrain, allHypoTest, allHypoTrain = map(pd.read_csv, all_series_paths)

# Preview the first rows of one file and the column types of another.
display(allHypoTest.head(10))
display(allHypoTrain.dtypes)
### ID is an identifier, so I have to check whether any instances share the same value for this attribute:
def handleDuplicated(df):
    """Report duplicated ``ID`` values in *df* and drop them when they are rare.

    The frame is modified in place and nothing is returned.

    * No duplicated ``ID``: only a message is printed.
    * Duplicates are fewer than 1% of the rows: every row whose ``ID``
      repeats an earlier one is removed (the first occurrence is kept).
    * Otherwise: all rows are kept and the index labels of the duplicated
      rows are printed.
    """
    duplicated_mask = df["ID"].duplicated()
    n_duplicates = duplicated_mask.sum()
    if n_duplicates == 0:
        print("There aren't duplicates")
    elif n_duplicates < len(df) / 100:
        # Drop the rows from the frame itself: de-duplicating only the "ID"
        # Series would leave the DataFrame's rows untouched.
        df.drop_duplicates(subset="ID", keep="first", inplace=True)
        print("duplicates were less than the 1% of all the data, they have been dropped")
    else:
        # Select only the labels flagged as duplicates, not the whole index.
        index_duplicated = df.index[duplicated_mask]
        print("duplicates are more than the 1% of all the data, they have been preserved")
        print(index_duplicated)

# Run the duplicate-ID check on each of the four "all*" frames, in order.
for frame in (allHyperTest, allHyperTrain, allHypoTest, allHypoTrain):
    handleDuplicated(frame)
 ### Now it's possible to drop the ID attribute because it's useless for the classification:
# Remove the ID column from each "all*" frame in place.
for frame in (allHyperTest, allHyperTrain, allHypoTest, allHypoTrain):
    frame.drop(columns="ID", inplace=True)
### From these four datasets I will keep all the instances whose class is different from 'negative':
def notCorrect_TargetFilter(df, correct_Target, target):
    """Keep the rows whose ``Target`` is in *correct_Target* and relabel them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Target`` column. It is not modified.
    correct_Target : list
        Target values to keep.
    target : str
        Label written to ``Target`` for every kept row.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the kept rows.
    """
    # Work on an explicit copy so the assignment below never writes into a
    # view of the caller's frame.
    kept = df[df["Target"].isin(correct_Target)].copy()
    # Every remaining row matched correct_Target, so the whole column takes
    # the new label. Only "Target" is rewritten: equal strings that occur in
    # other columns are left alone.
    kept["Target"] = target
    return kept
    
# Class names that are collapsed into each of the two disease labels.
hyper_classes = ["hyperthyroid", "T3_toxic", "goitre", "secondary_toxic"]
hypo_classes = ["hypothyroid", "primary_hypothyroid", "compensated_hypothyroid", "secondary_hypothyroid"]

allHyperTest = notCorrect_TargetFilter(allHyperTest, hyper_classes, "hyperthyroid")
allHyperTrain = notCorrect_TargetFilter(allHyperTrain, hyper_classes, "hyperthyroid")
allHypoTest = notCorrect_TargetFilter(allHypoTest, hypo_classes, "hypothyroid")
allHypoTrain = notCorrect_TargetFilter(allHypoTrain, hypo_classes, "hypothyroid")
### Now I will merge the four datasets:
# Stack the four filtered frames and renumber the rows from zero.
all_series_frames = [allHyperTest, allHyperTrain, allHypoTest, allHypoTrain]
allDataset = pd.concat(all_series_frames, ignore_index=True)
display(allDataset.shape)
### That's all for the 'all' series. Let's go on with thyroid0387:
thyroid0387 = pd.read_csv("../input/hypothyroid-multi-dataset/thyroid0387EDIT.CSV")
# Show the first ten rows, then the column types.
for preview in (thyroid0387.head(10), thyroid0387.dtypes):
    display(preview)
### We have the ID attribute here too, so:
# Check the IDs for duplicates, then remove the ID column in place.
handleDuplicated(thyroid0387)
thyroid0387.drop(columns="ID", inplace=True)
### This dataset has several interesting classes: A, B, C, D, E, F, G, H. All the others should be considered as 'negative'. I have to be careful because 'F' and 'M' are used in the 'sex' attribute too, so before any substitution, I have to handle this problem:
# Encode sex numerically (F -> 1, M -> 0); any other value becomes NaN.
thyroid0387['sex'] = thyroid0387['sex'].map({'F': 1, 'M': 0})

# Collapse the diagnosis codes into three classes. Only the Target column is
# rewritten: a frame-wide replace would also change any other cell that
# happens to equal one of the codes, and a missing Target would turn every
# NaN in the frame into 'negative'.
hyper_codes = ['A', 'B', 'C', 'D']
hypo_codes = ['E', 'F', 'G', 'H']
target_codes = thyroid0387['Target']
target_labels = pd.Series('negative', index=target_codes.index)
target_labels[target_codes.isin(hyper_codes)] = 'hyperthyroid'
target_labels[target_codes.isin(hypo_codes)] = 'hypothyroid'
thyroid0387['Target'] = target_labels
### Let's continue with the 'hypothyroid' dataset:
hypothyroid = pd.read_csv("../input/hypothyroid-multi-dataset/hypothyroid.csv")
# Show the size, the first ten rows and the column types.
for preview in (hypothyroid.shape, hypothyroid.head(10), hypothyroid.dtypes):
    display(preview)
### The 'Unnamed' attribute indicates the class of the instance, so I have to rename it. Then I will keep only the 'hypothyroid' class instances. This dataset doesn't have the 'I131_treatment', 'hypopituitary', 'psych' and 'referral_source' attributes.
# Give the first three columns their proper names, then keep only the rows
# labelled 'hypothyroid'.
new_names = dict(zip(hypothyroid.columns[:3], ("Target", "age", "sex")))
hypothyroid = hypothyroid.rename(columns=new_names)
hypothyroid = hypothyroid[hypothyroid["Target"] == 'hypothyroid']
### For 'sick-euthyroid' I have to keep only the 'negative' instances:
sick_euthyroid = pd.read_csv("../input/hypothyroid-multi-dataset/sick-euthyroid.CSV")
# Show the size, the first ten rows and the column types.
for preview in (sick_euthyroid.shape, sick_euthyroid.head(10), sick_euthyroid.dtypes):
    display(preview)
### For this dataset I don't have 'I131_treatment', 'hypopituitary', 'psych' and 'referral_source' attributes.
# Keep only the rows labelled 'negative' and show how many are left.
is_negative = sick_euthyroid["Target"] == 'negative'
sick_euthyroid = sick_euthyroid[is_negative]
display(sick_euthyroid.shape)
### Now it's time to work on the "ann" series:
ann_train, ann_test = map(
    pd.read_csv,
    (
        "../input/hypothyroid-multi-dataset/ann-train.CSV",
        "../input/hypothyroid-multi-dataset/ann-test.CSV",
    ),
)
# Preview the test file: first ten rows, then the column types.
for preview in (ann_test.head(10), ann_test.dtypes):
    display(preview)
### I don't have the 'measured' attributes, the 'TBG' attribute or the 'referral_source' attribute. I should create the 'measured' attributes based on the other columns.
# Count how often each value occurs in the last column of the two "ann"
# files (test first, then train).
target1 = pd.Series(ann_test.iloc[:, -1].values)
target2 = pd.Series(ann_train.iloc[:, -1].values)
display(target1.value_counts())
display(target2.value_counts())
### Looking at the distribution of the values for the 'Target' attribute, we can understand that:
* 3 is referring to the 'negative' class
* 2 is referring to the 'hypothyroid' class
* 1 is referring to the 'hyperthyroid' class

### I should analyze the distribution of the sex attribute in the other datasets to understand how I should treat it in the 'ann' series:
# Value counts of the second column of thyroid0387 and of the third column
# of sick_euthyroid, each preceded by its caption.
print("Sex thyroid0387 1=F,0=M:")
sex_series1 = pd.Series(thyroid0387.iloc[:, 1].values)
display(sex_series1.value_counts())

print("Sick-euthyroid:")
sex_series2 = pd.Series(sick_euthyroid.iloc[:, 2].values)
display(sex_series2.value_counts())
### So, there are more female than male patients in these datasets. Looking at the "ann" series I got:
# Value counts of the second column of the two "ann" files (test, then train).
sex1 = pd.Series(ann_test.iloc[:, 1].values)
sex2 = pd.Series(ann_train.iloc[:, 1].values)
display(sex1.value_counts())
display(sex2.value_counts())
### I can assume that '0' refers to female patients and '1' refers to male patients. Another important thing to do is to multiply all the continuous numerical attributes by 100 and to add the 'measured' attributes.
# Print the set of distinct values found in each column of ann_train.
for column in ann_train.columns:
    print(column, ": ", set(ann_train[column]))
# Stack train and test, then translate the numeric codes into labels.
sex_labels = {0:'F',1:'M'}
class_labels = {3:'negative',2:'hypothyroid',1:'hyperthyroid'}
ann = pd.concat([ann_train, ann_test], ignore_index=True)
ann['sex'] = ann['sex'].map(sex_labels)
ann['Target'] = ann['Target'].map(class_labels)

# Multiply each of the listed numeric columns by 100.
continuos_attributes = ['age','TSH','T3','TT4','T4U','FTI']
for attribute in continuos_attributes:
    ann[attribute] *= 100

def fillNewAttributes(row,attribute):
    """Return 'y' when ``row[attribute]`` is greater than zero, else 'n'."""
    return 'y' if row[attribute] > 0 else 'n'

# Add one "<name>_measured" column per hormone test: 'y' when the value is
# greater than zero, 'n' otherwise (including missing values). A column-wise
# comparison gives the same flags as calling a Python function on every row,
# without five row-by-row passes over the frame.
measured_flags = {True: 'y', False: 'n'}
for hormone in ('TSH', 'T3', 'TT4', 'T4U', 'FTI'):
    ann[hormone + '_measured'] = (ann[hormone] > 0).map(measured_flags)
display(ann.dtypes)
### Now I can merge all the datasets in one:
# Stack the five prepared frames into one and renumber the rows from zero.
prepared_frames = [allDataset, thyroid0387, hypothyroid, sick_euthyroid, ann]
data = pd.concat(prepared_frames, ignore_index=True)
for summary in (data.shape, data.dtypes):
    display(summary)
# Part 2: Data pre processing
### I will start the data pre processing observing the set of possible values for each attribute:
# Print the set of distinct values found in each column of the merged data.
for column in data.columns:
    print(column, ": ", set(data[column]))
### Sometimes '?' has been used instead of 'nan', so before counting how many nans are present, I need to do a substitution:
# Turn the "?" placeholder into a real missing value. ``np.nan`` is used
# because the ``np.NAN`` alias was removed in NumPy 2.0.
data = data.replace({"?": np.nan})
# Number of missing values per column.
data.isna().sum()
### The 'TBG', 'referral_source' and 'TBG_measured' attributes have too many nan values, I have to drop them. Let's try to drop the 'sex' attribute too:
# Remove the four unwanted columns from the merged frame in place.
data.drop(columns=['TBG', 'referral_source', 'TBG_measured', 'sex'], inplace=True)
### A row can have at most nine nan values, so I will drop all the rows with more than five nan values because they carry very little data and aren't good enough for the classification:
# Keep only the rows that have at least this many non-missing values.
min_non_missing = 20
data.dropna(axis=0, thresh=min_non_missing, inplace=True)
# Number of missing values left in each column.
data.isna().sum()
### For the classification it is important that the dataset only has numerical attributes, so I have to encode the categorical values into numerical values:
# Numeric code assigned to each categorical value, applied frame-wide.
value_codes = {"t":1,"f":0, "y":1, "n":0, "hypothyroid":1, "negative":0,"hyperthyroid":2, "F":1, "M":0}
data = data.replace(value_codes)
display(data.dtypes)

# Columns still stored as objects are parsed as numbers; anything that
# cannot be parsed becomes NaN.
cols = data.columns[data.dtypes == 'object']
data[cols] = data[cols].apply(pd.to_numeric, errors='coerce')
display(data.dtypes)
# Part 3: training of the classifiers
### Before the training starts, I have to find the attributes most related to the target:
# Absolute correlation of every column with Target, without Target itself,
# keeping only the columns whose value exceeds 0.04.
corr_values = data.corr()['Target'].abs().drop('Target')
corr_values = corr_values[corr_values > 0.04]
display(corr_values)
### Another thing that I have to do is to divide the dataset into two sets: the training set and the testing set.
def holdout(dataframe):
    """Split *dataframe* into a 70% training part and a 30% testing part.

    The feature columns are the ones named in the index of the module-level
    ``corr_values`` Series; the label is the 'Target' column. The split is
    seeded with ``random_state=42``.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = dataframe[corr_values.index]
    labels = dataframe['Target']
    return tuple(train_test_split(features, labels, test_size=0.30, random_state=42))

X_train, X_test, y_train, y_test = holdout(data)
### Now I will define the classifiers that I'm going to use. I need some classifiers that are friendly with nan values:
# Display name -> estimator, in the order they are trained.
classifiers = dict([
    ("XGBClassifier", XGBClassifier(learning_rate=0.01)),
    ("CatBoostClassifier", CatBoostClassifier(max_depth=4, verbose=0)),
])
### Now it's time to train the classifiers and discuss the results:
def classification(classifiers, X_train, X_test, y_train, y_test):
    """Fit every classifier and summarise its scores on the test split.

    Parameters
    ----------
    classifiers : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.
    X_train, y_train
        Data passed to ``fit``.
    X_test, y_test
        Data passed to ``predict`` and used for scoring.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with the columns "Classifier", "Accuracy",
        "Precision" and "Recall", indexed by "FScore" and sorted by
        descending FScore. Precision, recall and F-score are macro-averaged
        and every score is rounded to four decimals.

    A confusion matrix is printed/displayed for each classifier as a side
    effect.
    """
    columns = ["Classifier", "Accuracy", "Precision", "Recall", "FScore"]
    rows = []
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        pr, rc, fs, _ = metrics.precision_recall_fscore_support(y_test, y_pred, average='macro')
        rows.append({
            "Classifier": name,
            "Accuracy": round(metrics.accuracy_score(y_test, y_pred), 4),
            "Precision": round(pr, 4),
            "Recall": round(rc, 4),
            "FScore": round(fs, 4),
        })
        print("Confusion matrix for: ", name)
        display(confusion_matrix(y_test, y_pred))
    # Build the frame once from the collected rows: DataFrame.append was
    # removed in pandas 2.0.
    res = pd.DataFrame(rows, columns=columns)
    res.set_index("FScore", inplace=True)
    res.sort_values(by="FScore", ascending=False, inplace=True)
    return res

# Train and score the baseline classifiers, then show the dataset size and
# how many rows belong to each class.
baseline_results = classification(classifiers, X_train, X_test, y_train, y_test)
display(baseline_results)
display(data.shape)
data["Target"].value_counts()
### As we can see, 'data' is rather unbalanced, so accuracy isn't a very good metric. I have a serious problem with the third class. I should try some alternative transformations and see how the results change.
# Part 4: Alternative transformations
### First of all, I will try to fill the nan values with the spline interpolation:
# Fill the missing values with a third-order spline interpolation.
# NOTE(review): this interpolates along the row order of the frame; confirm
# that the row order is meaningful for these columns.
data1 = data.interpolate(method='spline', order=3)
remaining_nans = data1.isna().sum()
display(remaining_nans)
### Now I have a new dataset, so I have to repeat all the steps that precede the evaluation of the results. Also, now that there aren't any nan values, it's possible to use more classifiers:
# Display name -> estimator, in the order they are trained.
classifiers1 = dict([
    ("XGBClassifier", XGBClassifier(learning_rate=0.01)),
    ("CatBoostClassifier", CatBoostClassifier(max_depth=4, verbose=0)),
    ("Nearest Neighbors", KNeighborsClassifier(4)),
    ("Decision Tree", DecisionTreeClassifier(class_weight='balanced')),
    ("Random Forest", RandomForestClassifier(class_weight='balanced', random_state=1)),
    ("ExtraTrees", ExtraTreesClassifier(class_weight='balanced', random_state=1)),
    ("MLPClassifier", MLPClassifier(hidden_layer_sizes=(256, 128, 64, 32), activation="relu", random_state=1)),
])
# Recompute the selected features on the interpolated data: absolute
# correlation with Target above 0.04, Target itself excluded.
corr_values = data1.corr()['Target'].abs().drop('Target')
corr_values = corr_values[corr_values > 0.04]
display(corr_values)

X_train1, X_test1, y_train1, y_test1 = holdout(data1)

results1 = classification(classifiers1, X_train1, X_test1, y_train1, y_test1)
display(results1)
### So, we can see that the results haven't changed that much. The most related attributes are the same and the FScore improved slightly. Let's check if there are any differences after a discretization:
def fdiscretizer(attribute,dataframe):
    """Replace ``dataframe[attribute]`` in place with integer bin codes.

    The column is cut into up to 20 quantile bins (bins with duplicated
    edges are dropped) and the resulting bins are encoded as integers by
    ``LabelEncoder``. The caller's frame is modified; nothing is returned.
    """
    binned = pd.qcut(dataframe[attribute], 20, duplicates='drop')
    dataframe[attribute] = LabelEncoder().fit_transform(binned)
    # The former trailing ``dataframe = dataframe.convert_dtypes(...)`` only
    # rebound the local name, so its result was discarded; it is removed
    # to avoid converting the whole frame for nothing.

# Discretize the six continuous columns on a copy of the interpolated data.
data2 = data1.copy()
for column_name in ('age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI'):
    fdiscretizer(column_name, data2)
# Recompute the selected features on the discretized data: absolute
# correlation with Target above 0.04, Target itself excluded.
corr_values = data2.corr()['Target'].abs().drop('Target')
corr_values = corr_values[corr_values > 0.04]
display(corr_values)

X_train2, X_test2, y_train2, y_test2 = holdout(data2)

results2 = classification(classifiers1, X_train2, X_test2, y_train2, y_test2)
display(results2)
### Now 'T3' is one of the most related attributes, but there aren't major changes in the results. FScore got slightly worse. Maybe it's possible to apply a normalization instead of a discretization:
# Min-max scale every column of the interpolated data (Target included)
# to the range 0..20.
column_min = data1.min()
column_range = data1.max() - column_min
data3 = (data1 - column_min) / column_range * 20

# Recompute the selected features: absolute correlation with Target above
# 0.04, Target itself excluded.
corr_values = data3.corr()['Target'].abs().drop('Target')
corr_values = corr_values[corr_values > 0.04]
display(corr_values)

X_train3, X_test3, y_train3, y_test3 = holdout(data3)

results3 = classification(classifiers1, X_train3, X_test3, y_train3, y_test3)
display(results3)
### Now 'T3' has gone again and we can see a tiny improvement from the last time. To obtain better results, I should try to balance the dataset by over-sampling (by adding more samples from under-represented classes) or by under-sampling (by removing samples from over-represented classes). Let's start with an over-sampling method:
# Over-sample every class except the majority one with SMOTE.
# ``sampling_strategy`` is passed by keyword and ``fit_resample`` is used
# because current imbalanced-learn releases reject the positional form and
# no longer provide ``fit_sample``.
smote = SMOTE(sampling_strategy='not majority', random_state=1)
X_train_sm, y_train_sm = smote.fit_resample(X_train3, y_train3)
# NOTE(review): the test split is over-sampled as well, so the scores below
# are partly measured on synthetic samples; consider evaluating on the
# untouched X_test3 / y_test3 instead.
X_test_sm, y_test_sm = smote.fit_resample(X_test3, y_test3)
display(X_train3.shape)
display(X_train_sm.shape)
display(classification(classifiers1, X_train_sm, X_test_sm, y_train_sm, y_test_sm))
### So, we got a huge improvement on all the metrics' score, except for Accuracy. But now that the data is balanced, it is a valid metric too. Now it's time to try an under-sampling method: 
# Split the scaled data by class; after scaling, Target takes the values
# 0, 10 and 20.
df_negative = data3[data3["Target"] == 0]
df_hyperthyroid = data3[data3["Target"] == 20]
df_hypothyroid = data3[data3["Target"] == 10]

# Draw 450 rows without replacement from the negative and hypothyroid
# classes, using the same seed for both draws.
df_negative_downsampled, df_hypothyroid_downsampled = (
    resample(class_frame, replace=False, n_samples=450, random_state=123)
    for class_frame in (df_negative, df_hypothyroid)
)

downsampled_parts = [df_negative_downsampled, df_hypothyroid_downsampled, df_hyperthyroid]
df_downsampled = pd.concat(downsampled_parts)
df_downsampled["Target"].value_counts()

X_train4, X_test4, y_train4, y_test4 = holdout(df_downsampled)
results4 = classification(classifiers1, X_train4, X_test4, y_train4, y_test4)
display(results4)
### Using the same attributes, with the under-sampling of the dataset we got very good results, even if there weren't that many rows.
# Part 5: Final Discussion
### The final dataset that I got is very unbalanced, that's true, but it's normal because only a small percentage of the world population suffers from thyroid disease. Nevertheless, thanks to a good preprocessing of the data, I got some very accurate classifiers that have a good FScore too. I could handle the nan values because the results didn't get much worse, and even with the normalization and the discretization they didn't change that much. After balancing the normalized dataset, we got the best results of the notebook, which means that the work that had been done before was pretty good. In the future, it would be interesting to continue these studies, hopefully with more data.