# Code Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from naive_bayes_classifier import NaiveBayes

# Reading the Data

In [None]:
# Load the semicolon-separated diagnosis table from disk into a DataFrame.
inflammation_data = pd.read_csv('./inflammation_diagnosis.csv', sep=';')

# Inspecting the Data

In [None]:
# Preview the first five rows of the loaded data.
inflammation_data.head(5)

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
inflammation_data.info()

## Transforming Nominal Data to Booleans

In [None]:
# Map the nominal 'no'/'yes' answers onto real booleans in a single pass.
# Recent pandas releases deprecate (and later drop) the implicit downcasting
# of object columns inside `replace`, so `infer_objects()` is called
# explicitly to make sure the converted columns end up with dtype bool
# instead of staying object.
inflammation_data = inflammation_data.replace({'no': False, 'yes': True}).infer_objects()

inflammation_data.head(5)

Checking if the columns are indeed of type boolean

In [None]:
# Print the column summary again so the dtypes can be inspected.
inflammation_data.info()

## Adding Disease Labels 
Add the disease values to a new label-column and check if the values make sense

In [None]:
# A patient is labelled diseased unless BOTH diagnoses are False.
# The comparison is evaluated on whole columns, so it is aligned on the index
# and does not depend on the row labels being 0..n-1 (the per-row lookup
# `series[i]` is label-based and would fail or mislabel otherwise).
no_inflammation = inflammation_data['inflammation'] == False
no_nephritis = inflammation_data['nephritis'] == False
inflammation_data['disease'] = ~(no_inflammation & no_nephritis)

In [None]:
# Preview the first five rows, including the 'disease' column.
inflammation_data.head(5)

# Plot the data

Plot with seaborn, using different colors and sorting the values by color.

In [None]:
# Draw one subplot per column on a 2 x 4 grid: a histogram with a density
# estimate for the temperature column and a False/True count plot for every
# other column.
rows = 2
columns = 4
fig, ax = plt.subplots(nrows=rows, ncols=columns, figsize=(20, 9))

# `ax.flat` walks the grid row by row. Pairing it with the column names stops
# at whichever runs out first, so a frame with fewer columns than grid cells
# leaves the remaining axes empty instead of raising an IndexError.
for axis, col_name in zip(ax.flat, inflammation_data.columns):
    # Human-readable axis label, e.g. 'some_name' -> 'Some Name'.
    # `word[:1]` (rather than `word[0]`) tolerates empty parts such as 'a__b'.
    nice_col_name = " ".join(word[:1].upper() + word[1:] for word in col_name.split("_"))

    if col_name == 'temperature':
        # Plot the column that is currently being visited; the x variable must
        # follow the running column, not the position inside the grid row.
        s = sns.histplot(inflammation_data, ax=axis, x=col_name, kde=True, color='r')
        s.set(xlabel='Temperature in ° C', ylabel="Frequency")
    else:
        b = sns.countplot(inflammation_data, x=col_name, ax=axis, order=[False, True])
        b.set(xlabel=nice_col_name, ylabel='Frequency')
        # Same fixed y-range on every count plot; bars above 100 would be clipped.
        b.set_ylim(0, 100)

## Temperature distribution by diseased or healthy
From the temperature plot it seems as if one could approximate two Gaussian distributions that correspond to the two categories, diseased and healthy. To verify this, we plot the temperature grouped by the status of 'disease':

In [None]:
# Keep only the temperature and the label, then bucket the rows by label.
temperature_and_label = inflammation_data[['temperature', 'disease']]
groupby_disease = temperature_and_label.groupby(by='disease')

In [None]:
# Pull the two label groups out of the grouped frame.
diseased = groupby_disease.get_group(True)
healthy = groupby_disease.get_group(False)

# Overlay three temperature histograms (all rows, diseased, not diseased) on
# the same axes. Only the overall distribution uses a fixed bin count.
histogram_layers = (
    (inflammation_data, {'bins': 8, 'label': 'total distribution', 'color': 'b'}),
    (diseased, {'label': 'diseased', 'color': 'r'}),
    (healthy, {'label': 'not diseased', 'color': 'g'}),
)
for frame, options in histogram_layers:
    sns.histplot(data=frame, x=frame['temperature'], kde=True, **options)

plt.ylabel('Frequency')
plt.xlabel('Temperature in ° C')

plt.legend()
plt.show()

It seems that, contrary to what the total distribution suggests, there are no two Gaussian distributions for diseased and healthy patients. One could expect such a distribution, as an elevated temperature is a typical sign of inflammation. However, looking only at inflammation and nephritis (from which we generate the disease column) and temperature, it becomes visible that there are multiple cases in which relatively low temperatures were measured for patients with either general inflammation or nephritis.

In [None]:
# Order the patients by temperature and print every row in which at least one
# of the two diagnoses is positive.
test = inflammation_data[['inflammation', 'nephritis', 'temperature']].sort_values('temperature')
has_diagnosis = (test['inflammation'] == True) | (test['nephritis'] == True)
print(test[has_diagnosis])

# Creating Labels

# Splitting the Data
Count the values of 'disease' to see how the data is distributed

In [None]:
# Relative frequency of each 'disease' value (fractions instead of raw counts).
inflammation_data['disease'].value_counts(normalize=True)

## Calculating Split Indices
The data split must account for the obvious imbalance in the data. Therefore, calculate the indices of the splits.

# Drop duplicates to avoid the same patients for training and test set

In [None]:
# Remove exact duplicate rows; the surviving rows keep their original index labels.
inflammation_data = inflammation_data.drop_duplicates()

In [None]:
# Stratified 80/20 split: 80 % of each class is sampled separately so the
# ratio of 'disease' values is preserved in the training set. A fixed seed
# makes the split, and everything computed from it, reproducible across runs.
SPLIT_SEED = 42
diseased_rows = inflammation_data[inflammation_data['disease'] == True]
healthy_rows = inflammation_data[inflammation_data['disease'] == False]
train = pd.concat([
    diseased_rows.sample(frac=0.8, random_state=SPLIT_SEED),
    healthy_rows.sample(frac=0.8, random_state=SPLIT_SEED),
])
# Every row whose index label was not drawn for training goes to the test set.
test = inflammation_data.drop(train.index)

In [None]:
# True when no index label occurs in both the test and the training frame.
set(test.index).isdisjoint(train.index)

## Ensuring that the two dataframes do not share a row
The following merge shows duplicate rows across the two dataframes. If empty, there are no duplicates.

In [None]:
# Inner-join the two frames on all columns they have in common; every row in
# the result therefore has identical values in both train and test.
result = pd.merge(train, test, how='inner')
result

In [None]:
# Number of rows per 'disease' value.
# NOTE(review): the grouping key is passed twice (once as column name, once as
# the Series itself), which looks redundant — presumably a plain
# `groupby("disease")` was intended; confirm before changing.
inflammation_data.groupby(["disease", inflammation_data["disease"]]).size()

# TESTING

# The positive posterior should be greater than the negative posterior

TODO: still compute the evidence from the negative and positive posteriors

In [None]:
# Split the hold-out frame positionally: the last column becomes the label
# vector, all columns before it form the feature matrix.
label_position = -1
X_test = test.iloc[:, :label_position].values
Y_test = test.iloc[:, label_position].values

In [None]:
# Fit the classifier on the training frame and print both values returned by
# `fit`, separated by a line of quote characters.
# "disease" is passed as the second argument — presumably the name of the
# target column; confirm against NaiveBayes.fit.
t = NaiveBayes()
p, f = t.fit(train, "disease")
print(p, "''''''''''''''", f, sep="\n")