# DATA MINING - GRUPPO 2
## Diabetes dataset
The aim of this project is to create a model to predict if
a patient will be readmitted or not after a specific encounter.

The first thing is to import the dataset

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sb
# Use matplotlib's 'ggplot' style for the figures drawn in this notebook.
plt.style.use('ggplot')

# Load the raw CSV; every '?' entry is read as a missing value.
df = pd.read_csv(
    "dataset_diabetes/diabetic_data.csv",
    delimiter=',',
    na_values='?',
    low_memory=False,
)
df

The dataset contains 101766 records with 50 columns. The columns
are:

In [None]:
# Show the column names as a plain Python list.
print(df.columns.tolist())

Let's unify the '>30' and 'NO' class values

In [None]:
def unify_value(x: pd.Series) -> pd.Series:
    """Collapse the '>30' readmission label into 'NO' for a single row.

    Args:
        x: One row of the dataset; it must contain a 'readmitted' entry.

    Returns:
        A copy of the row whose 'readmitted' value is 'NO' when it was
        '>30'. Every other value is returned as it was.
    """
    # Work on a copy: DataFrame.apply does not support functions that
    # mutate the object they are handed.
    row = x.copy()
    if row['readmitted'] == '>30':
        row['readmitted'] = 'NO'
    return row

# Run unify_value once per row (axis='columns' is the row-wise direction).
df = df.apply(unify_value, axis='columns')


Some columns contain null values; in particular:

In [None]:
n_rows = df.shape[0]


def _null_share(column):
    """Format the share of True flags in *column* as a percentage string."""
    return f'{round((sum(column) / n_rows) * 100, 1)}%'


# One formatted percentage of missing values per column.
df.isna().apply(_null_share)

Let's check whether the 2.2% of null values in race is correlated with the class
label

In [None]:
# Rows labelled with the target class '<30'.
is_target = df['readmitted'] == '<30'

# Target-class rows whose race is missing, and all target-class rows.
amount = len(df[df['race'].isnull() & is_target])
total = len(df[is_target])

# Share (in percent) of target-class rows that have no race value.
percentage = (amount / total) * 100
percentage

There are 188 records with our target class value, so we decide to
maintain them as "not assigned"

In [None]:
# Remove every row whose race is missing.
# NOTE(review): the accompanying notebook text says these rows are kept as
# "not assigned", but this statement drops them -- confirm which is intended.
df = df.drop(index=df.index[df['race'].isnull()])

Many of the pathologies reported in the dataset (in the diag columns) belong to disease macrogroups (as indicated by the documentation). Consequently, we decided to group them. A special case is the category "other", which contains ranges of pathologies that are not very frequent in the dataset.

Since "weight", "payer_code" and "medical_specialty" have a high percentage of null
values, we can remove them

In [None]:
# Columns the notebook discards because of their high share of missing values.
sparse_columns = ['weight', 'payer_code', 'medical_specialty']
df = df.drop(columns=sparse_columns)

There are some ids that are equivalent, so we can unify them

In [None]:
def merge_ids(series: pd.Series) -> pd.Series:
    """Collapse id codes that the notebook treats as equivalent, for one row.

    The following replacements are made:
      * admission_type_id 5 or 6 -> 8
      * discharge_disposition_id 18 or 26 -> 25
      * admission_source_id 9, 15, 17 or 21 -> 20

    Args:
        series: One row of the dataset containing the three id entries.

    Returns:
        A copy of the row with the replacements applied. Ids that are not
        listed above are returned as they were.
    """
    # Work on a copy: DataFrame.apply does not support functions that
    # mutate the object they are handed.
    row = series.copy()
    if row['admission_type_id'] in (5, 6):
        row['admission_type_id'] = 8
    if row['discharge_disposition_id'] in (18, 26):
        row['discharge_disposition_id'] = 25
    if row['admission_source_id'] in (9, 15, 17, 21):
        row['admission_source_id'] = 20
    return row

# Run merge_ids once per row (axis='columns' is the row-wise direction).
df = df.apply(merge_ids, axis='columns')

Some information about the column types and the null values

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

Let's fix the types

In [None]:
# Turn every object-dtype column into a categorical one.
# The builtin `object` is compared against because the `np.object` alias
# was removed in NumPy 1.24 and raises AttributeError there.
for attribute in df.columns:
    if df[attribute].dtype == object:
        df[attribute] = df[attribute].astype('category')

# The three id columns are identifiers, so they are made categorical too.
for id_column in ('admission_type_id', 'discharge_disposition_id', 'admission_source_id'):
    df[id_column] = df[id_column].astype('category')

df.info()

Since there are two columns with only one possible value, we can drop them

In [None]:
# Columns the notebook removes because they hold a single value.
constant_columns = ['examide', 'citoglipton']
df = df.drop(columns=constant_columns)

We are not interested in patient number and encounter id

In [None]:
# Encounter and patient identifiers are not used as features.
identifier_columns = ['encounter_id', 'patient_nbr']
df = df.drop(columns=identifier_columns)

The most frequent values for the categorical attributes are:

In [None]:
# Summary of the categorical columns, one row per column after transposing.
info_columns = df.describe(include='category').transpose()
info_columns

We can calculate the percentage of frequency, so we can decide
which columns have a very low or very high variability.

The upper bound is set to 99%

In [None]:
# Bounds (in percent) on how often a column's most frequent value may occur.
upper_bound = 99
lower_bound = 0

# Express 'freq' as a percentage of the number of rows, rounded to one decimal.
n_rows = df.shape[0]
info_columns['freq'] = info_columns['freq'].apply(lambda x: round((x / n_rows) * 100, 1))

# Collect every column whose top-value percentage falls outside the bounds,
# then drop them together.
out_of_bounds = [
    column
    for column, top_share in info_columns['freq'].items()
    if top_share > upper_bound or top_share < lower_bound
]
df = df.drop(columns=out_of_bounds)

The distribution of the numeric attributes

In [None]:
# One histogram per int64 column.
df.select_dtypes(include='int64').hist(figsize=(20, 15))
plt.show()

About categorical attributes

In [None]:
# Bar chart of the value counts of every categorical column.
categorical_attr = list(df.select_dtypes(include=['category']).columns)
for attribute in categorical_attr:
    df[attribute].value_counts().plot(kind='bar', figsize=(10, 5))
    plt.xlabel(attribute)
    plt.ylabel('count')
    plt.show()

How the numerical features are correlated with the class labels

In [None]:
# int64 columns plus the class label, which kdeplot needs for the hue.
numericAttribute = df.select_dtypes(include=['int64']).columns.to_list() + ['readmitted']

# One density plot per feature, split by class; the label itself is skipped.
for attribute in numericAttribute:
    if attribute == 'readmitted':
        continue
    sb.kdeplot(data=df[numericAttribute], x=df[attribute], hue='readmitted')
    plt.show()


How the categorical features are correlated with the class labels

In [None]:
categoricalAttributes = df.select_dtypes(include=['category']).columns.to_list()
categoricalAttributes.append('readmitted')

# For each categorical feature, plot the percentage of every value within
# each class; the class label itself is skipped.
for attribute in categoricalAttributes:
    if attribute == 'readmitted':
        continue
    shares = df.groupby(['readmitted'])[attribute].value_counts(normalize=True)
    attributeCounts = (
        shares.rename('percentage')
        .mul(100)
        .reset_index()
        .sort_values(attribute)
    )
    p = sb.barplot(data=attributeCounts, x=attribute, y="percentage", hue="readmitted")
    plt.show()

Check the outliers

In [None]:
# One box plot per int64 column, each in its own subplot with its own axes.
df.select_dtypes(include='int64').plot(
    kind='box',
    subplots=True,
    layout=(5, 4),
    figsize=(15, 27),
    sharex=False,
    sharey=False,
)
plt.show()

Binarization

In [None]:
# The class label and the three diag columns are neither one-hot encoded
# nor kept as features.
excluded = ['readmitted', 'diag_1', 'diag_2', 'diag_3']

# Every other categorical column is expanded into indicator columns.
cat_attributes = [
    column
    for column in df.select_dtypes(include=['category']).columns
    if column not in excluded
]
df2 = pd.get_dummies(df, columns=cat_attributes).drop(columns=excluded)
new_attr_list = list(df2.columns)

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import KFold, cross_val_score
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

# Feature matrix (the encoded columns of df2) and class labels as arrays.
x = np.array(df2.values)
y = np.array(df['readmitted'].values)

seed = 121
test_size = .2

# Stratify on y so both partitions keep the class ratio of the full data.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size, random_state=seed, stratify=y)

# Oversample the training partition only. The sampler gets its own name so
# the imported SMOTE class is not overwritten, and it is seeded so that the
# resampled data is reproducible.
smote = SMOTE(random_state=seed)
x_train_SMOTE, y_train_SMOTE = smote.fit_resample(x_train, y_train)

models = [
    ('KneiboarsClassifier', KNeighborsClassifier(3)),
    ('C45', DecisionTreeClassifier(criterion='entropy')),
    ('DecisionTreeClassifier', DecisionTreeClassifier(splitter="random", random_state=1, max_depth=5, max_leaf_nodes=15)),
    ('RandomForestClassifier', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=24)),
    # ('MLPClassifier', MLPClassifier(alpha=1, max_iter=1000)),
    ('AdaBoostClassifier', AdaBoostClassifier()),
    ('GaussianNaiveBayes', GaussianNB()),
    # ('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()),
]

# NOTE(review): the folds are drawn from data that was oversampled before
# splitting, so synthetic samples built from a validation-fold point can end
# up in the training folds and the scores below are likely optimistic.
# Resampling inside each fold would avoid this.
kfold = KFold(n_splits=10, random_state=seed, shuffle=True)
for name, model in models:
    cv_results = cross_val_score(model, x_train_SMOTE, y_train_SMOTE, cv=kfold, scoring='accuracy')
    msg = f"{name}: {cv_results.mean():f} ({cv_results.std():f})"
    print(msg)

### **Evaluation**

##### **Do predictions on test set**

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
# Train a 3-nearest-neighbours classifier on the oversampled training data
# and predict the labels of the held-out test set.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(x_train_SMOTE, y_train_SMOTE)
predictions = model.predict(x_test)


In [None]:
# Accuracy of the predictions on the test set, shown with two decimals.
test_accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {test_accuracy:.2f}')

In [None]:
def make_confusion_matrix(cf, categories='auto', cbar=True, cmap='Blues', title=None):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Args:
        cf: Two-dimensional array of counts to display.
        categories: Tick labels used on both axes (passed to seaborn as
            xticklabels and yticklabels).
        cbar: Whether the colour bar is drawn.
        cmap: Colour map handed to the heatmap.
        title: Optional figure title; nothing is set when it is falsy.
    """
    # Text shown inside each cell: the cell value rendered as a string.
    cell_text = [f'{value}\n'.strip() for value in cf.flatten()]
    n_rows, n_cols = cf.shape[0], cf.shape[1]
    box_labels = np.asarray(cell_text).reshape(n_rows, n_cols)

    sb.heatmap(
        cf,
        annot=box_labels,
        fmt='',
        cmap=cmap,
        cbar=cbar,
        xticklabels=categories,
        yticklabels=categories,
    )

    plt.xlabel('Predicted label')
    plt.ylabel('True label')

    if not title:
        return
    plt.title(title)

In [None]:
# scikit-learn orders the matrix by sorted label, which puts '<30' before
# 'NO'. The label order is fixed explicitly here so that row/column 0 really
# is 'NO' and the 'No'/'Yes' tick labels are attached to the right class.
class_labels = ['NO', '<30']
categories = ['No', 'Yes']
make_confusion_matrix(
    confusion_matrix(y_test, predictions, labels=class_labels),
    categories=categories,
    cmap='binary',
)

In [None]:
# Per-class precision, recall and F1 for the test-set predictions.
report = classification_report(y_test, predictions)
print('Classification report', report, sep='\n')


In [None]:
##### **Compute the ROC curve for each class**

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# One indicator column per class for the test labels.
one_hot_encoding = np.array(pd.get_dummies(y_test))

# Refit every model on the oversampled training data and keep its class
# probabilities for the test set, in the same order as `models`.
probs = []
for _, model in models:
    model.fit(x_train_SMOTE, y_train_SMOTE)
    probs.append(model.predict_proba(x_test))

# Keyed by class index; each entry holds the values of the last model plotted.
fpr = dict()
tpr = dict()
roc_auc = dict()

classes = np.unique(y)

# One figure per class, with one ROC curve per model.
# NOTE(review): column i of both the indicator matrix and predict_proba is
# assumed to correspond to classes[i] (sorted label order) -- confirm.
for i, class_label in enumerate(classes):
    plt.figure()

    for (name, _), model_probs in zip(models, probs):
        fpr[i], tpr[i], _thresholds = roc_curve(one_hot_encoding[:, i], model_probs[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

        plt.plot(fpr[i], tpr[i], lw=2, label=f'{name} (area = {roc_auc[i]:.2f})')

    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Class = ' + str(class_label))
    plt.legend(loc='lower right')

    plt.show()