## Balancing the class distribution and Evaluated different models (Task 2)

Import the utility libraries

In [None]:
import pandas as pd
import numpy as np
import random
import math
import tensorflow

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import LSTM, Dense

## 1.1 Features’ choice

The **definitive_dataset** is the result of the task1. <br>
Firstly, we fuse all the different codes in a single feature called **codici**.<br>
So we drop the events that do not have a code, and then we decide to drop any features<br>
that do not seem to be important to the task.

In [None]:
path = ""
#insert the path of the directory in which the CVSs are stored
dataset_df = pd.read_csv(path + 'definitive_dataset.csv')
random.seed(17) #random seed which we decided

In [5]:
def find_codes(row):
  """
  The function checks the code of the event, if it is not nan 
  puts the AMD code or STITCH code or ATC code, otherwise
  there isn't a code

  Parameters
  ----------
  row : Series (row of df)
    cotains all the information of an event

  Returns
  -------
  Object
    the code of the event or 'NOCODE'.
  """
  if not (pd.isnull(row['codiceamd'])):
    return row['codiceamd']
  elif not (pd.isnull(row['codicestitch'])):
    return row['codicestitch']
  elif not (pd.isnull(row['codiceatc'])):
    return row['codiceatc']
  return 'NOCODE'


In [6]:
dataset_df['codici'] = dataset_df.apply(lambda row: find_codes(row), axis=1)
dataset_df = dataset_df[dataset_df.codici != 'NOCODE']
dataset_df = dataset_df.drop(columns=['idpasto', 'quantita', 'codiceamd', 'descrizionefarmaco', 'valore', 'codicestitch', 'codiceatc', 'annodiagnosidiabete', 'tipodiabete', 'annodecesso'], axis=1)

In [7]:
dataset_df = dataset_df[['idcentro', 'idana', 'sesso', 'annonascita', 'codici', 'data', 'datamax', 'label']]
dataset_df.head()

Unnamed: 0,idcentro,idana,sesso,annonascita,codici,data,datamax,label
0,1,5,M,1942,AMD152,2008-06-20,2013-12-31,1
1,1,5,M,1942,AMD152,2013-08-27,2013-12-31,1
2,1,5,M,1942,AMD086,2013-12-31,2013-12-31,1
3,1,5,M,1942,AMD228,2013-12-31,2013-12-31,1
4,1,5,M,1942,A10BA02,2005-01-18,2013-12-31,1


## 1.2 Up-sample the minority class

Instead of just duplicate some rows of the dataset that represented events related to patients with positive labels,<br>
we create some likely copies of the patients, in this way we want to balance the dataframe

In [8]:
def idcentroCopies(row, max_idcentro):
  """
  The function sum the value of idcentro in the
  given row and the maximum idcentro to get a new idcentro
  to give to the copy patient

  Parameters
  ----------
  row : Series (row of df)
    cotains all the information of an event
  max_idcentro : int
    represents the max value of idcentro in the df

  Returns
  -------
  Int
    the sum between the value of idcentro plus the maximum idcentro.
  """
  return row['idcentro'] + max_idcentro

In [9]:
def createCopies(dataset_df, last_original_idcentro):
  """
  The function create some copies of positive patients,
  then shuffled and deleted some rows representing some events that occurred.
  All copies have distinct ids which is calculeted with the
  function 'idcentroCopies'.

  Parameters
  ----------
  dataset_df : DataFrame
    definitive dataset after cleaning 
  last_original_idcentro: int
    represents the max value of idcentro in the original df

  Returns
  -------
  df
    the df with all original and copied events
  """
  #pick the maximum idcentro in the df and plus 1, otherwise could create a copy having same id of max
  max_idcentro = dataset_df.loc[dataset_df['idcentro'].idxmax()]
  max_idcentro = max_idcentro['idcentro']
  max_idcentro += 1

  #pz_copies is a df that contains all events from positive patients and considers only original events.
  pz_copies = dataset_df.loc[(dataset_df['label'] == 1) & (dataset_df['idcentro'] <= last_original_idcentro)]
  pz_copies['data'] = pd.to_datetime(pz_copies['data'])
  c1 = (pz_copies['data'] + pd.DateOffset(months=6)) > pz_copies['datamax']
  c2 = pz_copies['codici'].isin(codes)
  pz_copies = pz_copies.loc[~(c1 & c2)]

  #shuffle all the rows 
  pz_copies.sample(frac=1, random_state=17).reset_index(drop=True)
  size = random.randint(0, pz_copies.shape[0]//2) #pick a random number from 0 to the half of events
  #pick randomly some rows according to size
  arr_indices_top_drop = np.random.default_rng().choice(pz_copies.index, size=size, replace=False) 
  pz_copies.drop(index=arr_indices_top_drop, inplace=True)
  pz_copies = pz_copies.reset_index(drop=True)

  #put new idcentro for copies
  pz_copies['idcentro'] = pz_copies.apply(idcentroCopies, args=(max_idcentro,), axis=1)
  dataset_df = pd.concat([dataset_df, pz_copies]).reset_index(drop=True)
  return dataset_df

In [10]:
codes = ['AMD047', 'AMD048', 'AMD049', 'AMD071', 'AMD081', 'AMD082', 'AMD208', 'AMD303'] #macro-events
last_original_idcentro = dataset_df.loc[dataset_df['idcentro'].idxmax()]
last_original_idcentro = last_original_idcentro['idcentro']

In [11]:
tmp_df = dataset_df[['idcentro','idana','label']]
tmp_df = tmp_df.drop_duplicates(keep='first').reset_index(drop=True)
n_label1 = tmp_df.loc[tmp_df['label'] == 1].shape[0]
n_label0 = tmp_df.shape[0] - n_label1
print("total number of patients:", tmp_df.shape[0])
print("number of positive patients:", n_label1)
print("number of negative patients:", n_label0)

total number of patients: 47702
number of positive patients: 8980
number of negative patients: 38722


In [12]:
e_n_label1 = dataset_df.loc[dataset_df['label'] == 1].shape[0]
e_n_label0 = dataset_df.shape[0] - e_n_label1
print("total number of events:", dataset_df.shape[0])
print("total number of positive events:", e_n_label1)
print("total number of negative events:", e_n_label0)

total number of events: 15773156
total number of positive events: 3273590
total number of negative events: 12499566


**m** represents the percentage change in the number of negative patients compared to the number of positive patients.<br>
So create **m** copies for every positive patients.<br>
It means we got m copies + original = the total number of minority class after balancing

In [None]:
m = math.floor((n_label0 - n_label1)/(n_label1))

for i in range(m):
    dataset_df = createCopies(dataset_df,last_original_idcentro)

In [14]:
tmp_df = dataset_df[['idcentro','idana','label']]
tmp_df = tmp_df.drop_duplicates(keep='first').reset_index(drop=True)
n_label1 = tmp_df.loc[tmp_df['label'] == 1].shape[0]
n_label0 = tmp_df.shape[0] - n_label1
print("total number of patients:", tmp_df.shape[0])
print("number of positve patients:", n_label1)
print("number of negative patients:", n_label0)

total number of patients: 74642
number of positve patients: 35920
number of negative patients: 38722


In [15]:
e_n_label1 = dataset_df.loc[dataset_df['label'] == 1].shape[0]
e_n_label0 = dataset_df.shape[0] - e_n_label1
print("total number of events:", dataset_df.shape[0])
print("total number of positive events:", e_n_label1)
print("total number of negative events:", e_n_label0)
diff = e_n_label0 - e_n_label1 #difference between the majority class and the minority

total number of events: 22949247
total number of positive events: 10449681
total number of negative events: 12499566


In [16]:
dataset_df = dataset_df.drop(columns=['datamax'], axis=1)

## 1.3 Manual under-sampler 
Randomly delete some rows that were labeled as negative.

In [17]:
dataset_df.head()

Unnamed: 0,idcentro,idana,sesso,annonascita,codici,data,label
0,1,5,M,1942,AMD152,2008-06-20,1
1,1,5,M,1942,AMD152,2013-08-27,1
2,1,5,M,1942,AMD086,2013-12-31,1
3,1,5,M,1942,AMD228,2013-12-31,1
4,1,5,M,1942,A10BA02,2005-01-18,1


In [18]:
#diff is the number of rows we need to delete. To perform undersampling, we delete diff negative events
to_drop = dataset_df[dataset_df['label'] == 0].sample(diff)
dataset_df = dataset_df.drop(to_drop.index)
dataset_df = dataset_df.reset_index(drop=True)

In [19]:
e_n_label1 = dataset_df.loc[dataset_df['label'] == 1].shape[0]
e_n_label0 = dataset_df.shape[0] - e_n_label1
print("total number of events:", dataset_df.shape[0])
print("total number of positive events:", e_n_label1)
print("total number of negative events:", e_n_label0)

total number of events: 20899362
total number of positive events: 10449681
total number of negative events: 10449681


In [20]:
tmp_df = dataset_df[['idcentro','idana','label']]
tmp_df = tmp_df.drop_duplicates(keep='first').reset_index(drop=True)
n_label1 = tmp_df.loc[tmp_df['label'] == 1].shape[0]
n_label0 = tmp_df.shape[0] - n_label1
print("total number of patients:", tmp_df.shape[0])
print("number of positve patients:", n_label1)
print("number of negative patients:", n_label0)

total number of patients: 74642
number of positve patients: 35920
number of negative patients: 38722


## 2. Dataset Evaluation

encode some feautre: **code** and **gender**, transform **date** into Unix timestamp , <br> 
add a new column **eta evento** which is the age of the patient when the event occurred.

In [21]:
def getEventsAge(row):
  """
  The function calculates the age of the patient at the time of the event.

  Parameters
  ----------
  row : Series (row of df)
    cotains all the information of an event

  Returns
  -------
  int
    the age at the time of the event.
  """
  anno_nascita = row['annonascita']
  anno = row['data'].year
  age = anno - anno_nascita
  return age

In [22]:
le_codes = LabelEncoder()
le_genders = LabelEncoder()

In [23]:
dataset_df['codici'] = le_codes.fit_transform(dataset_df['codici'])
dataset_df=dataset_df

In [24]:
dataset_df['sesso'] = le_genders.fit_transform(dataset_df['sesso'])
dataset_df=dataset_df

In [25]:
dataset_df['data'] = pd.to_datetime(dataset_df['data'], format="%Y-%m-%d")
dataset_df.head()

Unnamed: 0,idcentro,idana,sesso,annonascita,codici,data,label
0,1,5,1,1942,149,2008-06-20,1
1,1,5,1,1942,149,2013-08-27,1
2,1,5,1,1942,126,2013-12-31,1
3,1,5,1,1942,159,2013-12-31,1
4,1,5,1,1942,16,2005-01-18,1


In [26]:
dataset_df['etaevento'] = dataset_df.apply(lambda row: getEventsAge(row), axis= 1)

In [27]:
dataset_df = dataset_df[['idcentro', 'idana', 'sesso', 'codici', 'data', 'etaevento', 'label']]

In [28]:
dataset_df['data'] = dataset_df['data'].apply(lambda x: (x.value)/10**9)

In [29]:
dataset_df.head()

Unnamed: 0,idcentro,idana,sesso,codici,data,etaevento,label
0,1,5,1,149,1213920000.0,66,1
1,1,5,1,149,1377562000.0,71,1
2,1,5,1,126,1388448000.0,71,1
3,1,5,1,159,1388448000.0,71,1
4,1,5,1,16,1106006000.0,63,1


In [30]:
#save the balanced dataframe in a new file csv
new_path = path + 'balanced_df.csv'
dataset_df.to_csv(new_path, mode='w', header=True, index=False)

## 2.1 LSTM
Evaluated our balanced dataset upon the models **LSTM** using the library *keras*. <br>
Our model was **sequential** and consisted in a LSTM 
layer and a Dense layer with **ReLU** as activation function

In [31]:
dataset_df = pd.read_csv(new_path)

In [32]:
ds_df = dataset_df.drop(columns='etaevento', axis=1)

In [33]:
features = ds_df.drop("label", axis=1) #contains all features except 'label'
label = ds_df["label"]

In [34]:
scaler = StandardScaler()
#Calculates normalization parameters (mean and standard deviation) on training features,
#Transforms training and test features according to these parameters.
features[features.columns[:]] = scaler.fit_transform(features[features.columns[:]])

In [35]:
#20% of the data will be used as a test set, where X are sets of features and y are sets of label
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=17)

In [36]:
#transform the training and test data into a structure that can be used as input for an LSTM neural network model
X_train = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))
print("number of test set:", X_test.shape[0])
print("number of train set:", X_train.shape[0])

number of test set: 4179873
number of train set: 16719489


In [37]:
model = Sequential()
#the size of the internal memory is set to 32
model.add(LSTM(32, input_shape=(X_train.shape[1], X_train.shape[2])))
#adds a dense (fully connected) structure to the model which has only one unit and uses the ReLU as activation function
model.add(Dense(1, activation="relu"))

In [38]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [39]:
history = model.fit(X_train, y_train, epochs=1, batch_size=64, validation_data=(X_test, y_test))

2023-02-09 19:39:57.974273: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




In [40]:
_, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 84.32%


### test prediction

In [41]:
model.save(path+'Model_LSTM.h5')

In [42]:
lstm_model = load_model(path+'Model_LSTM.h5')
#if you want to load the trained model

In [43]:
y_pred = lstm_model.predict(X_test)



In [44]:
y_pred_binary = (y_pred > 0.5).astype(int)

In [45]:
f_score = f1_score(y_test, y_pred_binary)
print("F-Score: {:.5f}".format(f_score))

F-Score: 0.81400


In [46]:
'''
[[true negative, false positive]
[false negative, true positive]]
'''
conf_matrix = confusion_matrix(y_test, y_pred_binary)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[2090270       0]
 [ 655430 1434173]]


## 2.2 T-LSTM

Has same architecture presented for the LSTM but we fed it with a dataset that had also<br>
the feature **eta evento** so that it would have been aware about the specific time

In [47]:
features = dataset_df.drop("label", axis=1)
label = dataset_df["label"]

In [48]:
scaler = StandardScaler()
features[features.columns[:]] = scaler.fit_transform(features[features.columns[:]])

In [49]:
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=17)

In [50]:
X_train = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))

In [51]:
model = Sequential()
model.add(LSTM(32, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1, activation="relu"))

In [52]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [53]:
history = model.fit(X_train, y_train, epochs=1, batch_size=64, validation_data=(X_test, y_test))



In [54]:
_, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 84.32%


In [55]:
model.save(path+'Model_T-LSTM.h5')

### test prediction

In [56]:
t_lstm_model = load_model(path+'Model_T-LSTM.h5')
#if you want to load the trained model

In [57]:
y_pred = t_lstm_model.predict(X_test)



In [58]:
y_pred_binary = (y_pred > 0.5).astype(int)

In [59]:
f_score = f1_score(y_test, y_pred_binary)
print("F-Score: {:.2f}".format(f_score))

F-Score: 0.81


In [60]:
'''
[[true negative, false positive]
[false negative, true positive]]
'''
conf_matrix = confusion_matrix(y_test, y_pred_binary)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[2090270       0]
 [ 655425 1434178]]


## 2.3 PubMedBert

In [61]:
dataset_df['codici'] = le_codes.inverse_transform(dataset_df['codici'])
dataset_df=dataset_df

In [62]:
dataset_df['sesso'] = le_genders.inverse_transform(dataset_df['sesso'])
dataset_df=dataset_df

In [63]:
dataset_df['data'] = pd.to_datetime(dataset_df['data'], unit='s')
dataset_df['data'] = dataset_df['data'].dt.strftime('%Y-%m-%d')

In [64]:
dataset_df.head()

Unnamed: 0,idcentro,idana,sesso,codici,data,etaevento,label
0,1,5,M,AMD152,2008-06-20,66,1
1,1,5,M,AMD152,2013-08-27,71,1
2,1,5,M,AMD086,2013-12-31,71,1
3,1,5,M,AMD228,2013-12-31,71,1
4,1,5,M,A10BA02,2005-01-18,63,1


In [None]:
#read the file which contains the AMD codes and their meaning 
codici_amd = pd.read_csv(path+"amd_codes_for_bert.csv")

In [None]:
codici_amd = codici_amd.rename(columns={'codice':'codici'})

In [None]:
codici_amd.head()

Unnamed: 0,codici,meaning
0,AMD090,Diet only
1,AMD140,Self control
2,AMD215,Number of strips prescribed per week
3,AMD228,Integrated management
4,AMD086,Self-monitoring of blood glucose


In [None]:
#read the file csv to match ATC code with the names of the active ingredients
codici_atc = pd.read_csv(path+"atc_info_nodup.csv")

In [None]:
codici_atc.head()

Unnamed: 0,codiceatc,first,last,len,numpatients,numdates,atc_nome
0,A10AB01,1995-01-18,2019-10-01,7,33748,188565,insulin (human)
1,A10AB04,2004-01-28,2019-10-03,7,144229,1053650,insulin lispro
2,A10AB05,2001-07-27,2019-09-19,7,111565,970495,insulin aspart
3,A10AB06,2005-01-01,2019-09-26,7,58367,415367,insulin glulisine
4,A10AC01,2004-01-12,2019-07-17,7,30318,135859,insulin (human)


In [None]:
codici_atc = codici_atc[['codiceatc', 'atc_nome']]

In [None]:
codici_atc = codici_atc.rename(columns={'codiceatc':'codici', 'atc_nome':'meaning'})

In [None]:
codici_atc.head()

Unnamed: 0,codici,meaning
0,A10AB01,insulin (human)
1,A10AB04,insulin lispro
2,A10AB05,insulin aspart
3,A10AB06,insulin glulisine
4,A10AC01,insulin (human)


In [None]:
df_merge1 = pd.merge(dataset_df, codici_amd, on='codici', how='left')

In [None]:
df_merge1.head()

Unnamed: 0,idcentro,idana,sesso,codici,data,etaevento,label,meaning
0,1,5,M,AMD152,2008-06-20,66,1,Self-monitoring of blood glucose
1,1,5,M,AMD152,2013-08-27,71,1,Self-monitoring of blood glucose
2,1,5,M,AMD086,2013-12-31,71,1,Self-monitoring of blood glucose
3,1,5,M,AMD228,2013-12-31,71,1,Integrated management
4,1,5,M,A10BA02,2005-01-18,63,1,


In [None]:
df_merge2 = pd.merge(df_merge1, codici_atc, on='codici', how='left')

In [None]:
df_merge2.head()

Unnamed: 0,idcentro,idana,sesso,codici,data,etaevento,label,meaning_x,meaning_y
0,1,5,M,AMD152,2008-06-20,66,1,Self-monitoring of blood glucose,
1,1,5,M,AMD152,2013-08-27,71,1,Self-monitoring of blood glucose,
2,1,5,M,AMD086,2013-12-31,71,1,Self-monitoring of blood glucose,
3,1,5,M,AMD228,2013-12-31,71,1,Integrated management,
4,1,5,M,A10BA02,2005-01-18,63,1,,metformin


In [None]:
df_merge2['meaning'] = df_merge2['meaning_x'].fillna(df_merge2['meaning_y'])
df_merge2.drop(['meaning_x', 'meaning_y'], axis=1, inplace=True)

In [None]:
df_merge2.head()

Unnamed: 0,idcentro,idana,sesso,codici,data,etaevento,label,meaning
0,1,5,M,AMD152,2008-06-20,66,1,Self-monitoring of blood glucose
1,1,5,M,AMD152,2013-08-27,71,1,Self-monitoring of blood glucose
2,1,5,M,AMD086,2013-12-31,71,1,Self-monitoring of blood glucose
3,1,5,M,AMD228,2013-12-31,71,1,Integrated management
4,1,5,M,A10BA02,2005-01-18,63,1,metformin


In [None]:
df_merge2.to_csv(path+'balanced_df_PMB.csv', mode='w', header=True, index=False)

In [None]:
dataset_df.head()

Unnamed: 0,idcentro,idana,sesso,codici,data,etaevento,label
0,1,5,M,AMD152,2008-06-20,66,1
1,1,5,M,AMD152,2013-08-27,71,1
2,1,5,M,AMD086,2013-12-31,71,1
3,1,5,M,AMD228,2013-12-31,71,1
4,1,5,M,A10BA02,2005-01-18,63,1


In [None]:
pmb_df = pd.read_csv(path+'balanced_df_PMB.csv')

In [None]:
pmb_df.head()

Unnamed: 0,idcentro,idana,sesso,codici,data,etaevento,label,meaning
0,1,5,M,AMD152,2008-06-20,66,1,Self-monitoring of blood glucose
1,1,5,M,AMD152,2013-08-27,71,1,Self-monitoring of blood glucose
2,1,5,M,AMD086,2013-12-31,71,1,Self-monitoring of blood glucose
3,1,5,M,AMD228,2013-12-31,71,1,Integrated management
4,1,5,M,A10BA02,2005-01-18,63,1,metformin


In [None]:
pmb_df = pmb_df[['idcentro','idana','sesso','data','meaning','etaevento','label']]

In [None]:
pmb_df.head()

Unnamed: 0,idcentro,idana,sesso,data,meaning,etaevento,label
0,1,5,M,2008-06-20,Self-monitoring of blood glucose,66,1
1,1,5,M,2013-08-27,Self-monitoring of blood glucose,71,1
2,1,5,M,2013-12-31,Self-monitoring of blood glucose,71,1
3,1,5,M,2013-12-31,Integrated management,71,1
4,1,5,M,2005-01-18,metformin,63,1
