This notebook cleans the columns (feature selection) of the table `df_master_X`, where X is the number of months set in the variable `months`. The end of the notebook contains several plots that were useful for understanding the data.

After the cleaning, the train and test tables for the selected number of months are saved to CSV files.

In [None]:
# Mount Google Drive at /content/drive (the data directory used by this notebook lives under that mount point).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Directory under the mounted Drive that the CSV tables are read from and written to.
path = '/content/drive/MyDrive/TFG ICO/Notebooks/Tables/'

In [None]:
# Number of months; selects which df_master_<months>.csv is loaded and names the output files.
months = 36

### Load the training set
Load the data from the CSV file, split it into training and test sets, and build the dataframe to be analysed from the training set only.

In [None]:
def load(num_months, test_size=0.15, random_state=0):
    """Load the master table for `num_months` and split it into train and test sets.

    Reads ``df_master_<num_months>.csv`` from the module-level ``path``
    directory and splits it with ``train_test_split``, stratifying on the
    ``is_dead`` column.

    Parameters
    ----------
    num_months
        Value formatted into the file name to select the master table.
    test_size : float, default 0.15
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 0
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)``; each frame holds the feature columns
        followed by ``is_dead`` as the last column.

    Raises
    ------
    KeyError
        If the table has no ``is_dead`` column.
    """
    df = pd.read_csv(path + 'df_master_{}.csv'.format(num_months))

    X = df.drop(columns='is_dead')
    y = df['is_dead']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # assign() returns new frames with the target appended as the last column,
    # so the split outputs are never modified in place.
    df_train = X_train.assign(is_dead=y_train)
    df_test = X_test.assign(is_dead=y_test)

    print(f'For {num_months} months... \nOriginal dataframe: {len(df)} patients')
    print('Training set:', len(df_train), 'patients')
    print('Test set:', len(df_test), 'patients')

    return df_train, df_test

# `df` is the training set (analysed in the rest of the notebook); `df_test` is the held-out test set.
df, df_test = load(months)
df.head()

In [None]:
# Number and percentage of dead patients in the training dataframe.
n_patients = len(df)
num_dead = df['is_dead'].sum()
perc_dead = round(100 * num_dead / n_patients, 1)
print("There is a {}% of dead patients in this dataframe ({} out of {})".format(perc_dead, num_dead, n_patients))

Generate descriptive statistics.

In [None]:
# One row per described column, extended with the count and percentage of
# missing values relative to the number of rows in `df`.
n_rows = len(df)
df_descr = df.describe().T
df_descr['count_null'] = n_rows - df_descr['count']
df_descr['perc_null'] = (100 * df_descr['count_null'] / n_rows).round(2)
df_descr.head()

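Find the columns that have very few non-missing values (more than `threshold_null`% missing). Such a column is dropped unless more than `threshold_dead`% of its non-missing rows correspond to dead patients; the columns that are kept are shown together with that percentage.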

In [None]:
threshold_null = 99  # In %: a column is a drop candidate when more than this share of its values is missing
threshold_dead = 50  # In %: a candidate is kept only if more than this share of its non-missing rows are dead
target_col = 'is_dead'

# Candidate columns: those that are almost entirely empty in the training set.
df_aux = df_descr[df_descr.perc_null > threshold_null]
cols_to_drop = []

for col in df_aux.index:
    if col == target_col:
        # The label is read below and by later cells, so it must never be dropped.
        continue

    rows_not_empty = df[df[col].notna()]
    if len(rows_not_empty) == 0:
        # Completely empty column.
        cols_to_drop.append(col)
        continue

    # Loop-specific names, so the dataframe-wide num_dead / perc_dead computed
    # in an earlier cell are not overwritten.
    col_num_dead = rows_not_empty[target_col].sum()
    col_perc_dead = 100 * col_num_dead / len(rows_not_empty)

    if col_perc_dead <= threshold_dead:
        cols_to_drop.append(col)
    else:
        print('{} not dropped -- {}% dead ({} out of {})'.format(col, round(col_perc_dead, 1), col_num_dead, len(rows_not_empty)))

# Rebind instead of dropping in place, so the frames are not mutated behind
# the outputs already displayed by earlier cells.
df = df.drop(columns=cols_to_drop)
df_descr = df_descr.drop(index=cols_to_drop)

print('\n', len(cols_to_drop), 'dropped columns: \n', sorted(cols_to_drop))

## Save clean tables to csv

Apply the same transformations to the test set by keeping only the columns retained in the training set. Save both tables to CSV.

In [None]:
# Output locations for the cleaned tables.
train_csv = path + f'df_train_{months}.csv'
test_csv = path + f'df_test_{months}.csv'

df.to_csv(train_csv, index=False)

# Restrict the test set to the training set's columns, in the same order.
df_test = df_test.loc[:, df.columns]
df_test.to_csv(test_csv, index=False)

***
# PLOTS

In [None]:
# Frequency of each value in the `is_woman` column.
df['is_woman'].value_counts()

In [None]:
# Percentage histogram of the `is_woman` column.
fig = px.histogram(
    df,
    x='is_woman',
    histnorm='percent',
    title='Proportion of women and men',
)
fig.show()

In [None]:
# Frequency of each value in the `C50` column.
df['C50'].value_counts()

In [None]:
# Percentage histogram of the `C50` column.
fig = px.histogram(
    df,
    x='C50',
    histnorm='percent',
    title='Number of C50 diagnoses per patient',
)
fig.show()

## Boxplots

In [None]:
# Columns whose name is 3 or 7 characters long ('is_dead' is 7 characters, so it is excluded explicitly).
codes_no_date = [name for name in df.columns if len(name) in (3, 7) and name != 'is_dead']
# Columns whose name contains an underscore, except the two `is_*` columns.
codes_dates = [name for name in df.columns if '_' in name and name not in ('is_dead', 'is_woman')]

In [None]:
# One box trace per column listed in `codes_no_date`.
box_traces = [go.Box(x=df[name], name=name) for name in codes_no_date]

fig = go.Figure(data=box_traces)
fig.update_layout(height=1000, showlegend=False)
fig.show()

In [None]:
# One box trace per column listed in `codes_dates`.
box_traces = [go.Box(x=df[name], name=name) for name in codes_dates]

fig = go.Figure(data=box_traces)
fig.update_layout(height=1000, showlegend=False)
fig.show()