# DATA PREPARATION

## SETUP

In [114]:
import pandas as pd
pd.set_option('display.max_columns', None)

## LOAD REAL DATA

In [None]:
from ucimlrepo import fetch_ucirepo 

'''# metadata 
print(diabetes_130_us_hospitals_for_years_1999_2008.metadata) 
  
# variable information 
print(diabetes_130_us_hospitals_for_years_1999_2008.variables) '''
  
# Download the dataset registered under id 296 in the UCI repository.
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296)

# Pull the feature table and the target table out of the fetched payload.
dataset_payload = diabetes_130_us_hospitals_for_years_1999_2008.data
X = dataset_payload.features
y = dataset_payload.targets

# Assemble the working frame: all features plus the "readmitted" target.
diabetes = pd.DataFrame(X)
diabetes["readmitted"] = y

# Preview the first rows.
diabetes.head()

## EXPLORE & PREPROCESS REAL DATA

### Dimension

In [None]:
# Report the (rows, columns) shape of the loaded frame.
frame_shape = diabetes.shape
print(f"Dimension: {frame_shape}")

### Detect sensitive columns

In [None]:
# List every column name so sensitive ones can be spotted by eye.
# The leading "\n" replaces the former "\c", which is not a valid escape
# sequence: it printed a literal backslash and triggers a syntax warning.
print(f"\ncolumns: {diabetes.columns}\n")

# Columns judged sensitive from their names alone (manual selection).
sensitive_column_names = [
    'race',
    'gender',
    'weight',
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
    'payer_code',
    'medical_specialty',
]
print(f"\nSensitive columns: {sensitive_column_names}\n")

###  Check columns values & distribution

In [None]:
# Per-column value distribution printer.
def visualize_columns_distributions(df):
    """
    Print, for every column of the frame, how often each value occurs.

    Parameters:
        df (pd.DataFrame): frame whose columns are summarised.

    Returns:
        None. One table per column is printed.
    """
    for column_name in df.columns:
        print(f"\n\nColumn: {column_name}")

        # Missing values are kept so they show up as their own row.
        frequencies = df[column_name].value_counts(dropna=False)

        # A one-column frame titled 'Items' prints as a tidy table.
        print(frequencies.to_frame(name='Items'))
    
# Print the value distribution of every column of the loaded dataset.
visualize_columns_distributions(diabetes)

###  Generalize 'NaN' race values to 'Other'

In [None]:
import numpy as np

# Rows whose race is missing (or already 'Other') all end up as 'Other'.
race_is_generic = diabetes["race"].isin([np.nan, 'Other'])
diabetes.loc[race_is_generic, "race"] = "Other"

# Check the resulting distribution.
diabetes["race"].value_counts()

###  Remove 'Unknown/Invalid' gender values

In [None]:
# Per the original note, only 3 records carry 'Unknown/Invalid'; their gender
# cannot be determined, so the rows are removed.
unknown_gender = diabetes["gender"] == 'Unknown/Invalid'
diabetes[unknown_gender]

# Drop the affected rows by index label and report the shape on both sides.
print(f"Shape before drop: {diabetes.shape}")
labels_to_drop = diabetes.index[unknown_gender]
diabetes = diabetes.drop(labels_to_drop)

# The row count is expected to shrink by the number of flagged records.
print(f"Shape after drop: {diabetes.shape}")

###  Check for null values per column

In [None]:
# Percentage of missing values in each column.
missing_per_column = diabetes.isna().sum()
missing_per_column * 100 / len(diabetes)

###  Drop "weight" column

In [None]:
# 'weight' is almost entirely null (96.858387% per the original note), so the
# column is discarded.
print(f"Columns before remove {diabetes.shape[1]}")
diabetes = diabetes.drop(columns='weight')
print(f"Columns after remove {diabetes.shape[1]}")

###  Check for variability

Columns that have only one value

In [None]:
# Detect columns without variability
def columns_without_variability(df):
    """
    List the columns that hold fewer than two distinct values.

    Missing values count as a value of their own, so a column mixing one
    value with NaN is NOT reported.

    Parameters:
        df (pd.DataFrame): The original DataFrame.

    Returns:
        list: names of the columns without variability, in column order.
    """
    return [
        name
        for name in df.columns
        if df[name].nunique(dropna=False) < 2
    ]

# Collect the names of the columns that never change value.
cols_without_variability = columns_without_variability(diabetes)

# Report them, then drop them and show the column count on both sides.
print(f"Columns without variability: {cols_without_variability}")
print(f"Columns before remove {diabetes.shape[1]}")
diabetes = diabetes.drop(columns=cols_without_variability)
print(f"Columns after remove {diabetes.shape[1]}")

### Check for single value entries

Columns that have more than one value, but only a single instance for one of the values.

In [None]:
def determine_single_value_entries(df):
    """
    Find columns in which exactly one (non-null) value occurs exactly once.

    Parameters:
        df (pd.DataFrame): frame to scan.

    Returns:
        tuple: (frame with the rows holding those lone values, concatenated
        with a fresh index; list of the flagged column names). The frame is
        empty when no column is flagged.
    """
    singleton_frames = []
    flagged_columns = []

    for name in df.columns:
        frequencies = df[name].value_counts()
        singletons = frequencies[frequencies == 1]

        # Only columns with ONE lone value qualify; zero or several do not.
        if len(singletons) != 1:
            continue

        flagged_columns.append(name)
        lone_value = singletons.index[0]
        singleton_frames.append(df[df[name] == lone_value])

    if not singleton_frames:
        return pd.DataFrame(), flagged_columns

    return pd.concat(singleton_frames, ignore_index=True), flagged_columns

# Locate the columns that contain exactly one value occurring a single time.
matching_rows_df, cols_to_check = determine_single_value_entries(diabetes)

# Show which value is the lone one in each flagged column.
for col in cols_to_check:
    # Count exactly as determine_single_value_entries does (nulls excluded);
    # counting nulls here could make a lone NaN be reported instead of the
    # value that actually flagged the column.
    value_counts = diabetes[col].value_counts()
    print(f"Single value entry in column {col} :  {value_counts[value_counts == 1].index[0]}")


### Correct single value entries

The columns `metformin-pioglitazone`, `glimepiride-pioglitazone`, and `acetohexamide` are extremely imbalanced: each has only one entry that is not "No". Those single rows are removed from the dataframe, and any column left without variability afterwards is dropped as well.

In [None]:
# Drug columns expected to hold a single "Steady" entry each.
steadys = ["metformin-pioglitazone", "glimepiride-pioglitazone", "acetohexamide"]
val = "Steady"

# Count, per row, how many of those columns equal "Steady" and keep the rows
# where that happens in exactly one of them.
steady_hits_per_row = diabetes[steadys].eq(val).sum(axis=1)
filtered_df = diabetes.loc[steady_hits_per_row == 1]

# Drop those rows, then drop whichever columns lost all variability.
print(f"Actual dimension: {diabetes.shape}")
print(f"Removing rows: {filtered_df.index}")
diabetes = diabetes.drop(filtered_df.index)
print(f"After dimension: {diabetes.shape}")

print("Checking column variability")
cols_without_variability = columns_without_variability(diabetes)
print(f"Removing columns: {cols_without_variability}")
diabetes = diabetes.drop(columns=cols_without_variability)
print(f"Later dimension: {diabetes.shape}")

Next, handle the single-entry cases `admission_source_id: 13` and `payer_code: FR` by removing the rows that contain them.

In [None]:
# Remove the rows holding either rare value.
# The mask uses OR rather than XOR: with XOR, a row carrying BOTH rare values
# would have been kept. Computing it once also keeps the reported count and
# the rows actually dropped in sync.
rare_rows = (diabetes["admission_source_id"] == 13) | (diabetes["payer_code"] == "FR")

print(f"Actual dimension: {diabetes.shape}")
print(f"Removing {int(rare_rows.sum())} rows.")
diabetes = diabetes.drop(diabetes.index[rare_rows])
print(f"After dimension: {diabetes.shape}")

###  Check dtype uniformity: 

In [None]:
# Show the dtype held by each column.
column_dtypes = diabetes.dtypes
print(f"\nData information: {column_dtypes}\n")

In [None]:
# Integer-typed columns only.
num_cols = diabetes.select_dtypes(include='int64')

# List the distinct values of each one to confirm they are truly numeric.
for name, series in num_cols.items():
    print(f"\nColumn: {name} values: {series.unique()}")

### Change admission_type_id, discharge_disposition_id  &  admission_source_id  to categoricals

In [None]:
# These three id columns are category codes, not quantities: store them as
# strings so they are no longer treated as numbers.
cols_to_change = ["admission_type_id", "discharge_disposition_id", "admission_source_id"]
for id_column in cols_to_change:
    diabetes[id_column] = diabetes[id_column].astype(str)

diabetes.info()

### Check numerical data

In [None]:
# Refresh the integer-column selection after the dtype change.
num_cols = diabetes.select_dtypes(include='int64')

# Distinct values per remaining integer column.
for name, series in num_cols.items():
    print(f"\nColumn: {name} values: {series.unique()}")

### Check categorical data 

In [None]:
# Names of the object-typed (categorical) columns, as a plain list.
categorical_cols = diabetes.select_dtypes('object').columns.tolist()

# Distinct values per categorical column.
for name in categorical_cols:
    distinct_values = diabetes[name].unique()
    print(f"\nColumn: {name} values: {distinct_values}")

### Check categorical column pair relations

In [None]:
import itertools

# Every unordered pair of categorical columns.
categorical_col_pair = list(itertools.combinations(categorical_cols, 2))

# For each pair, print a table of counts: rows = values of the first column,
# columns = values of the second one (nulls kept, absent combinations as 0).
for first, second in categorical_col_pair:
    print(f"\n{first} distribution per {second}")
    pair_counts = diabetes.groupby(first)[second].value_counts(dropna=False)
    print(pair_counts.unstack().fillna(0))

### Save preprocessed file

In [133]:
import os

# Make sure the output folder exists.
tmp_folder = "./tmp_folder"
os.makedirs(tmp_folder, exist_ok=True)

# Persist the preprocessed frame (index not stored).
refined_path = os.path.join(tmp_folder, "refined_file.parquet")
diabetes.to_parquet(refined_path, engine="pyarrow", index=False)

## CORRECT IMBALANCES

Examine the data distribution for potential imbalances, which will help determine the appropriate corrective actions.

### Visualize data distribution

Categorical data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the preprocessed data written by the "Save preprocessed file" cell.
# The generalized file is only produced further down in the notebook, and this
# section inspects the data BEFORE any generalization is applied.
tmp_folder = "./tmp_folder"
diabetes = pd.read_parquet(os.path.join(tmp_folder, "refined_file.parquet"), engine="pyarrow")

# Select categorical columns
categorical_cols = diabetes.select_dtypes(include='object')

# One count plot per categorical column (iterating a frame yields its names).
for col in categorical_cols:
    # Frequency-sorted categories, computed once and reused below.
    counts = diabetes[col].value_counts()

    fig, ax = plt.subplots(figsize=(10, 5))
    # `order` forces the bars into the same order as the tick labels set
    # below; without it the bars follow order of appearance and the labels
    # end up attached to the wrong bars.
    sns.countplot(data=diabetes, x=col, order=counts.index, ax=ax)
    ax.set_title(col)

    # Leave a 10% margin above the tallest bar.
    ax.set_ylim(0, counts.max() * 1.1)

    # Rotated labels, one per bar.
    ax.set_xticks(range(len(counts)))
    ax.set_xticklabels(counts.index, rotation=45, ha='right')

    # adjust & show
    plt.tight_layout()
    plt.show()

Numerical data.

In [None]:
# Integer-typed columns are treated as the continuous ones.
continuous_columns = diabetes.select_dtypes(include='int64')

# One histogram per column.
for col in continuous_columns:
    series = diabetes[col]
    lowest, highest = int(series.min()), int(series.max())

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(data=diabetes, x=col, ax=ax, bins=30, alpha=0.5)

    # Title and axis labels in one call.
    ax.set(title=f'Distribution of {col}', xlabel=col, ylabel='Frequency')

    # A tick every 2 units across the observed range.
    ax.set_xticks(range(lowest, highest + 1, 2))

    plt.tight_layout()
    plt.show()

### Detect imbalanced columns

In [None]:
def detect_imbalanced_columns(data, threshold=0.1):
    """
    Return the columns in which at least one value is very rare or very
    dominant.

    Parameters:
        data (pd.DataFrame): frame to inspect.
        threshold (float): share limit, 0.1 (10%) by default. A column is
                           reported when any of its values (nulls included)
                           has a share below `threshold` or above
                           `1 - threshold`.

    Returns:
        list: names of the imbalanced columns, in column order.
    """
    upper_limit = 1 - threshold
    flagged = []

    for name in data.columns:
        shares = data[name].value_counts(normalize=True, dropna=False)
        too_rare = shares < threshold
        too_dominant = shares > upper_limit

        if (too_rare | too_dominant).any():
            flagged.append(name)

    return flagged

def show_values_proportions(df, col):
    """
    Print, side by side, the absolute count and the relative share of every
    value found in one column (missing values included).

    Parameters:
        df (pd.DataFrame): frame holding the column.
        col (str): name of the column to summarise.

    Returns:
        None. The table is only printed.
    """
    column = df[col]
    absolute = column.value_counts(dropna=False)
    relative = column.value_counts(normalize=True, dropna=False)

    # Both series go into one table; any NaN cell is shown as 0.
    summary = pd.DataFrame({'Values': absolute, 'Proportions': relative})
    summary = summary.fillna(0)

    print(summary)

# Start again from the preprocessed file.
refined_path = os.path.join(tmp_folder, "refined_file.parquet")
diabetes = pd.read_parquet(refined_path, engine="pyarrow")

# Flag imbalanced columns with a 10% threshold and print their distributions.
imbalanced_columns = detect_imbalanced_columns(diabetes, threshold=0.1)
if imbalanced_columns:
    total_columns = len(diabetes.columns)
    flagged_total = len(imbalanced_columns)
    print(f"Out of {total_columns} columns, {flagged_total} are imbalanced.")

    for col in imbalanced_columns:
        print(f"\nColumn: {col}")
        show_values_proportions(diabetes, col)

### Apply generalization

The numerical columns `number_emergency`, `num_procedures`, `number_inpatient`, and `number_outpatient` show a large number of instances with a value of zero, while the remaining instances are spread thinly across the other values. Generalizing each of them into two groups (zero versus at least one, e.g. whether there have been emergency visits or not) seems like a good action.

Similarly, the generalization or grouping of other variables such as `age` and `readmitted` can help better balance the proportions of the data.

**Readmitted column**

Generalized to `Yes` and `No` data.

In [None]:
# Work on a copy so the loaded frame stays available for comparison.
df1 = diabetes.copy()

# COLUMN: "readmitted" -> "No" for the "NO" label, "Yes" for everything else.
col = "readmitted"
never_readmitted = df1[col] == "NO"
df1.loc[~never_readmitted, col] = "Yes"
df1.loc[never_readmitted, col] = "No"

# Compare the distribution before and after the change.
print("Real:")
show_values_proportions(diabetes, col)
print("\nNew:")
show_values_proportions(df1, col)

**Age column**

Generalized from 10 groups to 5.

In [None]:
"""
COLUMN: "age" 
GENERALIZE TO
[0-20) == [0-10) & [10-20)
[20-40) == [20-30) & [30-40)
[40-60) == [40-50) & [50-60)
[60-80) == [60-70) & [70-80)
[80-100) == [80-90) & [90-100)
"""
col = "age"

# Target 20-year bucket -> the two 10-year buckets it absorbs.
age_groups = {
    "[0-20)": ["[0-10)", "[10-20)"],
    "[20-40)": ["[20-30)", "[30-40)"],
    "[40-60)": ["[40-50)", "[50-60)"],
    "[60-80)": ["[60-70)", "[70-80)"],
    "[80-100)": ["[80-90)", "[90-100)"],
}
for wide_bucket, narrow_buckets in age_groups.items():
    df1.loc[df1[col].isin(narrow_buckets), col] = wide_bucket

# Compare the distribution before and after the change.
print("Real:")
show_values_proportions(diabetes, col)
print("\nNew:")
show_values_proportions(df1, col)

**Number_outpatient column**

Generalized to categorical `Yes` and `No` data. The column name changes to `outpatient`.

In [None]:
# Turn number_outpatient into a Yes/No flag stored in a new column.
col = "number_outpatient"
col1 = "outpatient"
has_any = df1[col] != 0
df1[col1] = has_any.map({True: "Yes", False: "No"})

# Compare the original counts with the new flag.
print("Real:")
show_values_proportions(diabetes, col)
print("\nNew:")
show_values_proportions(df1, col1)

**Number_inpatient column**

Generalized to categorical `Yes` and `No` data. The column name changes to `inpatient`.

In [None]:
# Turn number_inpatient into a Yes/No flag stored in a new column.
col = "number_inpatient"
col1 = "inpatient"
has_any = df1[col] != 0
df1[col1] = has_any.map({True: "Yes", False: "No"})

# Compare the original counts with the new flag.
print("Real:")
show_values_proportions(diabetes, col)
print("\nNew:")
show_values_proportions(df1, col1)

**Num_procedures column**

Generalized to categorical `Yes` and `No` data. The column name changes to `procedures`.

In [None]:
# Turn num_procedures into a Yes/No flag stored in a new column.
col = "num_procedures"
col1 = "procedures"
has_any = df1[col] != 0
df1[col1] = has_any.map({True: "Yes", False: "No"})

# Compare the original counts with the new flag.
print("Real:")
show_values_proportions(diabetes, col)
print("\nNew:")
show_values_proportions(df1, col1)

**Number_emergency column**

Generalized to categorical `Yes` and `No` data. The column name changes to `emergencies`.

In [None]:
# Turn number_emergency into a Yes/No flag stored in a new column.
col = "number_emergency"
col1 = "emergencies"
has_any = df1[col] != 0
df1[col1] = has_any.map({True: "Yes", False: "No"})

# Compare the original counts with the new flag.
print("Real:")
show_values_proportions(diabetes, col)
print("\nNew:")
show_values_proportions(df1, col1)

### Remove original columns

In [None]:
# The numeric count columns are now replaced by their Yes/No counterparts.
col_list = ["number_emergency", "num_procedures", "number_inpatient", "number_outpatient"]

# This line runs BEFORE the drop, so the message says "before"; the previous
# wording ("after") mislabeled the figure.
print(f"Column length before: {len(df1.columns)} to eliminate: {len(col_list)}")
df1 = df1.drop(columns=col_list)
print(f"Column length now: {len(df1.columns)}")

### Save generalized file

In [144]:
# Persist the generalized frame next to the refined one (index not stored).
generalized_path = os.path.join(tmp_folder, "generalized_file.parquet")
df1.to_parquet(generalized_path, engine="pyarrow", index=False)

### Balance minority classes

Stratify data and oversample minority classes.

In [None]:
"""from imblearn.over_sampling import RandomOverSampler
import pandas as pd


# set dataframe as reference
df = diabetes.copy()
col = "race"
X = df

# Contar las instancias de cada clase en la columna 'race'
val_counts = X[col].value_counts()

# Determinar la cantidad de instancias de la clase mayor (Caucasian)
max_count = val_counts.max()

# Calcular la cantidad de instancias necesarias para cada clase
sampling_strategy = {}
for val, count in val_counts.items():
    if val in ['Hispanic', 'Asian']:
        sampling_strategy[val] = int(max_count * (count / max_count) * 2)  # Duplicar para Hispanic y Asian

# Inicializar el oversampler con la nueva estrategia de muestreo
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)

# Realizar el sobre-muestreo
X_resampled, y_resampled = ros.fit_resample(X, X[col])

# comparar ambos resultados
comparison_df = pd.DataFrame({
        'Real': df[col].value_counts(normalize = True, dropna=False),
        "OverSampled": X_resampled[col].value_counts(normalize = True, dropna=False)
    }).fillna(0)

print (f"Col: {col} \n{comparison_df}")

df = X_resampled"""

One-hot encoding to convert categorical data to numerical.

In [None]:
# One-hot helper
def apply_one_hot(df, cols):
    """
    One-hot encode the given columns of a frame.

    Parameters:
        df (pd.DataFrame): frame to encode.
        cols (list-like): names of the columns to expand into indicators.

    Returns:
        pd.DataFrame: the frame with `cols` replaced by indicator columns;
        the first level of each encoded column is dropped.
    """
    encoded = pd.get_dummies(df, columns=cols, drop_first=True)
    return encoded

# Reload the generalized data.
generalized_path = os.path.join(tmp_folder, "generalized_file.parquet")
diabetes = pd.read_parquet(generalized_path, engine="pyarrow")

# Separate the target from the features.
y = diabetes['readmitted']
X = diabetes.drop(columns='readmitted')

# Object-typed feature columns are the ones to encode.
categorical_cols = X.select_dtypes(include=['object']).columns

# Encode them and display the result.
X_encoded = apply_one_hot(X, categorical_cols)
X_encoded

Stratify data and oversample minority classes

In [271]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Hold out 20% of the rows, keeping the target proportions in both parts.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)

# Oversample the minority class of the TRAIN part only.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Put features and target back together in a single frame and display it.
features_part = pd.DataFrame(X_train_balanced)
target_part = pd.Series(y_train_balanced, name='readmitted')
balanced_train_df = pd.concat([features_part, target_part], axis=1)
balanced_train_df

### Reverse one-hot encoding back to categorical columns

In [None]:
"""#  TODO: Reverse one-hot encoding
categorical_df = pd.DataFrame()

for col in X_encoded.columns:
    # Get the base name of the original categorical column (before one-hot encoding)
    base_name = col.split('_')[0]
    categorical_df[base_name] = X_encoded[X_encoded.columns].idxmax(axis=1).str.replace(base_name + '_', '')

# If needed, drop duplicates to retain only one instance of the original categorical column
categorical_df = categorical_df.drop_duplicates()

# Create a new DataFrame with the original categorical columns
original_categorical_df = pd.concat([categorical_df, X_encoded.drop(columns=X_encoded.columns)], axis=1)"""

### Compare balanced data with real

In [None]:
"""#  TODO: comparar ambos resultados
for col in diabetes.columns:
    comparison_df = pd.DataFrame({
            'Real': diabetes[col].value_counts(normalize = True, dropna=False),
            "Balanced": balanced_train_df[col].value_counts(normalize = True, dropna=False)
        }).fillna(0)

    print (comparison_df)"""

## SAVE BALANCED DATA