# **Classification Challenge**

`Tópicos Especiais em Computação VIII`

Using Random Forests to predict hospital readmissions of diabetic patients

*Luiz Henrique Rigo Faccio*

## **Importing Libraries and loading dataset**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Load the raw encounter table and the ID -> description lookup file
folder = "diabetes"
diabetes = pd.read_csv(f'{folder}/diabetic_data.csv')
mapping = pd.read_csv(f'{folder}/IDS_mapping.csv')

# The lookup file is read as one frame; each lookup table is a fixed row range of it.
# The discharge and admission-source slices are re-indexed from 0; the admission-type
# slice already starts at row 0.
# NOTE(review): the row ranges are hard-coded. The 3-row gap before row 10 suggests the
# first table may have 8 data rows (0:8), not 7 -- confirm against IDS_mapping.csv.
admission_type_mapping = mapping.iloc[0:7]
discharge_disposition_mapping = mapping.iloc[10:40].reset_index(drop=True)
admission_source_mapping = mapping.iloc[42:67].reset_index(drop=True)

## **Observing the dataset**

In [None]:
# Quick look at the raw dataset: size, dtypes / non-null counts and summary statistics
print(diabetes.shape)
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
diabetes.info()
print(diabetes.describe(include='all'))
print()
# List the distinct values of every column to spot placeholders for missing data
for c in diabetes.columns:
    print(f"Column {c}", end="\n\t\t")
    print(diabetes[c].unique(), end="\n\n")

## **Data treatment**

The `pre_process_diabetes_data()` function can be used to process new information so that it can be used in the model

In [None]:
def pre_process_diabetes_data(df):
    """
        Args:
            df (pandas DataFrame): Dataset to be processed. It is not modified; all
                steps operate on copies.

        Returns:
            final (pandas DataFrame): Processed dataset: scaled numerical columns,
                one-hot encoded categorical columns (low-variance dummies removed)
                and the encoded `readmitted` target as the last column.

        This function processes diabetes data from the original form to the model-ready form.
        It relies on the module-level lookup frames `admission_type_mapping`,
        `discharge_disposition_mapping` and `admission_source_mapping`.
    """

    # Dropping IDs and unnecessary columns and standardizing the missing values.
    # Non in-place operations are used so the caller's DataFrame is left untouched
    # and the function can be called again on the same input.

    df = df.drop(columns=['encounter_id', 'patient_nbr', 'payer_code'])
    df = df.replace(["?", 'Unknown/Invalid'], pd.NA)

    # Also dropping columns with too little information (mostly null values)

    df = df.drop(columns=['weight', 'medical_specialty', 'max_glu_serum', 'A1Cresult'])

    # Joining tables
    # NOTE(review): `join(..., on=<id column>)` matches the id values against the
    # *index labels* of the lookup frames (0-based row positions as built at load
    # time), not against an id column. If the ids in the data are 1-based, every id
    # receives the description of the following row -- confirm against IDS_mapping.csv.

    df = df.join(admission_type_mapping["description"], how='left', on='admission_type_id').rename(columns={"description": "admission_type"}).drop(columns=['admission_type_id'])
    df = df.join(discharge_disposition_mapping["description"], how='left', on='discharge_disposition_id').rename(columns={"description": "discharge_disposition"}).drop(columns=['discharge_disposition_id'])
    df = df.join(admission_source_mapping["description"], how='left', on='admission_source_id').rename(columns={"description": "admission_source"}).drop(columns=['admission_source_id'])

    # Dropping rows with missing values (including ids that found no description)

    df = df.dropna(axis=0, how='any').reset_index(drop=True)

    # Treating the target: NO -> 0, '>30' -> 1, '<30' -> 2.
    # A per-value lookup is used instead of `Series.replace` to avoid pandas'
    # deprecated silent downcasting; labels outside the mapping pass through unchanged.

    target_codes = {'NO': 0, '>30': 1, '<30': 2}
    y = df["readmitted"].map(lambda label: target_codes.get(label, label))
    df = df.drop(columns=['readmitted'])

    # Scaling numerical variables
    # NOTE(review): the scaler is fitted on every row passed in, i.e. before any
    # train/test split done by the caller.

    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # Getting dummies for categorical variables in a single call
    # (columns are named "<column>_<value>", in the original column order)

    categorical_columns = list(df.select_dtypes(include=['object']).columns)
    temp = pd.get_dummies(df[categorical_columns], columns=categorical_columns, dtype=int)
    df = df.drop(columns=categorical_columns)

    # Removing columns with low variance
    # A 0/1 dummy has variance p * (1 - p); it falls below 0.01 when its rarer value
    # appears in fewer than ~1% of the rows (same value for ~99% of the rows).
    # Skipped when there is nothing to select from, since the selector rejects
    # an input without features.

    if temp.shape[1] > 0:
        selector = VarianceThreshold(threshold=0.01)
        kept_values = selector.fit_transform(temp)
        temp = pd.DataFrame(kept_values, columns=temp.columns[selector.get_support()], index=temp.index)

    final = pd.concat([df, temp, y], axis=1)
    return final

In [None]:
# Processing the dataset: run the preprocessing pipeline on the raw frame

diabetes_processed = pre_process_diabetes_data(diabetes)
# Print the per-column summary (all columns listed) together with the memory usage
diabetes_processed.info(verbose=True, memory_usage=True)

## **The model** 

Two different models were used: a Random Forest and a Multi-Layer Perceptron classifier (MLP, a feed-forward neural network; it is referred to as "RNN" in the code)

Rather than training only a single model, a Grid Search was executed to find the best combination of hyperparameters for this problem, for each of the two models

In [None]:
def single_model_Random_Forest(processed_data, target):
    """
        Args:
            processed_data (pandas DataFrame): Processed diabetes dataset to be used for training
            target (str): Target variable name

        Returns:
            rf (RandomForestClassifier): Trained Random Forest model

        This function trains a Random Forest model on the processed diabetes dataset, using predefined hyperparameters.
        The data is split 80/20 (fixed seed) and the accuracy on the held-out 20% is printed.
    """

    # Splitting the data into train and test sets
    X = processed_data.drop(columns=[target])
    y = processed_data[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Creating and training the Random Forest model
    rf = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=8,
        min_samples_leaf=5,
        max_features='sqrt',
        random_state=42
    )
    rf.fit(X_train, y_train)

    # The score is computed on the held-out split, so it is reported as the test score
    print(f"Test score: {rf.score(X_test, y_test)}")

    return rf


In [None]:
def single_model_RNN(processed_data, target):
    """
        Args:
            processed_data (pandas DataFrame): Processed diabetes dataset to be used for training
            target (str): Target variable name

        Returns:
            model (MLPClassifier): Trained MLP model

        Trains a Multi-Layer Perceptron classifier with fixed hyperparameters on an 80/20
        split of the processed dataset and prints its accuracy on the held-out 20%.
        Despite the function name, the model is a feed-forward MLP, not a recurrent network.
    """

    # Separate the label column from the feature columns
    labels = processed_data[target]
    features = processed_data.drop(columns=[target])

    # Fixed-seed 80/20 split
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Predefined hyperparameters of the network
    mlp_settings = {
        "hidden_layer_sizes": (64, 32),   # two hidden layers
        "activation": 'relu',
        "solver": 'adam',
        "max_iter": 230,
        "random_state": 42,
        "verbose": True,                  # print training progress
        "early_stopping": True,
        "n_iter_no_change": 15,
    }
    model = MLPClassifier(**mlp_settings)
    model.fit(train_X, train_y)

    # Accuracy on the held-out split
    accuracy = model.score(test_X, test_y)
    print(f"Acurácia: {accuracy:.4f}")

    return model


In [None]:
# NOTE: disabled grid-search experiment for the Random Forest, kept for reference only.
# NOTE(review): points to check before re-enabling it:
#   - it splits the raw `diabetes` frame, not the preprocessed `diabetes_processed`;
#   - scoring='f1' is the binary F1 scorer, while the target is encoded with three
#     classes (0/1/2), so a multiclass variant such as 'f1_macro' would be required;
#   - the standalone `rf.fit(...)` before the search looks redundant, since
#     GridSearchCV fits its own copies of the estimator.
# # Splitting Data
# X = diabetes.drop(columns=['readmitted'])
# y = diabetes['readmitted']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# param_grid = {
#     'n_estimators': [100, 200, 300],             
#     'max_depth': [None, 10, 20, 30],             
#     'min_samples_split': [2, 5, 10],             
#     'min_samples_leaf': [1, 2, 4],               
#     'max_features': ['sqrt', 'log2', None],      
#     'bootstrap': [True, False],                  
#     'criterion': ['gini', 'entropy']             
# }

# rf = RandomForestClassifier()
# rf.fit(X_train, y_train)


# grid_search = GridSearchCV(rf, param_grid, cv=10, scoring='f1', n_jobs=-1)
# grid_search.fit(X_train, y_train)


# print("\nMelhores parâmetros encontrados:")
# print(grid_search.best_params_)
# print("\nMelhor score de validação cruzada:")
# cv_results_df = pd.DataFrame(grid_search.cv_results_)
# print(cv_results_df[['param_criterion', 'param_max_depth', 'param_min_samples_split', 'mean_test_score', 'std_test_score', 'rank_test_score']])

In [None]:
# Train both models on the processed dataset, using "readmitted" as the target column
rf = single_model_Random_Forest(diabetes_processed, "readmitted")
mlp = single_model_RNN(diabetes_processed, "readmitted")