In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, RepeatedKFold, RepeatedStratifiedKFold, StratifiedKFold

import sklearn.metrics as metrics

import warnings

# Blanket filter: no category or module is given, so every warning raised after
# this line is suppressed.
warnings.filterwarnings("ignore")

In [None]:
##### Importo los datos
df = pd.read_csv("hospital_train.csv")

##### Reemplazo los nombres de las columnas
column_names = pd.read_csv("columns_meaning.csv", index_col = 0)
column_names_dict = column_names.to_dict()
df.columns = column_names_dict["Description"].values()

##### Paso las variables nominales a dummies
df = pd.get_dummies(df, prefix = ["hospital_type", "hospital_city", "hospital_region", "department", "ward_type", "ward_facility"], columns = ["Unique code for the type of Hospital", "City Code of the Hospital", "Region Code of the Hospital", "Department overlooking the case", "Code for the Ward type", "Code for the Ward Facility"])

##### Mapeo las variables ordinales
# Admission Type registered by the Hospital
dict_1 = {
    "Emergency" : 1,
    "Trauma" : 2,
    "Urgent" : 3
}

# Severity of the illness recorded at the time of admission
dict_2 = {
    "Minor" : 1,
    "Moderate" : 2,
    "Extreme" : 3
}

# Age of the patient
dict_3 = {
    "0-10" : 1,
    "11-20" : 2,
    "21-30" : 3,
    "31-40" : 4,
    "41-50" : 5,
    "51-60" : 6,
    "61-70" : 7,
    "71-80" : 8,
    "81-90" : 9,
    "91-100" : 10,
}

# Stay Days by the patient
dict_4 = {
    "0-10" : 1,
    "11-20" : 2,
    "21-30" : 3,
    "31-40" : 4,
    "41-50" : 5,
    "51-60" : 6,
    "61-70" : 7,
    "71-80" : 8,
    "81-90" : 9,
    "91-100" : 10,
    "More than 100 Days" : 11
}

df["admission_type_enc"] = df["Admission Type registered by the Hospital"].map(dict_1)
df["illness_severity_enc"] = df["Severity of the illness recorded at the time of admission"].map(dict_2)
df["admission_type_enc"] = df["Age of the patient"].map(dict_3)
df["target_enc"] = df["Stay Days by the patient"].map(dict_4)

##### Elimino las variables que no necesito
df.drop(["Admission Type registered by the Hospital", "Severity of the illness recorded at the time of admission", "Age of the patient", "Stay Days by the patient"], axis = 1, inplace = True)

##### Elimino los NaNs
#df.dropna(inplace = True)

# Condition of Bed in the Ward 
df.iloc[:, 3] = df.iloc[:, 3].fillna(df.iloc[:, 3].mode()[0])
# City Code for the patient
df.iloc[:, 5] = df.iloc[:, 5].fillna(df.iloc[:, 5].mode()[0])

df.head(2)

In [None]:
# Independent variables
X = np.array(df.drop("target_enc", axis = 1))

# Dependent variable
y = np.array(df["target_enc"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

# Cross validation
#kfold = RepeatedKFold(n_splits = 10, n_repeats = 1, random_state = 42)
kfold = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 1, random_state = 42)

# Model
model = LogisticRegression(max_iter = 1000, n_jobs = -1)

In [None]:
train_scores = []
val_scores = []
count = 1

for (train, val) in kfold.split(X_train, y_train):

    #print(train)
    #print(val)

    # Train-Validation sets
    x_t, y_t = X_train[train], y_train[train]
    x_v, y_v = X_train[val], y_train[val]


    # Internal structure
    y_t_unique, y_t_counts = np.unique(y_t, return_counts=True)
    y_v_unique, y_v_counts = np.unique(y_v, return_counts=True)

    # Training
    model.fit(x_t, y_t)

    # Scores
    train_score = model.score(x_t, y_t)
    val_score = model.score(x_v, y_v)

    train_scores.append(train_score)
    val_scores.append(val_score)

    print(f"Model {count}")
    print("-" * 25)
    print("Set structure:")
    print("Train structure:", dict(zip(y_t_unique, y_t_counts / len(y_t))))
    print("Validation structure:", dict(zip(y_v_unique, y_v_counts / len(y_v))))
    print("-" * 25)
    print("train score:", train_score)
    print("test score:", val_score)
    print("#" * 75)

    count += 1

In [None]:
# Training with full train_data
model.fit(X_train, y_train)

# Internal structure
y_train_unique, y_train_counts = np.unique(y_train, return_counts=True)
y_test_unique, y_test_counts = np.unique(y_test, return_counts=True)

# Scores
train_score_ = model.score(X_train, y_train)
test_score_ = model.score(X_test, y_test)

# Prediction
prediction = model.predict(X_test)

# Confusion matrix
cm = metrics.confusion_matrix(y_test, prediction)

print("Train set model:")
print("#" * 50)
print("\nTrain structure:", dict(zip(y_train_unique, y_train_counts / len(y_train) * 100)))
print("Train score:", train_score_)
print("#" * 50)
print("\nTest set model:")
print("Validation structure:", dict(zip(y_test_unique, y_test_counts / len(y_test) * 100)))
print("Test score:", test_score_)
print("#" * 50)
print("\nConfusion matrix\n", cm)

In [None]:
plt.figure(figsize = (12, 12))

sns.heatmap(cm / len(y_test), annot = True, linewidths = .5, square = True, cmap = "Blues_r")

plt.xticks(range(0, 11), range(1, 12))
plt.yticks(range(0, 11), range(1, 12))

plt.xlabel("Actual label")
plt.ylabel("Predicted label")

plt.title(f"Test score: {test_score_}")

plt.show()

# Submission

In [None]:
X_pred = pd.read_csv("data/hospital_test.csv")

column_names = pd.read_csv("data/columns_meaning.csv", index_col = 0)[:-1]
column_names_dict = column_names.to_dict()
X_pred.columns = column_names_dict["Description"].values()

##### Paso las variables nominales a dummies
X_pred = pd.get_dummies(X_pred, prefix = ["hospital_type", "hospital_city", "hospital_region", "department", "ward_type", "ward_facility"], columns = ["Unique code for the type of Hospital", "City Code of the Hospital", "Region Code of the Hospital", "Department overlooking the case", "Code for the Ward type", "Code for the Ward Facility"])

##### Mapeo las variables ordinales
# Admission Type registered by the Hospital
dict_1 = {
    "Emergency" : 1,
    "Trauma" : 2,
    "Urgent" : 3
}

# Severity of the illness recorded at the time of admission
dict_2 = {
    "Minor" : 1,
    "Moderate" : 2,
    "Extreme" : 3
}

# Age of the patient
dict_3 = {
    "0-10" : 1,
    "11-20" : 2,
    "21-30" : 3,
    "31-40" : 4,
    "41-50" : 5,
    "51-60" : 6,
    "61-70" : 7,
    "71-80" : 8,
    "81-90" : 9,
    "91-100" : 10,
}

# Stay Days by the patient
dict_4 = {
    "0-10" : 1,
    "11-20" : 2,
    "21-30" : 3,
    "31-40" : 4,
    "41-50" : 5,
    "51-60" : 6,
    "61-70" : 7,
    "71-80" : 8,
    "81-90" : 9,
    "91-100" : 10,
    "More than 100 Days" : 11
}

X_pred["admission_type_enc"] = X_pred["Admission Type registered by the Hospital"].map(dict_1)
X_pred["illness_severity_enc"] = X_pred["Severity of the illness recorded at the time of admission"].map(dict_2)
X_pred["admission_type_enc"] = X_pred["Age of the patient"].map(dict_3)

##### Elimino las variables que no necesito
X_pred.drop(["Admission Type registered by the Hospital", "Severity of the illness recorded at the time of admission", "Age of the patient"], axis = 1, inplace = True)

# Condition of Bed in the Ward 
X_pred.iloc[:, 3] = X_pred.iloc[:, 3].fillna(df.iloc[:, 3].mode()[0])
# City Code for the patient
X_pred.iloc[:, 5] = X_pred.iloc[:, 5].fillna(df.iloc[:, 5].mode()[0])

X_pred.head()

In [None]:
# Summary of the submission features (dtypes and non-null counts) to check before predicting
X_pred.info()

In [None]:
# Inverse of the stay-days encoding: integer class -> original label
dict_5 = {
    1 : "0-10",
    2 : "11-20",
    3 : "21-30",
    4 : "31-40",
    5 : "41-50",
    6 : "51-60",
    7 : "61-70",
    8 : "71-80",
    9 : "81-90",
    10 : "91-100",
    11 : "More than 100 Days"
}

# Predict first, then build the submission as a separate frame. Aliasing X_pred and
# adding a "prediction" column to it would change the feature matrix itself, so
# re-running this cell would hand the model one column too many.
prediction_labels = pd.Series(model.predict(X_pred), index = X_pred.index).map(dict_5)

to_submit = pd.DataFrame({
    "id" : X_pred["Case_ID registered in Hospital"],
    "days" : prediction_labels,
}).set_index("id")

to_submit

In [None]:
# Write the submission (indexed by "id") to disk
submission_path = "submissions/submission2.csv"
to_submit.to_csv(submission_path)