In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, classification_report

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from etl import transform

In [96]:
def plot_corre_heatmap(corr):
    """Draw an annotated correlation heatmap and display it.

    Args:
        corr: Matrix-like object accepted by ``sns.heatmap``; the notebook
            passes the result of ``DataFrame.corr()``.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    heatmap_options = dict(
        cbar=True,
        square=False,
        annot=True,
        fmt='.2f',
        annot_kws={'size': 15},
        cmap='coolwarm',
    )

    plt.figure(figsize=(16, 14))
    sns.heatmap(corr, **heatmap_options)

    # Tilt the tick labels on both axes.
    plt.xticks(rotation=45)
    plt.yticks(rotation=45)

    # Work around a small display problem: move the bottom limit by +0.5 and
    # the top limit by -0.5.
    # NOTE(review): presumably this stops the outer rows from being clipped on
    # some matplotlib versions -- confirm it is still needed.
    bottom, top = plt.ylim()
    plt.ylim(bottom + 0.5, top - 0.5)
    plt.show()

In [97]:
# Read the training and test CSV files into DataFrames; the paths are relative
# to the directory the notebook is run from.
data, data2 = map(
    pd.read_csv,
    (
        '../datasets/hospitalizaciones_train.csv',
        '../datasets/hospitalizaciones_test.csv',
    ),
)

In [98]:
# Columns handed to the project-local `transform` helper. The target column
# 'Stay (in days)' is part of this list and is also named through `target=`.
columns = [
    'Department',
    'Ward_Facility_Code',
    'doctor_name',
    'Age',
    'gender',
    'Stay (in days)',
]

# The helper's four return values are unpacked as train/test features and
# train/test targets.
# NOTE(review): presumably `array=True` yields NumPy arrays and `scalar=True`
# applies feature scaling -- confirm against etl.transform.
X_Train, X_Test, Y_Train, Y_Test = transform(
    data,
    columns,
    target='Stay (in days)',
    test_size=0.20,
    random_state=42,
    array=True,
    scalar=True,
)

In [99]:
# Report the feature-matrix shapes and the number of target values per split.
print(
    f'X_Train = {X_Train.shape}, X_Test = {X_Test.shape}\n'
    f'Y_Train = {Y_Train.size},      Y_Test = {Y_Test.size}'
)

X_Train = (288, 5), X_Test = (73, 5)
Y_Train = 288,      Y_Test = 73


In [None]:
# Correlate only the numeric/boolean columns of the raw frame.
# `DataFrame.corr()` no longer drops non-numeric columns silently: since
# pandas 2.0 it raises when it meets them, and the raw CSV appears to hold text
# columns (e.g. 'doctor_name'). Selecting by dtype first works on every pandas
# version, whereas the `numeric_only=` keyword only exists from 1.5 onwards.
corr = data.select_dtypes(include=['number', 'bool']).corr()
plot_corre_heatmap(corr)

In [8]:
# Fit a default-configured logistic regression on the training split
# (`fit` returns the estimator itself, so the call can be chained).
logistic_model = LogisticRegression().fit(X_Train, Y_Train)

# Predicted labels for the held-out split.
Y_Pred_logistic = logistic_model.predict(X_Test)

In [30]:
# Linear-kernel support vector classifier with a fixed seed, fitted on the
# training split (`fit` returns the estimator, so the call can be chained).
svc_model = SVC(kernel='linear', random_state=42).fit(X_Train, Y_Train)

# Predicted labels for the held-out split.
Y_Pred_svc = svc_model.predict(X_Test)

In [41]:
# Entropy-criterion decision tree capped at depth 4 with a fixed seed, fitted
# on the training split (`fit` returns the estimator, so the call is chained).
dtc_model = DecisionTreeClassifier(
    max_depth=4,
    criterion='entropy',
    random_state=42,
).fit(X_Train, Y_Train)

# Predicted labels for the held-out split.
Y_Pred_dtc = dtc_model.predict(X_Test)

In [100]:
# 7-nearest-neighbours classifier fitted on the training split
# (`fit` returns the estimator itself, so the call can be chained).
knn_model = KNeighborsClassifier(n_neighbors=7).fit(X_Train, Y_Train)

# Predicted labels for the held-out split.
Y_Pred_knn = knn_model.predict(X_Test)

In [None]:
# Recall and accuracy on the held-out split for every fitted model.
# One loop with fixed-width formatting replaces four copy-pasted print lines
# whose spacing had been tuned by hand (and whose label was misspelled "Recal").
predictions_by_model = {
    'Logistic': Y_Pred_logistic,
    'SVC': Y_Pred_svc,
    'DecisionTree': Y_Pred_dtc,
    'KNN': Y_Pred_knn,
}

for model_label, model_preds in predictions_by_model.items():
    recall = recall_score(Y_Test, model_preds)
    accuracy = accuracy_score(Y_Test, model_preds)
    # Always two decimals so the columns line up whatever the values are.
    print(f'{model_label + ":":<16}Recall = {recall:<10.2f}Accuracy = {accuracy:.2f}')

# Blank separator line, then the detailed per-class report for the KNN model.
print()
print(classification_report(Y_Test, Y_Pred_knn))

In [104]:
# Feature columns taken from the second CSV frame; no target column is listed
# and no `target=` is passed for this call.
columns_data2 = [
    'Department',
    'Ward_Facility_Code',
    'doctor_name',
    'Age',
    'gender',
]

# NOTE(review): the name `eval` shadows Python's built-in `eval`; it is kept
# because the prediction cell reads this variable.
# NOTE(review): presumably `scalar=True` fits the scaling on this frame rather
# than reusing what was fitted on the training data -- confirm in etl.transform.
eval = transform(
    data2,
    columns_data2,
    array=True,
    scalar=True,
)

In [108]:
# Predict with the fitted KNN model and store the labels in a one-column frame
# named 'pred'.
predition = pd.DataFrame(knn_model.predict(eval), columns=['pred'])

# Write the predictions without the index column.
# NOTE(review): the output file name has no extension -- confirm that is the
# intended name.
predition.to_csv('JoseAcevedo6', index=False)

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
cm = confusion_matrix(Y_Test, Y_Pred_logistic)

fig, ax = plt.subplots(figsize=(8, 4))
ax.matshow(cm)
ax.set_title('Matriz de Confusión', fontsize=20)
ax.set_ylabel('Etiqueta Verdadera', fontsize=15)
ax.set_xlabel('Etiqueta Predicha', fontsize=15)

# Write every count at the centre of its cell (text x = column, y = row).
n_rows, n_cols = cm.shape
for row in range(n_rows):
    for col in range(n_cols):
        ax.text(col, row, '{:0.1f}'.format(cm[row, col]), ha='center', va='center')

In [None]:
# Score entropy-criterion decision trees of increasing depth on both splits to
# see where the tree starts to over-fit.
train_prec = []
eval_prec = []
max_deep_list = list(range(2, 15))

for deep in max_deep_list:
    # Fixed seed (the same value dtc_model uses) so the curve is reproducible:
    # scikit-learn permutes the features at every split, so an unseeded tree
    # can differ from run to run.
    clf_3 = DecisionTreeClassifier(max_depth=deep, criterion='entropy', random_state=42)
    clf_3.fit(X_Train, Y_Train)
    # For a classifier, `score` is the mean accuracy on the given data.
    train_prec.append(clf_3.score(X_Train, Y_Train))
    eval_prec.append(clf_3.score(X_Test, Y_Test))

# Plot the training and test scores against tree depth.
plt.figure(figsize=(12,7))
plt.plot(max_deep_list, train_prec, color='r', label='Set de entrenamiento')
plt.plot(max_deep_list, eval_prec, color='b', label='Set de testeo')
plt.title('Gráfico de ajuste del árbol de decision', fontsize = 15)
plt.legend()
plt.ylabel('Precisión')
plt.xlabel('Profundidad')
plt.show()