In [101]:
import pandas as pd
import numpy as np
import itertools
import tensorflow as tf
import joblib
import json
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.svm import SVC
from scipy.stats import norm, mode

# Carga de datos y separacion en train, val y test

In [2]:
raw_data = pd.read_csv("data_titanic_proyecto.csv")
raw_data

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,passenger_class,passenger_sex,passenger_survived
0,1,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,,S,Lower,M,N
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,Upper,F,Y
2,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,,S,Lower,F,Y
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,C123,S,Upper,F,Y
4,5,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,,S,Lower,M,N
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,,S,Middle,M,N
887,888,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,B42,S,Upper,F,Y
888,889,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,W./C. 6607,23.4500,,S,Lower,F,N
889,890,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,C148,C,Upper,M,Y


In [3]:
# Quedanonos unicamente con features utiles
trimmed_data = raw_data[["Age", "SibSp", "Parch", "passenger_class", "passenger_sex", "passenger_survived"]]
trimmed_data

Unnamed: 0,Age,SibSp,Parch,passenger_class,passenger_sex,passenger_survived
0,22.0,1,0,Lower,M,N
1,38.0,1,0,Upper,F,Y
2,26.0,0,0,Lower,F,Y
3,35.0,1,0,Upper,F,Y
4,35.0,0,0,Lower,M,N
...,...,...,...,...,...,...
886,27.0,0,0,Middle,M,N
887,19.0,0,0,Upper,F,Y
888,,1,2,Lower,F,N
889,26.0,0,0,Upper,M,Y


In [4]:
trimmed_data.isna().sum()

Age                   177
SibSp                   0
Parch                   0
passenger_class         0
passenger_sex           0
passenger_survived      0
dtype: int64

Tenemos un problema, la edad 177 NaNs, la cual puede ser un indicador util, tiene NaNs, es necesario imputar debido a que tenemos muy poca data con la cual trabajar (Menos de 900 registros) y no nos conviene "tirar" la data

In [5]:
trimmed_data[trimmed_data["Age"].isna()].describe(include="all")

Unnamed: 0,Age,SibSp,Parch,passenger_class,passenger_sex,passenger_survived
count,0.0,177.0,177.0,177,177,177
unique,,,,3,2,2
top,,,,Lower,M,N
freq,,,,136,124,125
mean,,0.564972,0.180791,,,
std,,1.626316,0.534145,,,
min,,0.0,0.0,,,
25%,,0.0,0.0,,,
50%,,0.0,0.0,,,
75%,,0.0,0.0,,,


Podemos ver que la gran mayoria de estos son personas que no tienen padres/hijos abordo (parch) pero tienden a tener algun hermano o ninguno. Son ademas predominantemente hombres de clase baja, asi que se hara una imputacion usando la media de los pasajeros que cumplen estas condiciones

In [6]:
imputed_data = trimmed_data.copy()
imputed_data["Age"] = imputed_data["Age"].fillna(
trimmed_data[  (trimmed_data["SibSp"]<=1) 
             & (trimmed_data["Parch"]==0) 
             & (trimmed_data["passenger_class"]=="Lower") 
             & (trimmed_data["passenger_sex"] == "M") 
             & (~trimmed_data["Age"].isna())]["Age"].mean()
)
imputed_data

Unnamed: 0,Age,SibSp,Parch,passenger_class,passenger_sex,passenger_survived
0,22.000000,1,0,Lower,M,N
1,38.000000,1,0,Upper,F,Y
2,26.000000,0,0,Lower,F,Y
3,35.000000,1,0,Upper,F,Y
4,35.000000,0,0,Lower,M,N
...,...,...,...,...,...,...
886,27.000000,0,0,Middle,M,N
887,19.000000,0,0,Upper,F,Y
888,28.821078,1,2,Lower,F,N
889,26.000000,0,0,Upper,M,Y


In [7]:
imputed_data.isna().sum()

Age                   0
SibSp                 0
Parch                 0
passenger_class       0
passenger_sex         0
passenger_survived    0
dtype: int64

Usando one hot encoding para variables categoricas (se descarta un nivel para el feature de sexo, pues es binario)

In [8]:
encoded_data = imputed_data.copy()
encoded_data["passenger_sex"] =  np.where(encoded_data["passenger_sex"] == "M", 1, 0)
encoded_data["passenger_survived"] = np.where(encoded_data["passenger_survived"] == "Y", 1, 0)
encoded_data

Unnamed: 0,Age,SibSp,Parch,passenger_class,passenger_sex,passenger_survived
0,22.000000,1,0,Lower,1,0
1,38.000000,1,0,Upper,0,1
2,26.000000,0,0,Lower,0,1
3,35.000000,1,0,Upper,0,1
4,35.000000,0,0,Lower,1,0
...,...,...,...,...,...,...
886,27.000000,0,0,Middle,1,0
887,19.000000,0,0,Upper,0,1
888,28.821078,1,2,Lower,0,0
889,26.000000,0,0,Upper,1,1


In [9]:
encoded_data = pd.get_dummies(encoded_data, columns = ["passenger_class"])
encoded_data

Unnamed: 0,Age,SibSp,Parch,passenger_sex,passenger_survived,passenger_class_Lower,passenger_class_Middle,passenger_class_Upper
0,22.000000,1,0,1,0,1,0,0
1,38.000000,1,0,0,1,0,0,1
2,26.000000,0,0,0,1,1,0,0
3,35.000000,1,0,0,1,0,0,1
4,35.000000,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...
886,27.000000,0,0,1,0,0,1,0
887,19.000000,0,0,0,1,0,0,1
888,28.821078,1,2,0,0,1,0,0
889,26.000000,0,0,1,1,0,0,1


In [10]:
encoded_data.dtypes

Age                       float64
SibSp                       int64
Parch                       int64
passenger_sex               int32
passenger_survived          int32
passenger_class_Lower       uint8
passenger_class_Middle      uint8
passenger_class_Upper       uint8
dtype: object

Ahora que tenemos puramente data numerica, podemos crear el split de train, val, test

In [11]:
X, y = encoded_data.loc[:, encoded_data.columns != 'passenger_survived'].to_numpy(), encoded_data["passenger_survived"].to_numpy()
X

array([[22.        ,  1.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [38.        ,  1.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [26.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [28.82107843,  1.        ,  2.        , ...,  1.        ,
         0.        ,  0.        ],
       [26.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [32.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ]])

In [12]:
# Necesitaoms un scaler para poder transformar las edades
age_scaler = preprocessing.StandardScaler()
age_scaler.fit(encoded_data["Age"].to_numpy().reshape(-1,1))

sib_scaler = preprocessing.StandardScaler()
sib_scaler.fit(encoded_data["SibSp"].to_numpy().reshape(-1,1))

par_scaler = preprocessing.StandardScaler()
par_scaler.fit(encoded_data["Parch"].to_numpy().reshape(-1,1))

X[:,0] = age_scaler.transform(X[:,0].reshape(-1,1)).reshape(-1)
X[:,1] = sib_scaler.transform(X[:,1].reshape(-1,1)).reshape(-1)
X[:,2] = par_scaler.transform(X[:,2].reshape(-1,1)).reshape(-1)
X

array([[-0.5788475 ,  0.43279337, -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.65197492,  0.43279337, -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       [-0.2711419 , -0.4745452 , -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.05412649,  0.43279337,  2.00893337, ...,  1.        ,
         0.        ,  0.        ],
       [-0.2711419 , -0.4745452 , -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.19041651, -0.4745452 , -0.47367361, ...,  1.        ,
         0.        ,  0.        ]])

In [13]:
y

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

Creando splits de datos originales

In [14]:
X_pre_train, X_test, y_pre_train, y_test = train_test_split(X, y, train_size=0.80, random_state=16004971)

X_train, X_val, y_train, y_val = train_test_split(X_pre_train, y_pre_train, train_size=0.80, random_state=2022)
X_train.shape

(569, 7)

Creando bootstraps con la misma cantidad de observaciones que el set de entrenamiento, debido a que existe reemplazo garantizamos que no se incluira el dataset completo

In [40]:
X_train_m1, y_train_m1 = resample(X_train, y_train, n_samples = 569, random_state = 1337)
X_train_m2, y_train_m2 = resample(X_train, y_train, n_samples = 569, random_state = 42)
X_train_m3, y_train_m3 = resample(X_train, y_train, n_samples = 569, random_state = 24)
X_train_m4, y_train_m4 = resample(X_train, y_train, n_samples = 569, random_state = 25)
X_train_m4.shape

(569, 7)

# Entrenando modelos

In [67]:
# Tambien se definen nombres de columna y lista de corridas
col_names = ["name", "model_params", "train accuracy", "val accuracy", "precision", "recall", "f1 score"]
runs = []

## Modelo 1 - Decision Tree de sklearn

In [17]:
lemon_tree = DecisionTreeClassifier(random_state=21) # Arbol por defecto, sin hyperparameter tuning

In [18]:
lemon_tree.fit(X_train_m1, y_train_m1)
accuracy_score(y_train_m1, lemon_tree.predict(X_train_m1))

0.9543057996485061

Funcion de entrenamiento automatico

In [54]:
def tree_trainer(X_train, y_train, X_test, y_test, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None):
    name = f"decisiontree_crit={criterion}_maxdepth={max_depth}_minsamplessplit={min_samples_split}_minsamplesleaf={min_samples_leaf}_maxfeatures={max_features}"
    tree = DecisionTreeClassifier(random_state=21, 
                                  criterion=criterion, 
                                  max_depth=max_depth, 
                                  min_samples_split=min_samples_split, 
                                  min_samples_leaf=min_samples_leaf,
                                  max_features=max_features)
    tree.fit(X_train, y_train)
    train_accuracy = accuracy_score(y_train, tree.predict(X_train))
    test_accuracy = accuracy_score(y_test, tree.predict(X_test))
    return [name, train_accuracy, test_accuracy, *precision_recall_fscore_support(y_train, tree.predict(X_train), average="binary")[0:3]]

Pruebas de parametros

In [55]:
criterion = ["gini", "entropy"]
max_depth = [None, 2, 3]
min_samples_split = [2, 10, 15]
min_samples_leaf = [1, 10, 20]
max_features = [None, "auto", "sqrt", "log2", 3]

tree_runs = []

for criterion, max_depth, min_samples_split, min_samples_leaf, max_features in list(itertools.product(criterion, max_depth, min_samples_split, min_samples_leaf, max_features)):
    tree_runs.append(tree_trainer(X_train_m1, y_train_m1, X_val, y_val, criterion, max_depth, min_samples_split, min_samples_leaf, max_features))

tree_runs

[['decisiontree_crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=1_maxfeatures=None',
  0.9543057996485061,
  0.6363636363636364,
  0.9411764705882353,
  0.9491525423728814,
  0.9451476793248945],
 ['decisiontree_crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=1_maxfeatures=auto',
  0.9543057996485061,
  0.6643356643356644,
  0.9411764705882353,
  0.9491525423728814,
  0.9451476793248945],
 ['decisiontree_crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=1_maxfeatures=sqrt',
  0.9543057996485061,
  0.6643356643356644,
  0.9411764705882353,
  0.9491525423728814,
  0.9451476793248945],
 ['decisiontree_crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=1_maxfeatures=log2',
  0.9543057996485061,
  0.6643356643356644,
  0.9411764705882353,
  0.9491525423728814,
  0.9451476793248945],
 ['decisiontree_crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=1_maxfeatures=3',
  0.9543057996485061,
  0.6503496503496503,
  0.9411764705882353,
  0.9491525423728814,
 

## Modelo 2 - SVM

In [21]:
support_vector_classifier = SVC(random_state = 27) #Clasificador por defecto
support_vector_classifier.fit(X_train_m2, y_train_m2)
accuracy_score(y_train_m2, support_vector_classifier.predict(X_train_m2))

0.8646748681898067

Funcion de entrenamiento automatico

In [56]:
def svc_trainer(X_train, y_train, X_test, y_test, C = 1, kernel = "rbf", shrinking = True):
    name = f"svc_C={C}_kernel={kernel}_shrinking={shrinking}"
    svc = SVC(random_state = 27,
             C = C,
             kernel = kernel,
             shrinking = shrinking)
    svc.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, svc.predict(X_train))
    test_acc = accuracy_score(y_test, svc.predict(X_test))
    
    return [name, train_acc, test_acc, *precision_recall_fscore_support(y_train, svc.predict(X_train), average="binary")[0:3]]

In [57]:
C = [0.1, 0.5, 1, 2, 5, 10]
kernel = ["linear", "poly", "rbf", "sigmoid"]
shrinking = [True, False]

svc_runs = []

for C, kernel, shrinking in list(itertools.product(C, kernel, shrinking)):
    svc_runs.append(svc_trainer(X_train_m2, y_train_m2, X_val, y_val, C, kernel, shrinking))
    
svc_runs

[['svc_C=0.1_kernel=linear_shrinking=True',
  0.8224956063268892,
  0.7902097902097902,
  0.75,
  0.7230769230769231,
  0.7362924281984333],
 ['svc_C=0.1_kernel=linear_shrinking=False',
  0.8224956063268892,
  0.7902097902097902,
  0.75,
  0.7230769230769231,
  0.7362924281984333],
 ['svc_C=0.1_kernel=poly_shrinking=True',
  0.7012302284710018,
  0.6293706293706294,
  0.9629629629629629,
  0.13333333333333333,
  0.2342342342342342],
 ['svc_C=0.1_kernel=poly_shrinking=False',
  0.7012302284710018,
  0.6293706293706294,
  0.9629629629629629,
  0.13333333333333333,
  0.2342342342342342],
 ['svc_C=0.1_kernel=rbf_shrinking=True',
  0.8400702987697716,
  0.7762237762237763,
  0.8132530120481928,
  0.6923076923076923,
  0.7479224376731302],
 ['svc_C=0.1_kernel=rbf_shrinking=False',
  0.8400702987697716,
  0.7762237762237763,
  0.8132530120481928,
  0.6923076923076923,
  0.7479224376731302],
 ['svc_C=0.1_kernel=sigmoid_shrinking=True',
  0.7082601054481547,
  0.7132867132867133,
  0.6218487394

## Modelo 3 - Naive Bayes

(Notar que este modelo es solo para este dataset)

In [24]:
chance_one = np.where(y_train_m3==1)[0].size/y_train_m3.size
chance_one

0.40421792618629176

In [25]:
num_ones = np.where(y_train_m3==1)[0].size
num_ones

230

In [26]:
num_zeroes = np.where(y_train_m3==0)[0].size
num_zeroes

339

In [27]:
chance_zero = np.where(y_train_m3==0)[0].size/y_train_m3.size
chance_zero

0.5957820738137083

In [28]:
#Definiendo un set con X y y para simplificar calculos
naive_set = np.hstack((X_train_m3, y_train_m3.reshape(-1,1)))
naive_set.shape

(569, 8)

In [29]:
probs = {}

In [30]:
#Variables continuas
for i in range(0,3):
    mean_yes = np.mean(naive_set[np.where(naive_set[:,7] == 1),i])
    sd_yes = np.std(naive_set[np.where(naive_set[:,7] == 1),i])
    
    mean_no = np.mean(naive_set[np.where(naive_set[:,7] == 0),i])
    sd_no = np.std(naive_set[np.where(naive_set[:,7] == 0),i])
    
    probs[i] = ((mean_yes, sd_yes), (mean_no, sd_no))
    
probs    

{0: ((-0.1674172039048489, 1.0318777124611627),
  (0.10683096211757095, 0.975304325221767)),
 1: ((-0.13922442336327862, 0.46951537904505225),
  (0.07414036479827246, 1.2181524879724193)),
 2: ((0.16316904970948762, 0.9939223418379433),
  (0.027974112948938274, 1.1070459045930954))}

In [31]:
#Variables categoricas
for i in range(3, 7):
    chance_yes = np.where((naive_set[:,i] == 1) & (naive_set[:,7] == 1))[0].size/num_ones
    chance_no = np.where((naive_set[:,i] == 1) & (naive_set[:,7] == 0))[0].size/num_zeroes
    probs[i] = (chance_yes, chance_no)
    
probs

{0: ((-0.1674172039048489, 1.0318777124611627),
  (0.10683096211757095, 0.975304325221767)),
 1: ((-0.13922442336327862, 0.46951537904505225),
  (0.07414036479827246, 1.2181524879724193)),
 2: ((0.16316904970948762, 0.9939223418379433),
  (0.027974112948938274, 1.1070459045930954)),
 3: (0.2956521739130435, 0.8407079646017699),
 4: (0.30869565217391304, 0.6784660766961652),
 5: (0.2608695652173913, 0.16224188790560473),
 6: (0.43043478260869567, 0.1592920353982301)}

In [32]:
# se añaden las prior probabilities al dict de probs
probs["chance_one"] = chance_one
probs["chance_zero"] = chance_zero
probs

{0: ((-0.1674172039048489, 1.0318777124611627),
  (0.10683096211757095, 0.975304325221767)),
 1: ((-0.13922442336327862, 0.46951537904505225),
  (0.07414036479827246, 1.2181524879724193)),
 2: ((0.16316904970948762, 0.9939223418379433),
  (0.027974112948938274, 1.1070459045930954)),
 3: (0.2956521739130435, 0.8407079646017699),
 4: (0.30869565217391304, 0.6784660766961652),
 5: (0.2608695652173913, 0.16224188790560473),
 6: (0.43043478260869567, 0.1592920353982301),
 'chance_one': 0.40421792618629176,
 'chance_zero': 0.5957820738137083}

Se define una funcion que puede predecir clases, recibiendo un diccionario parecido a probs

In [64]:
def naive_predict(X, probs):
    #Se construiran dos ndarrays de una columa para cada feature con un total de filas igual a la cantidad de filas en X
    #Estos ndarrays seran las probabilidades de P(xi|y)
    #Los ndarrays se combinaran al final dos diferentes ndarrays para hacer la multiplicacion por filas
    
    f0y = lambda x: norm.pdf(x, probs[0][0][0], probs[0][0][1])
    feat_0_yes = f0y(X[:, 0]).reshape(-1,1)
    
    f0n = lambda x: norm.pdf(x, probs[0][1][0], probs[0][1][1])
    feat_0_no = f0y(X[:, 0]).reshape(-1,1)
    
    f1y = lambda x: norm.pdf(x, probs[1][0][0], probs[1][0][1])
    feat_1_yes = f1y(X[:, 1]).reshape(-1,1)
    
    f1n = lambda x: norm.pdf(x, probs[1][1][0], probs[1][1][1])
    feat_1_no = f0y(X[:, 1]).reshape(-1,1)
    
    f2y = lambda x: norm.pdf(x, probs[2][0][0], probs[2][0][1])
    feat_2_yes = f0y(X[:, 2]).reshape(-1,1)
    
    f2n = lambda x: norm.pdf(x, probs[2][1][0], probs[2][1][1])
    feat_2_no = f0y(X[:, 2]).reshape(-1,1)
    
    feat_3_yes = np.copy(X[:, 3])
    feat_3_yes[feat_3_yes == 1] = feat_3_yes[feat_3_yes == 1]*probs[3][0]
    feat_3_yes[feat_3_yes == 0] = 1
    feat_3_yes = feat_3_yes.reshape(-1,1)
    
    feat_3_no = np.copy(X[:, 3])
    feat_3_no[feat_3_no == 1] = feat_3_no[feat_3_no == 1]*probs[3][1]
    feat_3_no[feat_3_no == 0] = 1
    feat_3_no = feat_3_no.reshape(-1,1)
    
    feat_4_yes = np.copy(X[:, 4])
    feat_4_yes[feat_4_yes == 1] = feat_4_yes[feat_4_yes == 1]*probs[4][0]
    feat_4_yes[feat_4_yes == 0] = 1
    feat_4_yes = feat_4_yes.reshape(-1,1)
    
    feat_4_no = np.copy(X[:, 4])
    feat_4_no[feat_4_no == 1] = feat_4_no[feat_4_no == 1]*probs[4][1]
    feat_4_no[feat_4_no == 0] = 1
    feat_4_no = feat_4_no.reshape(-1,1)
    
    feat_5_yes = np.copy(X[:, 5])
    feat_5_yes[feat_5_yes == 1] = feat_5_yes[feat_5_yes == 1]*probs[5][0]
    feat_5_yes[feat_5_yes == 0] = 1
    feat_5_yes = feat_5_yes.reshape(-1,1)
    
    feat_5_no = np.copy(X[:, 5])
    feat_5_no[feat_5_no == 1] = feat_5_no[feat_5_no == 1]*probs[5][1]
    feat_5_no[feat_5_no == 0] = 1
    feat_5_no = feat_5_no.reshape(-1,1)
    
    feat_6_yes = np.copy(X[:, 6])
    feat_6_yes[feat_6_yes == 1] = feat_6_yes[feat_6_yes == 1]*probs[6][0]
    feat_6_yes[feat_6_yes == 0] = 1
    feat_6_yes = feat_6_yes.reshape(-1,1)
    
    feat_6_no = np.copy(X[:, 6])
    feat_6_no[feat_6_no == 1] = feat_6_no[feat_6_no == 1]*probs[6][1]
    feat_6_no[feat_6_no == 0] = 1
    feat_6_no = feat_6_no.reshape(-1,1)
    
    
    #Creando matrices de probabilidades
    mat_yes = np.hstack( (feat_0_yes,
                         feat_1_yes,
                         feat_2_yes,
                         feat_3_yes,
                         feat_4_yes,
                         feat_5_yes,
                         feat_6_yes,
                         np.ones_like(feat_0_yes)*probs["chance_one"]) )
    
    mat_no = np.hstack( (feat_0_no,
                         feat_1_no,
                         feat_2_no,
                         feat_3_no,
                         feat_4_no,
                         feat_5_no,
                         feat_6_no,
                         np.ones_like(feat_0_yes)*probs["chance_zero"]) )

    #Calculando probabilidades ingenuas
    yes_probs = np.prod(mat_yes, axis=1).reshape(-1,1)
    
    no_probs = np.prod(mat_no, axis=1).reshape(-1,1)
    
    classwise_probs = np.hstack((no_probs, yes_probs))
    
    #Realizando predicciones
    preds = np.argmax(classwise_probs, axis=1)
    
    
    return preds

In [65]:
accuracy_score(y_train_m3, naive_predict(X_train_m3, probs))

0.7820738137082601

In [66]:
name = "naivebayes_"
naive_preds = naive_predict(X_train_m3, probs)
train_acc = accuracy_score(y_train_m3, naive_preds)
val_acc = accuracy_score(y_val, naive_predict(X_val, probs))
nbr = [name, train_acc, val_acc, *precision_recall_fscore_support(y_train_m3, naive_preds, average="binary")[0:3]]
nbr

['naivebayes_',
 0.7820738137082601,
 0.7412587412587412,
 0.9416666666666667,
 0.49130434782608695,
 0.6457142857142857]

## Modelo 4 - Regresion logistica

In [30]:
w = None

In [31]:
def h(X):
    global w
    return tf.nn.sigmoid(tf.matmul(X, w), name="sigmoid_calc")

In [32]:
def cross_entropy(X, y, isL2 = True, alpha = 0.5):
    global w
    logits = tf.matmul(X, w, name="logits_calculation")
    y = tf.reshape(y, (-1,1))
    
    err = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits), name="error_calc")
    
    reg = alpha*tf.math.reduce_sum(tf.math.square(w), name="l2_reg") if isL2 else alpha*tf.math.reduce_sum(tf.math.abs(w), name="l1_reg")
    
    return err + reg

In [149]:
@tf.function
def log_reg(X, y, isL2 = True, alpha = 0.5, epochs = 1000, lr = 0.001, batch_size = 32):
    global w
    if w is None:
        w = tf.Variable(tf.ones(shape=(X.shape[1]+1, 1), dtype=tf.dtypes.double), name="weights")
        
    X = tf.concat([tf.ones(shape=(X.shape[0], 1), dtype=X.dtype), X], axis=1)
        
    iterations = int(y.shape[0]/batch_size)
    step = 0
    for epoch in range(epochs):
        for i in range(iterations): 
            with tf.name_scope("batch_creation"):
                batch_start = i*batch_size
                batch_end = batch_start + batch_size
                X_b = X[batch_start:batch_end]
                y_b = y[batch_start:batch_end]
                
            y_probs = h(X_b)
                
            with tf.name_scope("gradient_calc"):
                with tf.GradientTape() as grad_tape:
                    grad_tape.watch(w)
                    error = cross_entropy(X_b, y_b, isL2, alpha)
                    
            with tf.name_scope("parameter_updating"):
                grads_w = grad_tape.gradient(error,w)
                w.assign_sub(grads_w*lr)
            
            step += 1
            tf.summary.scalar("Error", error, step)
    return w

Creando grafo

In [150]:
#Creando grafo
logdir = f'logs\\logreg_isL2={True}_alpha={0.5}_epochs=1_lr=0.001_batch_size={y_train_m4.shape[0]}'
writer = tf.summary.create_file_writer(logdir)

w = None
with writer.as_default():
    tf.summary.trace_on(graph=True, profiler=False)
    final_w = log_reg(X_train_m4, y_train_m4.astype(float), epochs = 1, batch_size = y_train_m4.shape[0])
    tf.summary.trace_export(
                          name="model_graph",
                          step=0,
                          profiler_outdir="logs\\modelgraph")
writer.flush()

final_w

Instructions for updating:
use `tf.profiler.experimental.stop` instead.


<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, numpy=
array([[0.9985155 ],
       [0.99890204],
       [0.9989711 ],
       [0.99888158],
       [0.99857457],
       [0.9986469 ],
       [0.99894097],
       [0.99892764]])>

![alt text](Graph.PNG "Grafo")

In [177]:
tf.config.run_functions_eagerly(True)

isL2 = [True, False]
alpha = [0, 0.1, 1]
epochs = [100,500]
lr = [0.1, 0.000001, 0.01]
batch_size = [32, 64]

reglog_runs = []

for isL2, alpha, epochs, lr, batch_size in list(itertools.product(isL2, alpha, epochs, lr, batch_size)):
    name = f'logreg_isL2={isL2}_alpha={alpha}_epochs={epochs}_lr={lr}_batch_size={batch_size}'
    writer = tf.summary.create_file_writer("logs\\" + name)
    w = None
    with writer.as_default():
        final_w = log_reg(X_train_m4, y_train_m4.astype(float), isL2, alpha, epochs, lr, batch_size)
        preds_train = tf.math.round(h( np.hstack( (np.ones(shape = (X_train_m4.shape[0], 1)), X_train_m4 ) )))
        preds_val = tf.math.round(h( np.hstack( (np.ones(shape = (X_val.shape[0], 1)), X_val) ) ))
        train_acc = accuracy_score(y_train_m4, preds_train)
        val_acc = accuracy_score(y_val, preds_val)
        reglog_runs.append([name, train_acc, val_acc, *precision_recall_fscore_support(y_train_m4, preds_train, average="binary")[0:3]])
    writer.flush()
    print(final_w)

<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, numpy=
array([[ 0.44011291],
       [-0.73659197],
       [-0.24145794],
       [-0.27267565],
       [-2.47473452],
       [-0.58600024],
       [ 0.93483337],
       [ 2.09127977]])>
<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, numpy=
array([[ 0.31857896],
       [-0.7090746 ],
       [-0.21702997],
       [-0.19375875],
       [-2.14039522],
       [-0.53816851],
       [ 0.88117156],
       [ 1.97557591]])>
<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, numpy=
array([[0.99917968],
       [0.99983589],
       [0.99995523],
       [0.99980289],
       [0.99928064],
       [0.99940244],
       [0.99989914],
       [0.9998781 ]])>
<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, numpy=
array([[0.9996232 ],
       [0.99992834],
       [0.99998651],
       [0.99993317],
       [0.99966654],
       [0.9997243 ],
       [0.99995348],
       [0.99994542]])>
<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, numpy=
arra

<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, numpy=
array([[-0.03567735],
       [-0.02535973],
       [-0.00275097],
       [-0.00404254],
       [-0.07447545],
       [-0.06305674],
       [ 0.0062664 ],
       [ 0.02111299]])>
<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, numpy=
array([[ 0.44011291],
       [-0.73659197],
       [-0.24145794],
       [-0.27267565],
       [-2.47473452],
       [-0.58600024],
       [ 0.93483337],
       [ 2.09127977]])>
<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, numpy=
array([[ 0.31857896],
       [-0.7090746 ],
       [-0.21702997],
       [-0.19375875],
       [-2.14039522],
       [-0.53816851],
       [ 0.88117156],
       [ 1.97557591]])>
<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, numpy=
array([[0.99917968],
       [0.99983589],
       [0.99995523],
       [0.99980289],
       [0.99928064],
       [0.99940244],
       [0.99989914],
       [0.9998781 ]])>
<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, num

<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, numpy=
array([[0.99411774],
       [0.99564134],
       [0.99593211],
       [0.99566547],
       [0.99433384],
       [0.99462254],
       [0.99576771],
       [0.99572749]])>
<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, numpy=
array([[-0.00161898],
       [-0.01007603],
       [ 0.00505601],
       [ 0.00696709],
       [ 0.00713036],
       [-0.00112895],
       [-0.0024574 ],
       [ 0.00196737]])>
<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, numpy=
array([[ 0.00543725],
       [-0.00162279],
       [-0.00068615],
       [ 0.00557035],
       [-0.00985983],
       [-0.00730491],
       [ 0.00090746],
       [-0.00816531]])>


In [178]:
reglog_runs

[['logreg_isL2=True_alpha=0_epochs=100_lr=0.1_batch_size=32',
  0.7820738137082601,
  0.7972027972027972,
  0.7333333333333333,
  0.6936936936936937,
  0.7129629629629628],
 ['logreg_isL2=True_alpha=0_epochs=100_lr=0.1_batch_size=64',
  0.8031634446397188,
  0.8041958041958042,
  0.7546296296296297,
  0.7342342342342343,
  0.7442922374429225],
 ['logreg_isL2=True_alpha=0_epochs=100_lr=1e-06_batch_size=32',
  0.37258347978910367,
  0.4125874125874126,
  0.37924865831842575,
  0.954954954954955,
  0.5428937259923174],
 ['logreg_isL2=True_alpha=0_epochs=100_lr=1e-06_batch_size=64',
  0.37258347978910367,
  0.4125874125874126,
  0.37924865831842575,
  0.954954954954955,
  0.5428937259923174],
 ['logreg_isL2=True_alpha=0_epochs=100_lr=0.01_batch_size=32',
  0.7926186291739895,
  0.7972027972027972,
  0.9126984126984127,
  0.5180180180180181,
  0.6609195402298851],
 ['logreg_isL2=True_alpha=0_epochs=100_lr=0.01_batch_size=64',
  0.7135325131810193,
  0.7202797202797203,
  0.672514619883041,


![alt text](error.PNG "Errores")

## Encontrando los mejores candidatos para cada tipo de modelo

In [179]:
runs = [*tree_runs, *svc_runs, nbr, *reglog_runs]
runs_dict = {}
for i, run in enumerate(runs):
    runs_dict[i] = [run[0][:run[0].index("_")],
                   run[0][run[0].index("_")+1:],
                   run[1],
                   run[2],
                   run[3],
                   run[4],
                   run[5]]
    
results = pd.DataFrame.from_dict(runs_dict).T.rename(columns={0:col_names[0],
                                                              1:col_names[1],
                                                              2:col_names[2],
                                                              3:col_names[3],
                                                              4:col_names[4],
                                                              5:col_names[5],
                                                              6:col_names[6]})
results

Unnamed: 0,name,model_params,train accuracy,val accuracy,precision,recall,f1 score
0,decisiontree,crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=1_maxfeatures=None,0.954306,0.636364,0.941176,0.949153,0.945148
1,decisiontree,crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=1_maxfeatures=auto,0.954306,0.664336,0.941176,0.949153,0.945148
2,decisiontree,crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=1_maxfeatures=sqrt,0.954306,0.664336,0.941176,0.949153,0.945148
3,decisiontree,crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=1_maxfeatures=log2,0.954306,0.664336,0.941176,0.949153,0.945148
4,decisiontree,crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=1_maxfeatures=3,0.954306,0.65035,0.941176,0.949153,0.945148
5,decisiontree,crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=10_maxfeatures=None,0.86819,0.72028,0.851528,0.826271,0.83871
6,decisiontree,crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=10_maxfeatures=auto,0.804921,0.818182,0.861272,0.631356,0.728606
7,decisiontree,crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=10_maxfeatures=sqrt,0.804921,0.818182,0.861272,0.631356,0.728606
8,decisiontree,crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=10_maxfeatures=log2,0.804921,0.818182,0.861272,0.631356,0.728606
9,decisiontree,crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=10_maxfeatures=3,0.84007,0.776224,0.860697,0.733051,0.791762


In [180]:
#Creando csv
results.to_csv("results.csv", index=False)

In [181]:
pd.options.display.max_rows = 427
results.sort_values(["name", "val accuracy"], ascending=[False, False])

Unnamed: 0,name,model_params,train accuracy,val accuracy,precision,recall,f1 score
280,svc,C=0.5_kernel=poly_shrinking=True,0.866432,0.804196,0.825137,0.774359,0.798942
281,svc,C=0.5_kernel=poly_shrinking=False,0.866432,0.804196,0.825137,0.774359,0.798942
288,svc,C=1_kernel=poly_shrinking=True,0.866432,0.804196,0.825137,0.774359,0.798942
289,svc,C=1_kernel=poly_shrinking=False,0.866432,0.804196,0.825137,0.774359,0.798942
296,svc,C=2_kernel=poly_shrinking=True,0.864675,0.804196,0.820652,0.774359,0.796834
297,svc,C=2_kernel=poly_shrinking=False,0.864675,0.804196,0.820652,0.774359,0.796834
304,svc,C=5_kernel=poly_shrinking=True,0.866432,0.804196,0.825137,0.774359,0.798942
305,svc,C=5_kernel=poly_shrinking=False,0.866432,0.804196,0.825137,0.774359,0.798942
312,svc,C=10_kernel=poly_shrinking=True,0.873462,0.797203,0.843575,0.774359,0.807487
313,svc,C=10_kernel=poly_shrinking=False,0.873462,0.797203,0.843575,0.774359,0.807487


In [183]:
# Estos son los mejores parametros para cada tipo de modelo segun la metrica de accuracy sobre el set de validacion
pd.options.display.max_colwidth = 100
results.loc[[280, 318, 320, 6]]

Unnamed: 0,name,model_params,train accuracy,val accuracy,precision,recall,f1 score
280,svc,C=0.5_kernel=poly_shrinking=True,0.866432,0.804196,0.825137,0.774359,0.798942
318,naivebayes,,0.782074,0.741259,0.941667,0.491304,0.645714
320,logreg,isL2=True_alpha=0_epochs=100_lr=0.1_batch_size=64,0.803163,0.804196,0.75463,0.734234,0.744292
6,decisiontree,crit=gini_maxdepth=None_minsamplessplit=2_minsamplesleaf=10_maxfeatures=auto,0.804921,0.818182,0.861272,0.631356,0.728606


# Probando modelos contra el set de test

In [184]:
svc = SVC(random_state = 27,
             C = 0.5,
             kernel = "poly",
             shrinking = True)
svc.fit(X_pre_train, y_pre_train)
accuracy_score(y_test, svc.predict(X_test))

0.8435754189944135

In [185]:
tree = DecisionTreeClassifier(random_state=21, 
                              criterion="gini", 
                              max_depth=None, 
                              min_samples_split=2, 
                              min_samples_leaf=10,
                              max_features="auto")
tree.fit(X_pre_train, y_pre_train)
accuracy_score(y_test, tree.predict(X_test))

0.8491620111731844

In [186]:
def nb_trainer(X, y):
    chance_one = np.where(y==1)[0].size/y.size

    num_ones = np.where(y==1)[0].size

    num_zeroes = np.where(y==0)[0].size

    chance_zero = np.where(y==0)[0].size/y.size

    #Definiendo un set con X y y para simplificar calculos
    naive_set = np.hstack((X, y.reshape(-1,1)))

    probs = {}

    #Variables continuas
    for i in range(0,3):
        mean_yes = np.mean(naive_set[np.where(naive_set[:,7] == 1),i])
        sd_yes = np.std(naive_set[np.where(naive_set[:,7] == 1),i])

        mean_no = np.mean(naive_set[np.where(naive_set[:,7] == 0),i])
        sd_no = np.std(naive_set[np.where(naive_set[:,7] == 0),i])

        probs[i] = ((mean_yes, sd_yes), (mean_no, sd_no))

    #Variables categoricas
    for i in range(3, 7):
        chance_yes = np.where((naive_set[:,i] == 1) & (naive_set[:,7] == 1))[0].size/num_ones
        chance_no = np.where((naive_set[:,i] == 1) & (naive_set[:,7] == 0))[0].size/num_zeroes
        probs[i] = (chance_yes, chance_no)

    # se añaden las prior probabilities al dict de probs
    probs["chance_one"] = chance_one
    probs["chance_zero"] = chance_zero
    
    return probs

In [187]:
probs = nb_trainer(X_pre_train, y_pre_train)
accuracy_score(y_test, naive_predict(X_test, probs))

0.7318435754189944

In [188]:
w = None
test_w = log_reg(X_pre_train, y_pre_train.astype(float), isL2=True, alpha=0, epochs=100, lr=0.1, batch_size=64)
accuracy_score(y_test, tf.math.round(h( np.hstack( (np.ones(shape = (X_test.shape[0], 1)), X_test) ) )))

0.7932960893854749

# Creando dataset de predicciones

In [189]:
svc_preds = svc.predict(X).reshape(-1,1)
tree_preds = tree.predict(X).reshape(-1,1)
naive_preds = naive_predict(X, probs).reshape(-1,1)
logreg_preds = tf.math.round(h( np.hstack( (np.ones(shape = (X.shape[0], 1)), X) ) )).numpy()
simple_preds = np.hstack((svc_preds,
                         tree_preds,
                         naive_preds,
                         logreg_preds))

bag_preds =  mode(simple_preds, axis=1).mode

all_preds = np.hstack((simple_preds, bag_preds))

all_preds_pd = pd.DataFrame(np.hstack((all_preds, y.reshape(-1,1)))).rename(columns={0:"SVC",
                                                                                     1:"Decision Tree",
                                                                                     2:"Naive Bayes",
                                                                                     3:"Logistic Regression",
                                                                                     4:"Ensemble",
                                                                                     5:"True y"})
all_preds_pd

Unnamed: 0,SVC,Decision Tree,Naive Bayes,Logistic Regression,Ensemble,True y
0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,0.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
886,0.0,0.0,0.0,0.0,0.0,0.0
887,1.0,1.0,1.0,1.0,1.0,1.0
888,0.0,0.0,0.0,0.0,0.0,0.0
889,0.0,0.0,1.0,1.0,0.0,1.0


In [190]:
bag_metrics =  pd.DataFrame([accuracy_score(y, bag_preds), *precision_recall_fscore_support(y, bag_preds, average="binary")[0:3]]).T
bag_metrics.rename(columns = {0:"accuracy",
                             1:"precision",
                             2:"recall",
                             3:"f1 score"})

Unnamed: 0,accuracy,precision,recall,f1 score
0,0.821549,0.840149,0.660819,0.739771


# Entrenando modelos finales

In [192]:
svc = SVC(random_state = 27,
             C = 0.5,
             kernel = "poly",
             shrinking = True)
svc.fit(X, y)

SVC(C=0.5, kernel='poly', random_state=27)

In [193]:
tree = DecisionTreeClassifier(random_state=21, 
                              criterion="gini", 
                              max_depth=None, 
                              min_samples_split=2, 
                              min_samples_leaf=10,
                              max_features="auto")
tree.fit(X, y)

DecisionTreeClassifier(max_features='auto', min_samples_leaf=10,
                       random_state=21)

In [194]:
probs = nb_trainer(X, y)
probs

{0: ((-0.08526655108802476, 1.057338157967127),
  (0.053116867890900636, 0.9587302790672132)),
 1: ((-0.044753245862730474, 0.642078739199201),
  (0.02787907119317641, 1.1679490620740025)),
 2: ((0.10342362630270571, 0.9565273107089884),
  (-0.06442783277873461, 1.02086781226304)),
 3: (0.31871345029239767, 0.8524590163934426),
 4: (0.347953216374269, 0.6775956284153005),
 5: (0.2543859649122807, 0.1766848816029144),
 6: (0.39766081871345027, 0.14571948998178508),
 'chance_one': 0.3838383838383838,
 'chance_zero': 0.6161616161616161}

In [195]:
w = None
final_w = log_reg(X, y.astype(float), isL2=True, alpha=0, epochs=100, lr=0.1, batch_size=64)
final_w

<tf.Variable 'weights:0' shape=(8, 1) dtype=float64, numpy=
array([[ 0.55262765],
       [-0.49399979],
       [-0.343242  ],
       [-0.04680365],
       [-2.60229369],
       [-0.25892477],
       [ 0.89438302],
       [ 1.9171694 ]])>

# Conclusiones
1. Al usar un ensemble podemos cubrir las fortalezas y debilidades de un modelo singular, ayudando a mejorar scores como accuracy, precision y recall de tal manera que, a pesar de que potencialmente el score del ensemble sea menor que un modelo particular, las predicciones terminan siendo menos biased en general
1. Bagging es una estrategia para estimar un parametro de una poblacion utilizando submuestras de una muestra de la poblacion, ayudandonos a obtener una estimacion mas adecuada a la real. En cuanto a ML, tambien nos ayuda a poder entrenar diferentes modelos puesto que la logica sigue siendo la misma, solo que en lugar de estimar un parametro de una poblacion, se trata de realizar una prediccion, pero en terminos funcionales la logica es la misma.
1. Deployment de los modelos finales es un paso sumamente importante ya que de nada sirve un modelo si nadie lo usa, y nadie puede usar un modelo que no ha sido habilitado o desplegado

# Exportando los modelos

In [196]:
# En la realidad seria prudente definir una funcion de preprocesamiento que permita reprocesar data nueva de la misma
# y exacta manera en la que se proceso la data de entrenamiento, pero para fines practicos en este ejercicio se exportara
# "X" y "y" ya procesados
joblib.dump(X, "X.joblib")
joblib.dump(y, "y.joblib")

joblib.dump(svc, "svc.joblib")
joblib.dump(tree, "tree.joblib")
joblib.dump(final_w.numpy(), "w.joblib")
with open("probs.json", "w") as json_file:
    json.dump(probs, json_file)    

# Investigaciones

## Exportar y cargar modelos de Tensorflow

A nivel bajo (i.e. TensorFlow puro, sin Keras o alguna otra API) TensorFlow cuenta con un sistema para poder guardar (y compartir) modelos, permitiendo almacenar (y cargar) le enteridad del modelo incluyendo todos sus parametros calculados sin necesidad de tener que acceder o ejecuta el codigo original que lo creo. Esto se hace a traves del formato "Saved Model" incluido con TensorFlow.

(https://www.tensorflow.org/guide/saved_model)

Por otra parte, TensorFlow tambien cuenta con una herramienta que permite almacenar y cargar los pesos de un modelo (especificamente, las tf.Variable que maneja el modelo), de tal manera que unicamente representan un Tensor al ser cargados y no sirven para mucho a menos que se tenga el codigo fuente que utilice estos pesos para realizar calculos de predicciones. Estos son conocidos como Checkpoints.

En el caso de los Saved Models, guardar un modelo es tan sencillo como ejecutar la siguiente linea de codigo:

tf.saved_model.save(model, path_to_dir)

Mientras que para cargar modelos se puede utilizar esta otra linea:

model = tf.saved_model.load(path_to_dir)

El uso de un checkpoint es un poco mas complicado ya que puede guardar una serie de atributos, la o las tf.Variable y los elementos de las cuales dependen dichas tf.Variable. Ademas, existen dos maneras de almacenar, y de cargar, los checkpoints.

Las tf.Variable pueden ser almacenadas usando tf.Checkpoint.write(file_prefix, options) o bien tf.Checkpoint.save(file_prefix, options).

La diferencia entre ambos es que el metodo save() guarda, ademas de las variables, contadores de checkpoint y almacena metadata, mientras que write() no almacena esta informacion.

Para cargar la informacion almacenada nuevamente, se puede utilizar tf.Checkpoint.read(save_path, options) para cargar datos creados con write(), o tf.Checkpoint.restore(save_path, options) para cargar datos creados con save()

## K-fold cross validation

Esta metodologia consiste en tomar un dataset de entrenamiento y separarlos en k distintas muestras. Para cada una de estas muestras, $k_{i}$, se entrena el modelo usando las k-1 muestras restantes y se utiliza $k_{i}$ como set de validacion.

Esta tecnica puede utilizarse para determinar los pesos de un modelo, promediando los resultados de que el entrenamiento da con cada fold.

Sin embargo, esta tecnica tambien tiene sus complicaciones. En primer lugar, se debe seleccionar un valor de k que sea congruente con el tamaño del dataset de entrenamiento, por ejemplo si existen 10000 observaciones un k de 10 podria ser bastante bueno puesto que cada modelo se entrenara con 9000 ejemplos, pero si solo existen 100 observaciones, un k de 4 significa que solamente se entrenara un modelo con 75 ejemplos, lo cual tiene un mayor riesgo de no estar balanceado o de no representar a la poblacion. Por otra parte, k-fold cross validations implica que se entrenaran k modelos en base a la data, lo cual puede ser prohibitivamente costoso cuando el tiempo es critico y el proceso de entrenamiento del modelo seleccionado es largo o tardado.