In [36]:
pip install numpy pandas matplotlib seaborn scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split


# Dataset

In [38]:
df  = pd.read_csv('data_titanic_proyecto.csv')
df.head()

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,passenger_class,passenger_sex,passenger_survived
0,1,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,Lower,M,N
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,Upper,F,Y
2,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,Lower,F,Y
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,Upper,F,Y
4,5,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,Lower,M,N


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   PassengerId         891 non-null    int64  
 1   Name                891 non-null    object 
 2   Age                 714 non-null    float64
 3   SibSp               891 non-null    int64  
 4   Parch               891 non-null    int64  
 5   Ticket              891 non-null    object 
 6   Fare                891 non-null    float64
 7   Cabin               204 non-null    object 
 8   Embarked            889 non-null    object 
 9   passenger_class     891 non-null    object 
 10  passenger_sex       891 non-null    object 
 11  passenger_survived  891 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 83.7+ KB


In [40]:
df.describe()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare
count,891.0,714.0,891.0,891.0,891.0
mean,446.0,29.699118,0.523008,0.381594,32.204208
std,257.353842,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,223.5,20.125,0.0,0.0,7.9104
50%,446.0,28.0,0.0,0.0,14.4542
75%,668.5,38.0,1.0,0.0,31.0
max,891.0,80.0,8.0,6.0,512.3292


In [41]:
df[df["Age"] < 1] = df[df["Age"] < 1] * 10

In [42]:

# Alternativa usando str.extract en lugar de apply + función
df['TicketPrefix_alt'] = df['Ticket'].str.extract(r'([A-Za-z./]+)', expand=False).str.strip().fillna("NoPrefix")
df['TicketPrefix_alt'] = df['TicketPrefix_alt'].str.replace(r'\W+', '_', regex=True).str.strip('_')
df.drop(columns=['Ticket'], inplace=True)

In [43]:
for col, val in df.isna().sum().items():
    if val > 0:
        print(f"Columna {col} tiene {val} valores faltantes. ({val/len(df)*100:.2f}%)")

Columna Age tiene 177 valores faltantes. (19.87%)
Columna Cabin tiene 687 valores faltantes. (77.10%)
Columna Embarked tiene 2 valores faltantes. (0.22%)


In [44]:
from sklearn.impute import KNNImputer

cols_for_knn = [col for col in df.columns if col != 'Cabin']

numeric_cols = df[cols_for_knn].select_dtypes(include=[np.number]).columns

imputer = KNNImputer(n_neighbors=5)
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

df = df.drop(columns=['Cabin', 'Name'])
df = df.dropna(axis=1)

In [45]:
df.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,passenger_class,passenger_sex,passenger_survived,TicketPrefix_alt
0,1.0,22.0,1.0,0.0,7.25,Lower,M,N,A
1,2.0,38.0,1.0,0.0,71.2833,Upper,F,Y,PC
2,3.0,26.0,0.0,0.0,7.925,Lower,F,Y,STON_O
3,4.0,35.0,1.0,0.0,53.1,Upper,F,Y,NoPrefix
4,5.0,35.0,0.0,0.0,8.05,Lower,M,N,NoPrefix


In [46]:
df.describe()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0
mean,485.313131,29.782604,0.593715,0.472503,34.838888
std,569.224167,13.204739,1.536421,1.420586,71.372436
min,1.0,1.0,0.0,0.0,0.0
25%,224.5,21.7,0.0,0.0,7.9104
50%,448.0,29.0,0.0,0.0,14.4542
75%,672.5,36.55,1.0,0.0,31.275
max,8320.0,80.0,20.0,20.0,1515.5


In [47]:
for col, val in df.isna().sum().items():
    if val > 0:
        print(f"Columna {col} tiene {val} valores faltantes. ({val/len(df)*100:.2f}%)")

In [48]:
df.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,passenger_class,passenger_sex,passenger_survived,TicketPrefix_alt
0,1.0,22.0,1.0,0.0,7.25,Lower,M,N,A
1,2.0,38.0,1.0,0.0,71.2833,Upper,F,Y,PC
2,3.0,26.0,0.0,0.0,7.925,Lower,F,Y,STON_O
3,4.0,35.0,1.0,0.0,53.1,Upper,F,Y,NoPrefix
4,5.0,35.0,0.0,0.0,8.05,Lower,M,N,NoPrefix


In [49]:
col_discretas = []
col_categoricas = []
col_continuas = []
for col in df.columns:
    if(df[col].dtype == 'object') or (df[col].dtype == 'category'):
        col_categoricas.append(col)
    else:
        if (df[col].nunique() < 20):
            col_discretas.append(col)
        else:
            col_continuas.append(col)

col_continuas.remove("PassengerId")
col_categoricas.remove("passenger_survived")

In [50]:
df["passenger_sex"] = df["passenger_sex"].str[0]

In [51]:
df.loc[df['passenger_class'].str.contains("Lower", case=False, na=False), 'passenger_class'] = "Lower"
df.loc[df['passenger_class'].str.contains("Middle", case=False, na=False), 'passenger_class'] = "Middle"
df.loc[df['passenger_class'].str.contains("Upper", case=False, na=False), 'passenger_class'] = "Upper"

In [52]:
df['passenger_survived'] = df['passenger_survived'].astype(str).str.strip().str[0].str.upper()
df['passenger_survived'] = df['passenger_survived'].replace({'Y': 1, 'N': 0})

  df['passenger_survived'] = df['passenger_survived'].replace({'Y': 1, 'N': 0})


In [53]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[col_continuas] = scaler.fit_transform(df[col_continuas])

In [54]:
df_encoded = pd.get_dummies(df, columns=col_categoricas, drop_first=False)

In [55]:
df_encoded.drop(columns=['PassengerId'], inplace=True)

In [56]:
df_encoded.head()   

Unnamed: 0,Age,SibSp,Parch,Fare,passenger_survived,passenger_class_Lower,passenger_class_Middle,passenger_class_Upper,passenger_sex_F,passenger_sex_M,...,TicketPrefix_alt_S_C_A,TicketPrefix_alt_S_C_PARIS,TicketPrefix_alt_S_O_C,TicketPrefix_alt_S_O_P,TicketPrefix_alt_S_O_P_P,TicketPrefix_alt_S_P,TicketPrefix_alt_S_W_PP,TicketPrefix_alt_WE_P,TicketPrefix_alt_W_C,TicketPrefix_alt_W_E_P
0,0.265823,1.0,0.0,0.004784,0,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1,0.468354,1.0,0.0,0.047036,1,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
2,0.316456,0.0,0.0,0.005229,1,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,0.43038,1.0,0.0,0.035038,1,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
4,0.43038,0.0,0.0,0.005312,0,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [57]:
df_encoded.columns

Index(['Age', 'SibSp', 'Parch', 'Fare', 'passenger_survived',
       'passenger_class_Lower', 'passenger_class_Middle',
       'passenger_class_Upper', 'passenger_sex_F', 'passenger_sex_M',
       'TicketPrefix_alt_A', 'TicketPrefix_alt_A_S', 'TicketPrefix_alt_C',
       'TicketPrefix_alt_CA', 'TicketPrefix_alt_C_A',
       'TicketPrefix_alt_C_A_SOTON', 'TicketPrefix_alt_F_C',
       'TicketPrefix_alt_F_C_C', 'TicketPrefix_alt_Fa',
       'TicketPrefix_alt_LINE', 'TicketPrefix_alt_NoPrefix',
       'TicketPrefix_alt_PC', 'TicketPrefix_alt_PP', 'TicketPrefix_alt_P_PP',
       'TicketPrefix_alt_SC', 'TicketPrefix_alt_SCO_W',
       'TicketPrefix_alt_SC_AH', 'TicketPrefix_alt_SC_PARIS',
       'TicketPrefix_alt_SC_Paris', 'TicketPrefix_alt_SOTON_O',
       'TicketPrefix_alt_SOTON_OQ', 'TicketPrefix_alt_SOTON_O_Q',
       'TicketPrefix_alt_SO_C', 'TicketPrefix_alt_STON_O',
       'TicketPrefix_alt_SW_PP', 'TicketPrefix_alt_S_C_A',
       'TicketPrefix_alt_S_C_PARIS', 'TicketPrefix_alt_S_O_

# SPLIT

In [58]:
df_train, df_temp = train_test_split(df_encoded, test_size=0.4, random_state=42)
df_validate, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

df_test.to_csv('test.csv', index=False)

print(f"Train shape: {df_train.shape}")
print(f"Validate shape: {df_validate.shape}")
print(f"Test shape: {df_test.shape}")

Train shape: (534, 45)
Validate shape: (178, 45)
Test shape: (179, 45)


# Funciones

## Arbol de Desicion

In [59]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import os

def train_decision_tree(X_train, y_train, X_test, y_test, max_depth, experiment_log="bitacora.csv"):

    model = DecisionTreeClassifier(random_state=42, max_depth=max_depth)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)

    
    experiment_name = f"decisiontree_{max_depth}"
    
    joblib.dump(model, f"{experiment_name}.pkl")

    print("\nValidation Results:")
    print(f"Model: {experiment_name}")
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(cm)
    
    result = {
        "model": "Decision Tree",
        "model_file": f"{experiment_name}.pkl",
        "variables": {
            "max_depth": max_depth
        },
        "accuracy": accuracy,
        "precision": report_dict["weighted avg"]["precision"],
        "recall": report_dict["weighted avg"]["recall"],
        "f1_score": report_dict["weighted avg"]["f1-score"]
    }
    
    df_result = pd.DataFrame([result])
    if os.path.exists(experiment_log):
        df_result.to_csv(experiment_log, mode='a', header=False, index=False)
    else:
        df_result.to_csv(experiment_log, index=False)
    
    return model


## SMV

In [60]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import os


def train_svm(X_train, y_train, X_test, y_test, C=1.0, kernel='rbf', gamma='scale', experiment_log="bitacora.csv"):
    # Entrenamiento del modelo SVM
    model = SVC(C=C, kernel=kernel, gamma=gamma, probability=True, random_state=42)
    model.fit(X_train, y_train)

    # Predicción
    y_pred = model.predict(X_test)

    # Métricas
    accuracy = accuracy_score(y_test, y_pred)
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)

    # Mostrar resultados

    # Nombre del experimento
    experiment_name = f"svm_C={C}_kernel={kernel}_gamma={gamma}"

    # Guardar modelo
    joblib.dump(model, f"{experiment_name}.pkl")

    print("\nValidation Results:")
    print(f"Model: {experiment_name}")
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(cm)

    # Guardar en bitácora
    result = {
        "model": "SVM",
        "model_file": f"{experiment_name}.pkl",
        "variables": {
            "C": C,
            "kernel": kernel,
            "gamma": gamma
        },
        "accuracy": accuracy,
        "precision": report_dict["weighted avg"]["precision"],
        "recall": report_dict["weighted avg"]["recall"],
        "f1_score": report_dict["weighted avg"]["f1-score"]
    }
    

    df_result = pd.DataFrame([result])
    if os.path.exists(experiment_log):
        df_result.to_csv(experiment_log, mode='a', header=False, index=False)
    else:
        df_result.to_csv(experiment_log, index=False)

    return model


## Bayes

In [61]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os

def train_naive_bayes_vectorized(X_train, y_train, X_test, y_test, laplace=1, experiment_log="bitacora.csv"):
    # Combinar X e y
    df_train = X_train.copy()
    df_train['target'] = y_train
    
    clases = y_train.unique()
    features = X_train.columns

    # Calcular probabilidades a priori
    class_counts = df_train['target'].value_counts()
    priors = (class_counts / len(df_train)).to_dict()

    # Probabilidades condicionales (likelihoods) con Laplace smoothing
    likelihoods = {}
    for feature in features:
        cond_counts = df_train.groupby([feature, 'target']).size().unstack(fill_value=0)
        total_per_class = class_counts + laplace * len(cond_counts.index)
        cond_probs = (cond_counts + laplace).div(total_per_class, axis=1)
        likelihoods[feature] = cond_probs

    # Función de predicción vectorizada
    def predict_vectorized(X):
        log_priors = {cls: np.log(p) for cls, p in priors.items()}
        predictions = []

        for _, row in X.iterrows():
            log_posteriors = {}
            for cls in clases:
                log_posterior = log_priors[cls]
                for feature in features:
                    val = row[feature]
                    prob = likelihoods[feature].get(cls).get(val, laplace / (class_counts[cls] + laplace * len(likelihoods[feature])))
                    log_posterior += np.log(prob)
                log_posteriors[cls] = log_posterior
            predictions.append(max(log_posteriors, key=log_posteriors.get))
        return pd.Series(predictions)

    # Predicción
    y_pred = predict_vectorized(X_test)

    # Métricas
    accuracy = accuracy_score(y_test, y_pred)
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)

    # Guardar modelo
    model_name = f"naivebayesV_laplace={laplace}"
    combined = []
    for feature in features:
        df = likelihoods[feature].copy()
        df.index.name = 'feature_value'
        df.reset_index(inplace=True)
        df['feature'] = feature
        combined.append(df)

    likelihoods_df = pd.concat(combined, ignore_index=True)
    likelihoods_df.to_csv(f"{model_name}_likelihoods.csv", index=False)

    print("\nValidation Results:")
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(cm)

    # Guardar bitácora
    result = {
        "model": "Naive Bayes Vectorized",
        "model_file": f"{model_name}",
        "variables": {
            "laplace": laplace
        },
        "accuracy": accuracy,
        "precision": report_dict["weighted avg"]["precision"],
        "recall": report_dict["weighted avg"]["recall"],
        "f1_score": report_dict["weighted avg"]["f1-score"]
    }
    df_result = pd.DataFrame([result])
    if os.path.exists(experiment_log):
        df_result.to_csv(experiment_log, mode='a', header=False, index=False)
    else:
        df_result.to_csv(experiment_log, index=False)

    return model_name


In [62]:
def load_naive_bayes_model(model_name):
    df = pd.read_csv(f"{model_name}_likelihoods.csv")
    
    clases = [col for col in df.columns if col not in ['feature', 'feature_value']]
    
    likelihoods = {}
    for feature, group in df.groupby('feature'):
        sub_df = group.set_index('feature_value')[clases]
        likelihoods[feature] = sub_df

    primer_feature = list(likelihoods.keys())[0]
    priors = likelihoods[primer_feature].sum().to_dict()
    total = sum(priors.values())
    priors = {k: v / total for k, v in priors.items()}

    return priors, likelihoods

def predict_naive_bayes(X, priors, likelihoods, laplace=1):
    clases = list(priors.keys())
    features = X.columns
    predictions = []

    for _, row in X.iterrows():
        log_posteriors = {}
        for cls in clases:
            log_prob = np.log(priors[cls])
            for feature in features:
                val = row[feature]
                tabla = likelihoods[feature]
                if val in tabla.index and cls in tabla.columns:
                    prob = tabla.at[val, cls]
                else:
                    # smoothing si el valor no fue visto
                    prob = laplace / (laplace * len(tabla.index))  # Denominador estimado
                log_prob += np.log(prob)
            log_posteriors[cls] = log_prob
        predictions.append(max(log_posteriors, key=log_posteriors.get))
    return pd.Series(predictions)


## REG. LOG.

In [63]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os

def sigmoid(z):
    z = np.asarray(z, dtype=np.float64)  # Asegura compatibilidad con np.exp
    return 1 / (1 + np.exp(-z))

def compute_loss_and_gradients(X, y, w, reg_type, lambda_):

    X = np.asarray(X, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64).reshape(-1, 1)
    w = np.asarray(w, dtype=np.float64)

    m = len(y)
    h = sigmoid(X @ w)
    error = h - y

    if reg_type == 'l2':
        reg_term = lambda_ * np.sum(w[1:] ** 2) / (2 * m)
        loss = -np.mean(y * np.log(h + 1e-9) + (1 - y) * np.log(1 - h + 1e-9)) + reg_term
        grad = (X.T @ error) / m
        grad[1:] += (lambda_ / m) * w[1:]

    elif reg_type == 'l1':
        reg_term = lambda_ * np.sum(np.abs(w[1:])) / m
        loss = -np.mean(y * np.log(h + 1e-9) + (1 - y) * np.log(1 - h + 1e-9)) + reg_term
        grad = (X.T @ error) / m
        grad[1:] += (lambda_ / m) * np.sign(w[1:])

    else:  # sin regularización
        loss = -np.mean(y * np.log(h + 1e-9) + (1 - y) * np.log(1 - h + 1e-9))
        grad = (X.T @ error) / m

    return loss, grad

def train_logistic_regression_numpy(X_train, y_train, X_val, y_val,
                                     lr=0.01, lambda_=0.1, epochs=1000,
                                     batch_size=32, reg_type='l2',
                                     experiment_log='bitacora.csv'):
    # Preparar datos
    X_train = np.c_[np.ones(X_train.shape[0]), X_train]  # agregar bias
    X_val = np.c_[np.ones(X_val.shape[0]), X_val]
    y_train = y_train.values.reshape(-1, 1)
    y_val = y_val.values.reshape(-1, 1)

    n_features = X_train.shape[1]
    w = np.zeros((n_features, 1))

    # Mini-batch training
    for epoch in range(epochs):
        indices = np.random.permutation(X_train.shape[0])
        X_train = X_train[indices]
        y_train = y_train[indices]

        for i in range(0, X_train.shape[0], batch_size):
            X_batch = X_train[i:i+batch_size]
            y_batch = y_train[i:i+batch_size]
            loss, grad = compute_loss_and_gradients(X_batch, y_batch, w, reg_type, lambda_)
            w -= lr * grad

    # Validación
    y_val_pred = sigmoid(X_val @ w) >= 0.5
    y_val_pred = y_val_pred.astype(int)

    accuracy = accuracy_score(y_val, y_val_pred)
    report_dict = classification_report(y_val, y_val_pred, output_dict=True)
    cm = confusion_matrix(y_val, y_val_pred)



    # Guardar modelo
    reg_label = f"{reg_type}Reg" if reg_type in ['l1', 'l2'] else "NoReg"
    model_name = f"logreg_lr={lr}_lambda={lambda_}_batch={batch_size}_{reg_label}"
    np.save(f"{model_name}.npy", w)

    print("\nValidation Results:")
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_val, y_val_pred))
    print("Confusion Matrix:")
    print(cm)

    # Bitácora
    result = {
        "model": "Logistic Regression",
        "model_file": f"{model_name}.npy",
        "variables": {
            "lr": lr,
            "lambda": lambda_,
            "batch_size": batch_size,
            "reg_type": reg_type
        },
        "accuracy": accuracy,
        "precision": report_dict["weighted avg"]["precision"],
        "recall": report_dict["weighted avg"]["recall"],
        "f1_score": report_dict["weighted avg"]["f1-score"]
    }
    df_result = pd.DataFrame([result])
    if os.path.exists(experiment_log):
        df_result.to_csv(experiment_log, mode='a', header=False, index=False)
    else:
        df_result.to_csv(experiment_log, index=False)

    return w  # puedes retornar también model_name si quieres usarlo en inferencia


In [64]:
def predict_logistic_regression(X, w, threshold=0.5):
    X = np.asarray(X, dtype=np.float64)
    
    X = np.c_[np.ones(X.shape[0]), X]
    
    w = np.asarray(w, dtype=np.float64)
    probs = sigmoid(X @ w)
    return (probs >= threshold).astype(int)



# Entrenamientos

In [65]:
target = 'passenger_survived'
X_train = df_train.drop(columns=[target])
y_train = df_train[target]
X_val = df_validate.drop(columns=[target])
y_val = df_validate[target]

In [66]:
# Probar 10 configuraciones para cada modelo

# Árbol de Decisión
for max_depth in range(3, 13):
    train_decision_tree(X_train, y_train, X_val, y_val, max_depth, experiment_log="bitacora.csv")

# SVM
for C in [0.1, 0.5, 1, 2, 5, 10, 20, 50, 100, 200]:
    train_svm(X_train, y_train, X_val, y_val, C=C, kernel='rbf', gamma='scale', experiment_log="bitacora.csv")

# Naive Bayes (variando laplace)
for laplace in range(1, 11):
    train_naive_bayes_vectorized(X_train, y_train, X_val, y_val, laplace=laplace, experiment_log="bitacora.csv")

# Regresión Logística (variando lambda)
for lambda_ in [0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10]:
    train_logistic_regression_numpy(X_train, y_train, X_val, y_val, lr=0.01, lambda_=lambda_, epochs=1000, batch_size=32, reg_type='l2', experiment_log="bitacora.csv")


Validation Results:
Model: decisiontree_3
Accuracy: 0.78
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       113
           1       0.72      0.63      0.67        65

    accuracy                           0.78       178
   macro avg       0.76      0.74      0.75       178
weighted avg       0.77      0.78      0.77       178

Confusion Matrix:
[[97 16]
 [24 41]]

Validation Results:
Model: decisiontree_4
Accuracy: 0.78
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       113
           1       0.73      0.63      0.68        65

    accuracy                           0.78       178
   macro avg       0.77      0.75      0.76       178
weighted avg       0.78      0.78      0.78       178

Confusion Matrix:
[[98 15]
 [24 41]]

Validation Results:
Model: decisiontree_5
Accuracy: 0.77
Classification Report:
              precision    re

# Ensemble Voting

In [67]:
x_test = df_test.drop(columns=[target])
y_test = df_test[target]

In [68]:
# Cargar la bitácora
bitacora = pd.read_csv('bitacora.csv')

# Encontrar el mejor modelo por técnica según accuracy
mejores = bitacora.loc[bitacora.groupby('model')['accuracy'].idxmax()]

print("Mejor modelo de cada técnica:")
print(mejores[['model', 'model_file', 'accuracy', 'precision', 'recall', 'f1_score']])

Mejor modelo de cada técnica:
                     model                                  model_file  \
1            Decision Tree                          decisiontree_4.pkl   
38     Logistic Regression  logreg_lr=0.01_lambda=5_batch=32_l2Reg.npy   
20  Naive Bayes Vectorized                       naivebayesV_laplace=1   
11                     SVM        svm_C=0.5_kernel=rbf_gamma=scale.pkl   

    accuracy  precision    recall  f1_score  
1   0.780899   0.777302  0.780899  0.776946  
38  0.758427   0.768052  0.758427  0.737482  
20  0.758427   0.763116  0.758427  0.760168  
11  0.786517   0.783464  0.786517  0.781044  


In [74]:
# Cargar la bitácora y encontrar el mejor modelo de cada técnica
bitacora = pd.read_csv('bitacora.csv')
mejores = bitacora.loc[bitacora.groupby('model')['accuracy'].idxmax()]

# Prepara los datos de test
X_test = df_test.drop(columns=['passenger_survived'])
y_test = df_test['passenger_survived']

# Diccionario para almacenar predicciones
predicciones = {}

# Árbol de Decisión
modelo_arbol = joblib.load(mejores.loc[mejores['model'] == 'Decision Tree', 'model_file'].values[0])
predicciones['arbol'] = modelo_arbol.predict(X_test)

# SVM
modelo_svm = joblib.load(mejores.loc[mejores['model'] == 'SVM', 'model_file'].values[0])
predicciones['svm'] = modelo_svm.predict(X_test)

# Naive Bayes
priors, likelihoods = load_naive_bayes_model(mejores.loc[mejores['model'] == 'Naive Bayes Vectorized', 'model_file'].values[0])
predicciones['bayes'] = predict_naive_bayes(X_test, priors, likelihoods, laplace=1)


# Regresión Logística
model_file = mejores.loc[mejores['model'] == 'Logistic Regression', 'model_file'].values[0]
w = np.load(f"{model_file}")

predicciones['log'] = predict_logistic_regression(X_test, w)

# Ensemble Voting (mayoría)
from scipy.stats import mode

for key in predicciones:
    predicciones[key] = np.array(predicciones[key]).flatten().astype(int)

y_ensemble = np.stack([
    predicciones['arbol'],
    predicciones['svm'],
    predicciones['bayes'],
    predicciones['log']
], axis=0)

from scipy.stats import mode
y_pred = mode(y_ensemble, axis=0).mode.flatten()

# Métricas
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Ensemble Test Results:\naccuracy: {acc}\nprecision: {prec}\nrecall: {rec}\nf1: {f1}')

Ensemble Test Results:
accuracy: 0.7932960893854749
precision: 0.9148936170212766
recall: 0.5657894736842105
f1: 0.6991869918699187


# K-fold

K-Fold Cross-Validation es una técnica para evaluar modelos dividiendo los datos en K partes (folds). El modelo se entrena K veces, usando un fold diferente como validación en cada iteración. Al final, se promedian los resultados para tener una evaluación más confiable. Tambien permitiendo multithreding.

Aplicación:
Se pudo usar para comparar mejor los modelos y reducir el sesgo de usar un solo conjunto de validación.