In [None]:
# Import libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix, auc
from scipy.stats import uniform, randint
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Read the cleaned dataset written by the previous step into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file exists.
CLEANED_DATA_PATH = "C:/Users/juane/OneDrive/Escritorio/Datos/Kaggle_Titanic/cleaned_data.csv"
all_data_df = pd.read_csv(CLEANED_DATA_PATH)

In [None]:
# Feature engineering: derive aggregate spending features from the amenity columns.
spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Accumulate the amenity columns left to right to get each row's total spend.
total_spending = all_data_df[spending_columns[0]]
for spending_column in spending_columns[1:]:
    total_spending = total_spending + all_data_df[spending_column]
all_data_df['TotalSpending'] = total_spending

# Spend relative to age; the +1 keeps the denominator non-zero when Age is 0.
age_plus_one = all_data_df['Age'] + 1
all_data_df['SpendingPerAge'] = all_data_df['TotalSpending'] / age_plus_one

In [None]:
# One-hot encode the categorical variables.
categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

# Dense output is requested through `sparse_output` (scikit-learn >= 1.2); the older
# `sparse` keyword was removed in 1.4. On releases that predate the rename the new
# keyword is rejected with TypeError, so fall back to the old spelling there.
# drop='first' omits the first category of each feature from the encoded output.
try:
    one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(drop='first', sparse=False)

# Dense array with one column per retained category.
encoded_features = one_hot_encoder.fit_transform(all_data_df[categorical_columns])

In [None]:
# Define features and target variable.
# The raw categorical columns are swapped for their one-hot encoded counterparts so
# that the encoding computed above actually reaches the models and the raw
# categories are not passed on to the scaler.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=one_hot_encoder.get_feature_names_out(categorical_columns),
    index=all_data_df.index,  # align row-wise with the original frame for the concat
)
X = pd.concat(
    [all_data_df.drop(columns=['Transported'] + categorical_columns), encoded_df],
    axis=1,
)
# NOTE(review): any other non-numeric column still present in the cleaned data would
# need encoding or dropping as well before scaling — confirm against the CSV.
y = all_data_df['Transported']

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Create the scaler and learn the per-feature min/max from the training data only.
scaler = MinMaxScaler()
scaler.fit(X_train)

# Apply the training-set scaling to both the training and the test features.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Instantiate the candidate classifiers; every estimator that takes a seed gets the same one.
seed = 42

log_reg = LogisticRegression(random_state=seed)
rf_clf = RandomForestClassifier(random_state=seed)
gb_clf = GradientBoostingClassifier(random_state=seed)
# probability=True because predict_proba is called on this model further down.
svm_clf = SVC(random_state=seed, probability=True)
knn_clf = KNeighborsClassifier()
xgb_clf = XGBClassifier(random_state=seed)

In [None]:
# Fit every classifier on the scaled training data, in the same order as before.
for classifier in (log_reg, rf_clf, gb_clf, svm_clf, knn_clf, xgb_clf):
    classifier.fit(X_train, y_train)

# Echo the last fitted estimator as the cell output.
xgb_clf

In [None]:
# Predict class labels on the test set with each fitted classifier.
(
    y_pred_log_reg,
    y_pred_rf,
    y_pred_gb,
    y_pred_svm,
    y_pred_knn,
    y_pred_xgb,
) = (
    classifier.predict(X_test)
    for classifier in (log_reg, rf_clf, gb_clf, svm_clf, knn_clf, xgb_clf)
)

In [None]:
# Probability scores for ROC-AUC: keep column 1 of each model's predict_proba output.
(
    y_pred_proba_log_reg,
    y_pred_proba_rf,
    y_pred_proba_gb,
    y_pred_proba_svm,
    y_pred_proba_knn,
    y_pred_proba_xgb,
) = (
    classifier.predict_proba(X_test)[:, 1]
    for classifier in (log_reg, rf_clf, gb_clf, svm_clf, knn_clf, xgb_clf)
)

In [None]:
# Evaluate models
def evaluate_model(y_test, y_pred, y_pred_proba):
    """Score one model's test-set predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted class labels, scored with accuracy, precision, recall and F1.
        y_pred_proba: Predicted scores, used for ROC-AUC.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)``.
    """
    return (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        roc_auc_score(y_test, y_pred_proba),
    )

# Score each model from its (predicted labels, predicted probabilities) pair.
(
    log_reg_metrics,
    rf_metrics,
    gb_metrics,
    svm_metrics,
    knn_metrics,
    xgb_metrics,
) = (
    evaluate_model(y_test, predicted_labels, predicted_scores)
    for predicted_labels, predicted_scores in (
        (y_pred_log_reg, y_pred_proba_log_reg),
        (y_pred_rf, y_pred_proba_rf),
        (y_pred_gb, y_pred_proba_gb),
        (y_pred_svm, y_pred_proba_svm),
        (y_pred_knn, y_pred_proba_knn),
        (y_pred_xgb, y_pred_proba_xgb),
    )
)

In [None]:
# Print one summary line per model; all metrics are rounded to two decimals.
metrics_line = "{}: Accuracy={:.2f}, Precision={:.2f}, Recall={:.2f}, F1-Score={:.2f}, ROC-AUC={:.2f}"
metrics_by_model = (
    ("Logistic Regression", log_reg_metrics),
    ("Random Forest", rf_metrics),
    ("Gradient Boosting", gb_metrics),
    ("SVM", svm_metrics),
    ("KNN", knn_metrics),
    ("XGBoost", xgb_metrics),
)
for model_name, model_metrics in metrics_by_model:
    # model_metrics is ordered (accuracy, precision, recall, f1, roc_auc).
    print(metrics_line.format(model_name, *model_metrics))

In [None]:
# ROC curve for the model selected as best: Gradient Boosting.
best_model = gb_clf
y_pred_proba_best = y_pred_proba_gb

# False/true positive rates across thresholds, and the area under that curve.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_best)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal drawn as a reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()

In [None]:
# Plot the Precision-Recall curve for the best model.
# Precision/recall pairs across thresholds, and the area under that curve.
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba_best)
pr_auc = auc(recall, precision)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='blue', lw=2, label=f'Precision-Recall curve (area = {pr_auc:.2f})')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.grid(True)
plt.show()

In [None]:
# Function to plot the confusion matrix
def plot_confusion_matrix(y_test, y_pred, title):
    """Draw the confusion matrix of a set of predictions as an annotated heatmap.

    Args:
        y_test: True labels.
        y_pred: Predicted class labels.
        title: Text used as the figure title.

    The y axis is labelled 'Actual' and the x axis 'Predicted'; each cell shows its
    integer count. The figure is shown immediately and nothing is returned.
    """
    matrix = confusion_matrix(y_test, y_pred)
    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

# Show the Gradient Boosting model's test-set confusion matrix as a heatmap.
plot_confusion_matrix(y_test=y_test, y_pred=y_pred_gb, title='Gradient Boosting')

In [None]:
# Compute the Gradient Boosting confusion matrix and print it under a heading line.
cm_gb = confusion_matrix(y_test, y_pred_gb)
print("Confusion Matrix for Gradient Boosting:", cm_gb, sep="\n")