In [1]:
# Mount Google Drive inside the Colab runtime so the files stored there can be
# reached through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/gdrive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [2]:
# Eryk Elizondo González
# Switch the kernel's working directory to the dataset folder on the mounted
# Drive, so the relative file names used later ('train.csv', 'test.csv',
# 'Prediction_RandomForest.csv') resolve inside this folder.
%cd "/content/gdrive/MyDrive/ColabNotebooks/TC3006_7C_101_E5/Titanic/Datasets"

/content/gdrive/MyDrive/ColabNotebooks/TC3006_7C_101_E5/Titanic/Datasets


Fuente: https://www.kaggle.com/code/startupsci/titanic-data-science-solutions/notebook

In [3]:
# Importar librerias
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Función para limpiar los datos
def limpiar_datos(df):
    """Clean a raw Titanic passenger frame and derive the model features.

    The input frame is left untouched: all work happens on a copy, so the
    function can be called again on the same raw frame (for example when the
    cell is re-run) and always yields the same result.

    Steps performed:
      * 'Sex' is encoded as 1 (female) / 0 (male).
      * Missing 'Age' values are filled with the median age of the passenger's
        ('Sex', 'Pclass') group; if that whole group has no known age, the
        overall median age is used instead.
      * 'Age' is truncated to an integer and replaced by its band index
        (0-4) over the edges -1, 16, 32, 48, 64, 80. Ages above 80 fall
        outside the bins and come out as NaN.
      * 'IsAlone' is 1 when 'SibSp' + 'Parch' is 0, else 0.
      * 'Age*Class' is the age band multiplied by 'Pclass'.
      * 'Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin' and 'Embarked'
        are dropped.

    Args:
        df: DataFrame containing at least the columns 'Sex', 'Age', 'Pclass',
            'SibSp', 'Parch', 'Name', 'Ticket', 'Fare', 'Cabin' and
            'Embarked'. Any other column is passed through unchanged.

    Returns:
        A new DataFrame with the cleaned and derived columns.

    Raises:
        ValueError: if 'Sex' holds a value other than 'female'/'male', or if
            no age at all is available to fill the missing ones (the integer
            casts cannot represent NaN).
        KeyError: if one of the required columns is missing.
    """
    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    # Encode 'Sex' numerically; unmapped values become NaN and make the cast fail.
    df['Sex'] = df['Sex'].map({'female': 1, 'male': 0}).astype(int)

    # Fill missing ages with the median of the ('Sex', 'Pclass') group, falling
    # back to the overall median when an entire group has no known age.
    group_median = df.groupby(['Sex', 'Pclass'])['Age'].transform('median')
    overall_median = df['Age'].median()
    age_filled = df['Age'].fillna(group_median).fillna(overall_median)
    df['Age'] = age_filled.astype(int)

    # Replace each age by the index of its band.
    df['Age'] = pd.cut(df['Age'], bins=[-1, 16, 32, 48, 64, 80], labels=False, include_lowest=True)

    # Derived features.
    df['IsAlone'] = (df['SibSp'] + df['Parch'] == 0).astype(int)
    df['Age*Class'] = df['Age'] * df['Pclass']

    # Drop the columns the models do not use (returns a new frame).
    return df.drop(columns=['Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'])

In [5]:
# Load each CSV from the working directory and run it through the cleaning
# function, training set first and then the test set.
train_raw = pd.read_csv('train.csv')
train_data = limpiar_datos(train_raw)

test_raw = pd.read_csv('test.csv')
test_data = limpiar_datos(test_raw)

In [6]:
# Build the training and test sets: the target is 'Survived', and
# 'PassengerId' is an identifier rather than a feature, so it is excluded
# from both feature matrices.
Y_train = train_data["Survived"]
X_train = train_data.drop(columns=["Survived", "PassengerId"])
X_test = test_data.drop(columns="PassengerId")

In [7]:
# Random Forest model
# A fixed random_state pins the bootstrap samples and per-split feature draws,
# so the fitted forest (and the predictions exported from it) are identical on
# every Restart & Run All.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)

# NOTE: the score is computed on the same rows the model was fitted on, so it
# is training-set accuracy, not an estimate of performance on unseen data.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
print("Accuracy Random Forest:", acc_random_forest)

Accuracy Random Forest: 81.82


In [8]:
# Decision Tree model
# scikit-learn permutes the candidate features at every split, so equally good
# splits can be broken differently between runs; a fixed random_state makes
# the fitted tree reproducible.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, Y_train)

# NOTE: the score is computed on the same rows the model was fitted on, so it
# is training-set accuracy, not an estimate of performance on unseen data.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
print("Accuracy Decision Tree:", acc_decision_tree)

Accuracy Decision Tree: 81.82


In [9]:
# Predict with the Random Forest model and export the result: one row per
# test passenger, written without the index column.
passenger_ids = test_data["PassengerId"]
rf_predictions = random_forest.predict(X_test)

submission = pd.DataFrame({"PassengerId": passenger_ids, "Survived": rf_predictions})
submission.to_csv('Prediction_RandomForest.csv', index=False)