In [3]:
import pandas as pd

df = pd.read_csv("titanic_cleaned.csv")

y = df['Survived']
X = df.drop(columns=['Survived', 'Name', 'Ticket', 'PassengerId'])

X = pd.get_dummies(X, drop_first=True)
X = X.astype(int)


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
pred_tree = tree.predict(X_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, pred_tree))


Decision Tree Accuracy: 0.7877094972067039


In [6]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42, n_estimators=200)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, pred_rf))


Random Forest Accuracy: 0.8268156424581006


In [7]:
print("Tree train acc:", tree.score(X_train, y_train))
print("Tree test acc:", tree.score(X_test, y_test))

print("RF train acc:", rf.score(X_train, y_train))
print("RF test acc:", rf.score(X_test, y_test))


Tree train acc: 0.9606741573033708
Tree test acc: 0.7877094972067039
RF train acc: 0.9606741573033708
RF test acc: 0.8268156424581006


In [8]:
rf_tuned = RandomForestClassifier(
    n_estimators=300,
    max_depth=5,
    min_samples_split=10,
    random_state=42
)

rf_tuned.fit(X_train, y_train)

print("Tuned RF Train acc:", rf_tuned.score(X_train, y_train))
print("Tuned RF Test acc:", rf_tuned.score(X_test, y_test))


Tuned RF Train acc: 0.8567415730337079
Tuned RF Test acc: 0.8044692737430168
