In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("titanic_cleaned.csv")

y = df['Survived']
X = df.drop(columns=['Survived', 'Name', 'Ticket', 'PassengerId'])

X = pd.get_dummies(X, drop_first=True)
X = X.astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=5,
    min_samples_split=10,
    random_state=42
)

cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy')

print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())


Cross-validation scores: [0.85314685 0.85314685 0.81690141 0.78169014 0.83802817]
Mean CV accuracy: 0.8285826849207132


In [3]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestClassifier(random_state=42)

grid = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)
print("Best CV Accuracy:", grid.best_score_)


Best Parameters: {'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 100}
Best CV Accuracy: 0.8299911356249385
