# Random Forest Prediction

In [18]:
from pathlib import Path

import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

### Train on the Original Dataset

In [19]:
# Read the training file and drop the columns that are not used as model inputs
df_orig = pd.read_csv("train.csv").drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Fill missing values: median for Age, most frequent value for Embarked
df_orig = df_orig.fillna({
    'Age': df_orig['Age'].median(),
    'Embarked': df_orig['Embarked'].mode()[0],
})

# One-hot encode the categorical columns (first level of each is dropped)
df_orig = pd.get_dummies(df_orig, columns=['Sex', 'Embarked'], drop_first=True)

# Target vector and feature matrix
y_orig = df_orig['Survived']
X_orig = df_orig.drop(columns='Survived')

# Hold out 20% of the rows for validation (fixed seed for reproducibility)
X_train_orig, X_val_orig, y_train_orig, y_val_orig = train_test_split(
    X_orig,
    y_orig,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree forest on the training part; fit() returns the estimator itself
rf_orig = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_orig, y_train_orig
)

# Score the held-out rows
val_pred_orig = rf_orig.predict(X_val_orig)
acc_orig = accuracy_score(y_true=y_val_orig, y_pred=val_pred_orig)
print("🌲 Random Forest on ORIGINAL data - Validation Accuracy:", acc_orig)

🌲 Random Forest on ORIGINAL data - Validation Accuracy: 0.8212290502793296


Generate the prediction submission from the original dataset

In [20]:
# Load test data; keep the ids aside because they go into the submission file,
# not into the model
test_orig = pd.read_csv("test.csv")
passenger_ids_orig = test_orig['PassengerId']

# Imputation values are taken from the raw TRAINING file rather than from the
# test set, so test rows are filled with statistics learned from the data the
# model was trained on. Age and Embarked match what the training cell used;
# Fare is not imputed there, so its training median is used here.
# X_orig cannot supply these because its Embarked column is already one-hot
# encoded.
train_raw_orig = pd.read_csv("train.csv", usecols=['Age', 'Fare', 'Embarked'])
impute_values_orig = {
    'Age': train_raw_orig['Age'].median(),
    'Fare': train_raw_orig['Fare'].median(),
    'Embarked': train_raw_orig['Embarked'].mode()[0],
}

# Basic preprocessing (PassengerId is removed by the column alignment below)
test_orig = test_orig.drop(columns=['Name', 'Ticket', 'Cabin']).fillna(impute_values_orig)

# Encode categorical. Every level is kept here on purpose: applying drop_first
# independently on the test frame would drop a different level than in training
# whenever a category is absent from the test data. The alignment step below
# selects exactly the dummy columns the model was trained on.
test_orig = pd.get_dummies(test_orig, columns=['Sex', 'Embarked'])

# Align test columns to train: same columns, same order, 0 for any training
# column that does not occur in the test frame
test_orig = test_orig.reindex(columns=X_orig.columns, fill_value=0)

# Predict and create submission
preds_orig = rf_orig.predict(test_orig)
submission_orig = pd.DataFrame({
    'PassengerId': passenger_ids_orig,
    'Survived': preds_orig
})

# to_csv does not create missing parent directories, so make sure it exists
Path("submission").mkdir(parents=True, exist_ok=True)
submission_orig.to_csv("submission/submission_rf_original.csv", index=False)
print("✅ submission_rf_original.csv saved.")


✅ submission_rf_original.csv saved.


### Train on Engineered Data

In [21]:
# Read the pre-engineered training set
df_clean = pd.read_csv("train_cleaned.csv")

# Target vector and feature matrix
y_clean = df_clean['Survived']
X_clean = df_clean.drop(columns='Survived')

# Hold out 20% of the rows for validation (fixed seed for reproducibility)
X_train_clean, X_val_clean, y_train_clean, y_val_clean = train_test_split(
    X_clean,
    y_clean,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree forest on the training part; fit() returns the estimator itself
rf_clean = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_clean, y_train_clean
)

# Score the held-out rows
val_pred_clean = rf_clean.predict(X_val_clean)
acc_clean = accuracy_score(y_true=y_val_clean, y_pred=val_pred_clean)
print("🌟 Random Forest on CLEANED data - Validation Accuracy:", acc_clean)

🌟 Random Forest on CLEANED data - Validation Accuracy: 0.8379888268156425


Generate the prediction submission from the engineered data

In [22]:
# Load cleaned test data; the ids are kept for the submission file and removed
# from the frame that is passed to the model
test_clean = pd.read_csv("test_cleaned.csv")
passenger_ids_clean = test_clean['PassengerId']
test_clean = test_clean.drop(columns='PassengerId')

# Predict and create submission
preds_clean = rf_clean.predict(test_clean)
submission_clean = pd.DataFrame({
    'PassengerId': passenger_ids_clean,
    'Survived': preds_clean
})

# to_csv does not create missing parent directories, so make sure it exists
Path("submission").mkdir(parents=True, exist_ok=True)
submission_clean.to_csv("submission/submission_rf_cleaned.csv", index=False)
print("✅ submission_rf_cleaned.csv saved.")

✅ submission_rf_cleaned.csv saved.


### Optimize the model

Grid Search with Cross-Validation

In [23]:
# GridSearchCV and RandomForestClassifier come from the import cell at the top
# of the notebook; they are not re-imported here.

# Define parameter grid (3 x 4 x 3 = 36 candidate combinations)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8, None],
    'min_samples_split': [2, 5, 10],
}

# Grid search with 5-fold CV, scored by accuracy and run on all available cores
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# The search is run on the full engineered training set (X_clean / y_clean)
grid_search.fit(X_clean, y_clean)

print("✅ Best Parameters:", grid_search.best_params_)
print("✅ Best Cross-Validation Accuracy:", grid_search.best_score_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
✅ Best Parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}
✅ Best Cross-Validation Accuracy: 0.8305316678174629


Retrain with the best parameters

In [24]:
# GridSearchCV was created without a `refit` argument, so by default it has
# already refitted the winning configuration on all of X_clean / y_clean.
# `best_rf` is that full-data model and is left untouched so that the final
# test-set predictions use it.
best_rf = grid_search.best_estimator_

# Train/test split (same parameters as the earlier cleaned-data split)
X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42
)

# Validate on an unfitted copy with identical hyper-parameters. Calling
# best_rf.fit(...) here would retrain grid_search.best_estimator_ in place and
# leave the submission model trained on only 80% of the rows.
rf_opt_val = clone(best_rf)
rf_opt_val.fit(X_train_opt, y_train_opt)
val_pred_opt = rf_opt_val.predict(X_val_opt)

# NOTE: the grid search above was fitted on every row of X_clean, including the
# rows in X_val_opt, so this hold-out score is optimistic; the cross-validation
# accuracy reported by the grid search is the less biased estimate.
print("🎯 Validation Accuracy (Optimized RF):", accuracy_score(y_val_opt, val_pred_opt))

🎯 Validation Accuracy (Optimized RF): 0.8491620111731844


Predict on Test Set (Cleaned)

In [25]:
# Load test data; the ids are kept for the submission file and removed from the
# frame that is passed to the model
test_clean = pd.read_csv("test_cleaned.csv")
passenger_ids_clean = test_clean['PassengerId']
test_clean = test_clean.drop(columns='PassengerId')

# Predict with optimized model
preds_opt = best_rf.predict(test_clean)

# Save submission
submission_opt = pd.DataFrame({
    'PassengerId': passenger_ids_clean,
    'Survived': preds_opt
})

# to_csv does not create missing parent directories, so make sure it exists
Path("submission").mkdir(parents=True, exist_ok=True)
submission_opt.to_csv("submission/submission_rf_optimized.csv", index=False)
print("✅ submission_rf_optimized.csv saved.")


✅ submission_rf_optimized.csv saved.
