#### Mohammad Zubair Hussain

# Titanic Survival Prediction with Random Forest

This notebook demonstrates how to build a Random Forest model to predict the survival of passengers on the Titanic.

## 1. Load and Prepare Data

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import joblib

In [49]:
# Read the preprocessed train and test CSVs into DataFrames.
# Both paths are relative to the notebook's working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Titanic/data/titanic/processed_train.csv',
        '../Titanic/data/titanic/processed_test.csv',
    )
)

In [50]:
# Separate the prediction target from the feature matrix.
# 'Survived' is the target; it and the other listed columns are removed from X.
non_feature_cols = ['PassengerId', 'Survived', 'Name', 'Ticket']
y = train_data['Survived']
X = train_data.drop(columns=non_feature_cols)

In [51]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## 2. Encode Categorical Features

In [52]:
# One-hot encoder for the categorical columns. With handle_unknown='ignore',
# a category not seen during fitting is encoded as all zeros instead of
# raising an error at transform time.
encoder = OneHotEncoder(
    handle_unknown='ignore',
)

In [53]:
# Learn the category vocabulary from the training split only, then apply the
# same fitted encoding to the held-out split.
categorical_cols = ['AgeGroup', 'FareRange', 'FarePerClassRange']
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])

In [54]:
# NOTE: this cell rebinds X_train / X_test from DataFrames to NumPy arrays and
# densifies the encoded matrices in place, so it can only be run once per
# kernel session after the cells above it.
categorical_cols = ['AgeGroup', 'FareRange', 'FarePerClassRange']

# Densify the one-hot output so it can be joined with the remaining columns
X_train_encoded, X_test_encoded = X_train_encoded.toarray(), X_test_encoded.toarray()

# Remove the raw categorical columns now that their encoded form exists
X_train_rest = X_train.drop(columns=categorical_cols)
X_test_rest = X_test.drop(columns=categorical_cols)

# Final layout: remaining columns first, one-hot columns appended on the right
X_train = np.concatenate((X_train_rest, X_train_encoded), axis=1)
X_test = np.concatenate((X_test_rest, X_test_encoded), axis=1)


## 3. Train the Random Forest Model

In [55]:
# Random Forest classifier; only the seed is set explicitly, so every other
# hyperparameter keeps its library default.
rf_model = RandomForestClassifier(
    random_state=42,
)

In [56]:
# Fit the forest on the training split and predict labels for the held-out
# split in one step (fit() returns the fitted estimator itself).
y_pred = rf_model.fit(X_train, y_train).predict(X_test)

## 4. Evaluate Model Performance

In [57]:
# Share of held-out rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8324


In [58]:
# Per-class precision, recall and F1 on the held-out split
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.85      0.86       105
           1       0.79      0.81      0.80        74

    accuracy                           0.83       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.83      0.83      0.83       179



## 5. Save the Model

In [None]:
# Persist the fitted forest to disk with joblib.
# NOTE(review): only rf_model is written here; the fitted OneHotEncoder is not
# saved by this cell, so reusing the pickle on new data would also need the
# encoder (or a re-fit on the same training data).
model_path = '../Titanic/data/titanic/titanic_rf_model.pkl'
joblib.dump(rf_model, model_path)

['titanic_rf_model.pkl']

In [59]:
# Build the feature frame for the submission set by removing the same
# non-feature columns that were dropped from the training data.
# 'Survived' is not dropped here (presumably absent from the test file).
submission_drop_cols = ['PassengerId', 'Name', 'Ticket']
X_test_data = test_data.drop(columns=submission_drop_cols)

## 6. Predict on Test Data and Create Submission

In [60]:
# Encode the test-set categoricals with the encoder fitted on the training
# split, then rebuild the feature matrix in the layout used for training:
# remaining columns first, one-hot columns appended on the right.
categorical_cols = ['AgeGroup', 'FareRange', 'FarePerClassRange']
X_test_data_encoded = encoder.transform(X_test_data[categorical_cols]).toarray()

# The model was fitted on a bare NumPy array, so features are matched purely
# by position. Align the remaining columns to the training frame's order and
# fail loudly on any mismatch, rather than silently predicting from shuffled,
# missing or extra features.
train_feature_cols = [col for col in X.columns if col not in categorical_cols]
test_feature_cols = [col for col in X_test_data.columns if col not in categorical_cols]
if set(test_feature_cols) != set(train_feature_cols):
    raise ValueError(
        "Test features do not match training features: "
        f"missing={sorted(set(train_feature_cols) - set(test_feature_cols))}, "
        f"unexpected={sorted(set(test_feature_cols) - set(train_feature_cols))}"
    )

X_test_data = np.concatenate(
    (X_test_data[train_feature_cols], X_test_data_encoded),
    axis=1,
)

In [61]:
# Predict a survival label for every row of the prepared submission matrix
test_predictions = rf_model.predict(
    X_test_data,
)

In [None]:
# Pair each test passenger's id with its predicted label in a two-column
# frame (PassengerId, Survived).
submission = pd.DataFrame(
    {'PassengerId': test_data['PassengerId'], 'Survived': test_predictions}
)

# Write the predictions to CSV, leaving out the DataFrame index column
submission_path = '../Titanic/data/titanic/titanic_submission_manual.csv'
submission.to_csv(submission_path, index=False)