1. Load the Titanic dataset from an online source

In [115]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import os

PATH_DATA_FILE = 'Titanic_Dataset'
if not os.path.exists(PATH_DATA_FILE):
    os.makedirs(PATH_DATA_FILE)
    url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
    titanic = pd.read_csv(url)
    titanic.to_csv(os.path.join(PATH_DATA_FILE, 'Dataset'))
else:
    titanic = pd.read_csv(os.path.join(PATH_DATA_FILE, 'Dataset'))

print(titanic.head)

<bound method NDFrame.head of      Unnamed: 0  PassengerId  Survived  Pclass  \
0             0            1         0       3   
1             1            2         1       1   
2             2            3         1       3   
3             3            4         1       1   
4             4            5         0       3   
..          ...          ...       ...     ...   
886         886          887         0       2   
887         887          888         1       1   
888         888          889         0       3   
889         889          890         1       1   
890         890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel

2. Select relevant features and preprocess data

In [116]:
titanic_data = titanic[['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Parch', 'Survived']].copy()
titanic_data.loc[:, 'Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())
titanic_data.loc[:, 'Sex'] = titanic_data['Sex'].map({'male': 0, 'female': 1})

print(titanic_data)

     Pclass Sex   Age     Fare  SibSp  Parch  Survived
0         3   0  22.0   7.2500      1      0         0
1         1   1  38.0  71.2833      1      0         1
2         3   1  26.0   7.9250      0      0         1
3         1   1  35.0  53.1000      1      0         1
4         3   0  35.0   8.0500      0      0         0
..      ...  ..   ...      ...    ...    ...       ...
886       2   0  27.0  13.0000      0      0         0
887       1   1  19.0  30.0000      0      0         1
888       3   1  28.0  23.4500      1      2         0
889       1   0  26.0  30.0000      0      0         1
890       3   0  32.0   7.7500      0      0         0

[891 rows x 7 columns]


3. Split the data into features (X) and target (y)

In [117]:
X = titanic_data[['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Parch']]
y = titanic_data['Survived']

print(max(titanic_data['Fare']))

# Convert multiple columns to smaller integer types
titanic_data['Sex'] = titanic_data['Sex'].astype('int8')
titanic_data['Age'] = titanic_data['Age'].astype('int8')
titanic_data['Pclass'] = titanic_data['Pclass'].astype('int8')
titanic_data['SibSp'] = titanic_data['SibSp'].astype('int8')
titanic_data['Fare'] = titanic_data['Fare'].astype('float16')
titanic_data['Parch'] = titanic_data['Parch'].astype('int8')
titanic_data['Survived'] = titanic_data['Survived'].astype('int8')

print("Memory usage before conversion:")
# Check memory usage before conversion
print(f'{round(titanic.memory_usage(deep=True).sum() / 1024, 3)} KB')

print("Memory usage after conversion to int8:")
# Check memory usage after conversion
print(f'{round(titanic_data.memory_usage(deep=True).sum() / 1024, 3)} KB')

512.3292
Memory usage before conversion:
321.988 KB
Memory usage after conversion to int8:
7.086 KB


In [118]:
# 4. Split the dataset into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


5. Create and train the Random Forest model

In [119]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

6. Make predictions on the test set

In [120]:
y_pred = model.predict(X_test)

7. Evaluate the model

In [121]:
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

8. Print the evaluation metrics

In [122]:
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.7835820895522388
F1 Score: 0.7238095238095238
Precision: 0.7676767676767676
Recall: 0.6846846846846847


9. Generate random passenger data for prediction

In [123]:
# Random values for Pclass (1 to 3), Sex (0 for male, 1 for female), Age (1 to 80), Fare (random), SibSp (0 to 5), Parch (0 to 5)
random_passenger = {
    'Pclass': np.random.randint(1, 4),     # Ticket class: 1st, 2nd, or 3rd class
    'Sex': np.random.randint(0, 2),        # Sex: 0 = male, 1 = female
    'Age': np.random.uniform(1, 80),       # Age: random age between 1 and 80
    'Fare': np.random.uniform(5, 500),     # Fare: random fare between 5 and 500
    'SibSp': np.random.randint(0, 6),      # SibSp: number of siblings/spouses aboard
    'Parch': np.random.randint(0, 6)       # Parch: number of parents/children aboard
}

# Convert random passenger dictionary to a DataFrame
random_passenger_df = pd.DataFrame([random_passenger])

# 10. Predict survival for the random passenger
random_pred = model.predict(random_passenger_df)
predicted_survival = 'Survived' if random_pred[0] == 1 else 'Did not survive'

# 11. Print the generated random passenger and the prediction
print("\nRandomly generated passenger data:")
print(random_passenger_df)
print(f"\nPrediction for the random passenger: {predicted_survival}")


Randomly generated passenger data:
   Pclass  Sex        Age       Fare  SibSp  Parch
0       3    1  69.615422  273.86361      1      3

Prediction for the random passenger: Did not survive
