In [None]:
import pandas as pd
from pandas import DataFrame
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score

from src.constants import SEX_HEADER, SURVIVED_HEADER, SEX_ENCODED, AGE_HEADER, PCLASS_HEADER


# Read the training CSV (path is relative to the notebook's working directory)
# and preview the first rows as the cell's rich output.
titanic_training_df: DataFrame = pd.read_csv(filepath_or_buffer="../data/train.csv")
titanic_training_df.head(n=5)

In [None]:
# Work on an independent copy holding only the label and the raw model inputs,
# so the loaded training frame is never mutated.
titanic_features_df: DataFrame = titanic_training_df.loc[
    :, [SURVIVED_HEADER, SEX_HEADER, AGE_HEADER, PCLASS_HEADER]
].copy()

# Missing ages are replaced with the median of the observed ages.
training_age_median = titanic_features_df[AGE_HEADER].median()
titanic_features_df[AGE_HEADER] = titanic_features_df[AGE_HEADER].fillna(training_age_median)

# Numeric encoding of sex; any value other than 'male'/'female' becomes NaN.
sex_to_code = {'male': 0, 'female': 1}
titanic_features_df[SEX_ENCODED] = titanic_features_df[SEX_HEADER].map(sex_to_code)

titanic_features_df.head()

In [None]:
# Unconstrained decision tree: no depth or leaf-size limits are set, so it is free
# to keep splitting on the training rows.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed, ties between equally good splits can be broken
# differently from run to run, which breaks Restart & Run All reproducibility.
decision_tree_overfitted = tree.DecisionTreeClassifier(random_state=42)
decision_tree_overfitted = decision_tree_overfitted.fit(
    titanic_features_df[[SEX_ENCODED, AGE_HEADER, PCLASS_HEADER]],
    titanic_features_df[SURVIVED_HEADER]
)

In [None]:
# Label split nodes with the real column names (in the same order used for
# fitting) instead of the generic x[0], x[1], ... placeholders.
# The trailing semicolon suppresses the list-of-annotations repr so only the
# figure is shown.
tree.plot_tree(
    decision_tree_overfitted,
    feature_names=[SEX_ENCODED, AGE_HEADER, PCLASS_HEADER],
);

In [None]:
# Predict on the same feature columns of titanic_features_df and compare with
# its label column, i.e. this is a training-set (not held-out) accuracy.
overfitted_train_predictions = decision_tree_overfitted.predict(
    titanic_features_df.loc[:, [SEX_ENCODED, AGE_HEADER, PCLASS_HEADER]]
)
overfitted_train_accuracy = accuracy_score(
    y_true=titanic_features_df[SURVIVED_HEADER],
    y_pred=overfitted_train_predictions,
)

print(f"Overfitted Training accuracy: {overfitted_train_accuracy}")

In [None]:
# Regularised decision tree: depth is capped at 5, a node needs at least 20
# samples to be split, and every leaf must keep at least 10 samples.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed, ties between equally good splits can be broken
# differently from run to run, which breaks Restart & Run All reproducibility.
decision_tree_fitted = tree.DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,
)
decision_tree_fitted = decision_tree_fitted.fit(
    titanic_features_df[[SEX_ENCODED, AGE_HEADER, PCLASS_HEADER]],
    titanic_features_df[SURVIVED_HEADER]
)

In [None]:
# Label split nodes with the real column names (in the same order used for
# fitting) instead of the generic x[0], x[1], ... placeholders.
# The trailing semicolon suppresses the list-of-annotations repr so only the
# figure is shown.
tree.plot_tree(
    decision_tree_fitted,
    feature_names=[SEX_ENCODED, AGE_HEADER, PCLASS_HEADER],
);

In [None]:
# Predict on the same feature columns of titanic_features_df and compare with
# its label column, i.e. this is a training-set (not held-out) accuracy.
fitted_train_predictions = decision_tree_fitted.predict(
    titanic_features_df.loc[:, [SEX_ENCODED, AGE_HEADER, PCLASS_HEADER]]
)
fitted_train_accuracy = accuracy_score(
    y_true=titanic_features_df[SURVIVED_HEADER],
    y_pred=fitted_train_predictions,
)

print(f"Fitted Training accuracy: {fitted_train_accuracy}")

In [None]:
# Random forest of 100 trees, each limited to depth 5 with at least 20 samples
# per split and 10 samples per leaf.
# random_state pins the bootstrap sampling and per-split feature sampling so the
# reported accuracy and the exported predictions are identical on every
# Restart & Run All.
randomForestClassifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,
)
randomForestClassifier.fit(
    titanic_features_df[[SEX_ENCODED, AGE_HEADER, PCLASS_HEADER]],
    titanic_features_df[SURVIVED_HEADER],
)

In [None]:
# Visualise only the first tree of the ensemble as a representative member.
# feature_names labels split nodes with the real column names (same order as in
# the fit call) instead of the generic x[0], x[1], ... placeholders.
# The trailing semicolon suppresses the list-of-annotations repr so only the
# figure is shown.
tree.plot_tree(
    randomForestClassifier.estimators_[0],
    feature_names=[SEX_ENCODED, AGE_HEADER, PCLASS_HEADER],
);

In [None]:
# Predict on the same feature columns of titanic_features_df and compare with
# its label column, i.e. this is a training-set (not held-out) accuracy.
forest_train_predictions = randomForestClassifier.predict(
    titanic_features_df.loc[:, [SEX_ENCODED, AGE_HEADER, PCLASS_HEADER]]
)
forest_train_accuracy = accuracy_score(
    y_true=titanic_features_df[SURVIVED_HEADER],
    y_pred=forest_train_predictions,
)

print(f"Forest Training accuracy: {forest_train_accuracy}")

In [None]:
# Load the test CSV and apply the same preparation as the training features.
titanic_test_df: DataFrame = pd.read_csv("../data/test.csv")

# Missing test ages are filled with the median taken from the training feature
# frame (not from the test file), so no statistic is derived from the test data.
age_fill_value = titanic_features_df[AGE_HEADER].median()
titanic_test_df[AGE_HEADER] = titanic_test_df[AGE_HEADER].fillna(age_fill_value)

# Numeric encoding of sex; any value other than 'male'/'female' becomes NaN.
test_sex_to_code = {'male': 0, 'female': 1}
titanic_test_df[SEX_ENCODED] = titanic_test_df[SEX_HEADER].map(test_sex_to_code)

titanic_test_df.head()

In [None]:
# Predict labels for the test rows using the same three feature columns, in the
# same order, that the forest was fitted on.
random_forest_survival_prediction = randomForestClassifier.predict(
    titanic_test_df.loc[:, [SEX_ENCODED, AGE_HEADER, PCLASS_HEADER]]
)

In [None]:
# Two-column frame: the passenger id from the test data next to the forest's
# predicted label for that row.
random_forest_submission_df = titanic_test_df[['PassengerId']].assign(
    Survived=random_forest_survival_prediction
)

# index=False keeps the row index out of the written file.
random_forest_submission_df.to_csv(
    '../data/random_forest_submission.csv',
    index=False,
)

random_forest_submission_df.head()

In [None]:
# Predict labels for the test rows using the same three feature columns, in the
# same order, that the regularised tree was fitted on.
decision_tree_fitted_survival_prediction = decision_tree_fitted.predict(
    titanic_test_df.loc[:, [SEX_ENCODED, AGE_HEADER, PCLASS_HEADER]]
)


In [None]:
# Two-column frame: the passenger id from the test data next to the regularised
# tree's predicted label for that row.
decision_tree_fitted_submission_df = titanic_test_df[['PassengerId']].assign(
    Survived=decision_tree_fitted_survival_prediction
)

# index=False keeps the row index out of the written file.
decision_tree_fitted_submission_df.to_csv(
    '../data/decision_tree_fitted_submission.csv',
    index=False,
)

decision_tree_fitted_submission_df.head()