# Library

In [23]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Data Processing

In [10]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Handle missing values
# For Age: Fill with median value
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
test_df['Age'].fillna(test_df['Age'].median(), inplace=True)

# For Embarked in training set: Fill with the most frequent value
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)

# For Fare in test set: Fill with median value of training set
test_df['Fare'].fillna(train_df['Fare'].median(), inplace=True)

# Drop columns that might not be immediately useful
drop_columns = ['Name', 'Ticket', 'Cabin']
train_df = train_df.drop(columns=drop_columns)
test_df = test_df.drop(columns=drop_columns)

# Convert categorical variables to numeric using one-hot encoding
encoder = OneHotEncoder(drop='first', sparse=False)  # Drop first to avoid dummy variable trap

# For training data
train_encoded = encoder.fit_transform(train_df[['Sex', 'Embarked']])
train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(['Sex', 'Embarked']))
train_df = pd.concat([train_df, train_encoded_df], axis=1).drop(columns=['Sex', 'Embarked'])

# For test data
test_encoded = encoder.transform(test_df[['Sex', 'Embarked']])
test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(['Sex', 'Embarked']))
test_df = pd.concat([test_df, test_encoded_df], axis=1).drop(columns=['Sex', 'Embarked'])

train_df.head()

"""

PassengerId: Identifier for each passenger.
Survived: Target variable (1 for survived, 0 for deceased).
Pclass: Ticket class.
Age: Age of the passenger.
SibSp: Number of siblings/spouses aboard.
Parch: Number of parents/children aboard.
Fare: Ticket fare.
Sex_male: Binary indicator for male gender (1 for male, 0 for female).
Embarked_Q: Binary indicator for embarkation from Queenstown.
Embarked_S: Binary indicator for embarkation from Southampton.

"""



Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,1.0,0.0,1.0
1,2,1,1,38.0,1,0,71.2833,0.0,0.0,0.0
2,3,1,3,26.0,0,0,7.925,0.0,0.0,1.0
3,4,1,1,35.0,1,0,53.1,0.0,0.0,1.0
4,5,0,3,35.0,0,0,8.05,1.0,0.0,1.0


# Model Selection and Training

In [18]:
# Splitting the data into training and validation sets
X = train_df.drop(columns=['PassengerId', 'Survived'])
y = train_df['Survived']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Training Logistic Regression
logreg = LogisticRegression(max_iter=500, random_state=42)
logreg.fit(X_train, y_train)
logreg_val_predictions = logreg.predict(X_val)
logreg_val_accuracy = accuracy_score(y_val, logreg_val_predictions)

# Training Random Forest Classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_val_predictions = rf.predict(X_val)
rf_val_accuracy = accuracy_score(y_val, rf_val_predictions)

print(f"Logistic regression accuracy: {logreg_val_accuracy}")
print(f"Random forest accuracy: {rf_val_accuracy}")


Logistic regression accuracy: 0.8100558659217877
Random forest accuracy: 0.8212290502793296


In [21]:
# Checking overfitting by comparing training and test accuracy
rf_train_predictions = rf.predict(X_train)
rf_train_accuracy = accuracy_score(y_train, rf_train_predictions)

# Getting feature importance from the Random Forest model
feature_importance = rf.feature_importances_
features_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

rf_train_accuracy, rf_val_accuracy, features_df


(0.9789325842696629,
 0.8212290502793296,
       Feature  Importance
 5    Sex_male    0.273316
 4        Fare    0.272058
 1         Age    0.252745
 0      Pclass    0.078616
 2       SibSp    0.052192
 3       Parch    0.038490
 7  Embarked_S    0.023095
 6  Embarked_Q    0.009488)

In [22]:
# Removing less important features
X_train_reduced = X_train.drop(columns=['Embarked_S', 'Embarked_Q'])
X_val_reduced = X_val.drop(columns=['Embarked_S', 'Embarked_Q'])

# Retraining the adjusted Random Forest model on the reduced dataset
rf_adjusted.fit(X_train_reduced, y_train)

# Checking performance on the reduced validation set
rf_adjusted_val_predictions_reduced = rf_adjusted.predict(X_val_reduced)
rf_adjusted_val_accuracy_reduced = accuracy_score(y_val, rf_adjusted_val_predictions_reduced)

rf_adjusted_val_accuracy_reduced


0.8044692737430168

In [19]:
# Retraining the Random Forest with adjusted hyperparameters to reduce overfitting
rf_adjusted = RandomForestClassifier(random_state=42, max_depth=10, min_samples_split=10)
rf_adjusted.fit(X_train, y_train)

# Checking performance on training and validation sets
rf_adjusted_train_predictions = rf_adjusted.predict(X_train)
rf_adjusted_val_predictions = rf_adjusted.predict(X_val)

rf_adjusted_train_accuracy = accuracy_score(y_train, rf_adjusted_train_predictions)
rf_adjusted_val_accuracy = accuracy_score(y_val, rf_adjusted_val_predictions)

rf_adjusted_train_accuracy, rf_adjusted_val_accuracy

(0.8946629213483146, 0.8212290502793296)

In [26]:

# Define the hyperparameters and their possible values
param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 15, 20]
}

# Initialize the Random Forest model
rf_grid = RandomForestClassifier(random_state=42)

# Set up the grid search
grid_search = GridSearchCV(rf_grid, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_
best_val_accuracy = grid_search.best_score_

best_params, best_val_accuracy


({'max_depth': 5, 'min_samples_split': 2}, 0.8356446370530877)

In [27]:
# Training the Random Forest model with the optimal hyperparameters
rf_optimal = RandomForestClassifier(max_depth=5, min_samples_split=2, random_state=42)
rf_optimal.fit(X_train, y_train)

# Evaluating the model on the validation set
rf_optimal_val_predictions = rf_optimal.predict(X_val)
rf_optimal_val_accuracy = accuracy_score(y_val, rf_optimal_val_predictions)

rf_optimal_val_accuracy


0.8156424581005587

In [34]:
# Making predictions on the provided test set using the Random Forest Classifier
test_data = test_df.drop(columns=['PassengerId'])
rf_test_predictions = rf.predict(test_data)

# Making predictions on the provided test set using the optimal Random Forest model
rf_optimal_test_predictions = rf_optimal.predict(test_data)

# Creating the submission dataframe
submission_optimal_df = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": rf_optimal_test_predictions
})

submission_optimal_df.head()


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [35]:
# Save the submission dataframe to a CSV file
submission_file_path = "/Users/dragonsave/Documents/Master behavior finance/Kaggle/Titanic Challenge/titanic_submission.csv"
submission_optimal_df.to_csv(submission_file_path, index=False)

submission_file_path


'/Users/dragonsave/Documents/Master behavior finance/Kaggle/Titanic Challenge/titanic_submission.csv'