# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Data Loading & Processing

Reading the Files

In [None]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Flag each row's origin so the combined frame can be split back into
# training and test rows later (1 = train, 0 = test).
train_df['train_test'] = 1
test_df['train_test'] = 0

# Placeholder target column for the test rows (to be predicted).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
test_df['Survived'] = np.nan

# Stack the training and test sets into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating each input's row labels.
all_data = pd.concat([train_df, test_df], ignore_index=True)

Training Data Analysis

In [None]:
# Size of the training frame as (rows, columns)
train_df.shape

In [None]:
# Summary statistics produced by describe() for the training frame
train_df.describe()

In [None]:
# Data type of every column in the training frame
print("Data types:\n", train_df.dtypes)

In [None]:
# Count of missing (null) values per column in the training frame
print("Missing values:\n", train_df.isnull().sum())

In [None]:
# Names of the columns that describe() summarises for the training frame
train_df.describe().columns

Separating Quantitative and Qualitative Values

In [None]:
# Column groups used for the exploratory analysis
quantitative_cols = ['Age', 'SibSp', 'Parch', 'Fare']
qualitative_cols = ['Survived', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']

# Quantitative and qualitative views of the training frame
Qn_val = train_df[quantitative_cols]
Ql_val = train_df[qualitative_cols]

# Handling Missing Data

In [None]:
# Impute missing values in train_df itself (Qn_val / Ql_val are not modified).
# Each column is reassigned instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas 2.x and
# does not update the parent frame when Copy-on-Write is enabled.

# Numeric columns: fill gaps with the column mean
numerical_features = train_df.select_dtypes(include=[np.number]).columns
for feature in numerical_features:
    train_df[feature] = train_df[feature].fillna(train_df[feature].mean())

# Object (string) columns: fill gaps with the most frequent value
categorical_features = train_df.select_dtypes(include=[object]).columns
for feature in categorical_features:
    train_df[feature] = train_df[feature].fillna(train_df[feature].mode()[0])

# Verify that no missing data remains
print("Missing values after handling:\n", train_df.isnull().sum())

# EDA

Numerical data distributions

In [None]:
# One histogram per quantitative column.
# Qn_val was sliced from train_df before the imputation step, so it can still
# contain NaN; drop them so the bin range is always computed from finite values.
for column in Qn_val.columns:
    plt.hist(Qn_val[column].dropna())
    plt.title(column)
    plt.show()

Data Correlation

In [None]:
# Pairwise correlation of the quantitative columns, computed once and reused
# for both the printed table and the heatmap
corr_matrix = Qn_val.corr()
print(corr_matrix)
sns.heatmap(corr_matrix, cmap='YlGn')

Survival Rate Analysis

In [None]:
# Pivot tables for Survived vs Pclass, Sex, and Embarked, counting survivors only
survivors = train_df[train_df['Survived'] == 1]
survived_pclass = pd.pivot_table(survivors, index='Pclass', aggfunc='size')
survived_sex = pd.pivot_table(survivors, index='Sex', aggfunc='size')
survived_embarked = pd.pivot_table(survivors, index='Embarked', aggfunc='size')

# Compare Age, SibSp, Parch, and Fare across the two survival outcomes.
# This is kept as the last expression of the cell so the table is rendered;
# placed first, its result was computed and silently discarded.
pd.pivot_table(train_df, index='Survived', values=['Age', 'SibSp', 'Parch', 'Fare'])

In [None]:
# Bar charts of survivor counts, one panel per categorical variable
fig, axes = plt.subplots(1, 3, figsize=(12, 5))

# (counts, bar colour, panel title, x-axis label) for each panel, left to right
panels = [
    (survived_pclass, 'purple', 'Survival Count by Passenger Class', 'Passenger Class'),
    (survived_sex, 'blue', 'Survival Count by Gender', 'Gender'),
    (survived_embarked, 'green', 'Survival Count by Embarked', 'Embarked'),
]

for ax, (counts, colour, title, x_label) in zip(axes, panels):
    counts.plot(kind='bar', color=colour, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Feature Engineering

In [None]:
# Add the family-based features to both the training frame and the combined frame
for frame in (train_df, all_data):
    # Family size: SibSp + Parch, plus one for the passenger
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

    # Boolean flag that is True when the family size is exactly 1
    frame['is_alone'] = (frame['FamilySize'] == 1).to_numpy()

# Categorizing Family Size
def categorize_family_size(size):
    """Map a family size to a coarse label.

    Returns 'Alone' when size is exactly 1, 'Small' when size is at most 4
    (and not 1), and 'Large' otherwise.
    """
    if size == 1:
        return 'Alone'
    return 'Small' if size <= 4 else 'Large'

# Label every passenger's family as Alone / Small / Large in both frames
for frame in (train_df, all_data):
    frame['family_size_category'] = frame['FamilySize'].apply(categorize_family_size)

# Fill missing Age and Fare in the combined frame with medians from train_df
all_data['Age'] = all_data['Age'].fillna(train_df['Age'].median())
all_data['Fare'] = all_data['Fare'].fillna(train_df['Fare'].median())

# Discard rows that have no 'Embarked' value
all_data.dropna(subset=['Embarked'], inplace=True)

# Cast Pclass to string so pd.get_dummies() treats it as a category
all_data['Pclass'] = all_data['Pclass'].astype(str)

# One-hot encode the categorical columns among the selected features;
# 'train_test' is carried along only to split the rows afterwards
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked',
                'is_alone', 'family_size_category', 'train_test']
all_dummies = pd.get_dummies(all_data[feature_cols])

# Split back into training and test feature matrices for modelling
X_train = all_dummies[all_dummies['train_test'] == 1].drop(columns=['train_test'])
X_test = all_dummies[all_dummies['train_test'] == 0].drop(columns=['train_test'])

# Target variable for the training rows
y_train = all_data[all_data['train_test'] == 1]['Survived']

Data Scaling

In [None]:
# Standardise the numeric feature columns.
# The scaler is fitted on the training rows only and then applied to every row,
# so the mean/variance used for scaling never include test-set values.
numeric_cols = ['Age', 'SibSp', 'Parch']
scale = StandardScaler()
all_dummies_scaled = all_dummies.copy()

# Positional boolean mask selecting the training rows
train_mask = (all_dummies_scaled['train_test'] == 1).to_numpy()
scale.fit(all_dummies_scaled.loc[train_mask, numeric_cols])
all_dummies_scaled[numeric_cols] = scale.transform(all_dummies_scaled[numeric_cols])

# Scaled training / test feature matrices, without the helper flag column
X_train_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 1].drop(['train_test'], axis=1)
X_test_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 0].drop(['train_test'], axis=1)

# Target variable for the training rows
y_train = all_data[all_data.train_test == 1].Survived

# Model Building

Model Body

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split repeatable.
# NOTE(review): this splits the unscaled X_train; X_train_scaled is built in the
# scaling cell but is not used here - confirm that is intended.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=50,
)


# Evaluation function
def evaluate_model(y_test, y_pred, model_name, y_score=None):
    """Print classification metrics and plot the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted class labels.
    model_name : str
        Name printed in the report header.
    y_score : array-like, optional
        Scores or positive-class probabilities (e.g. ``predict_proba(X)[:, 1]``)
        used for ROC AUC. When omitted, ROC AUC falls back to the hard labels in
        ``y_pred``, which only evaluates a single threshold.

    Returns
    -------
    None
        Metrics are printed and the confusion-matrix heatmap is shown.
    """
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # ROC AUC is a ranking metric: prefer continuous scores when the caller has them
    roc_auc = roc_auc_score(y_test, y_pred if y_score is None else y_score)

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")

    # Draw on an explicit Axes so the heatmap never lands on a leftover figure
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()


# Evaluation

Logistic Regression

In [None]:
# The features in this split are unscaled, so the solver may hit the default
# limit of 100 iterations before converging; allow more iterations.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_split, y_train_split)

# Score the held-out validation rows
y_pred_logreg = logreg.predict(X_val_split)
evaluate_model(y_val_split, y_pred_logreg, 'Logistic Regression')

Random Forest

In [None]:
# Seed the forest so repeated runs give the same trees and the same scores;
# 50 matches the seed used for the train/validation split.
rf = RandomForestClassifier(random_state=50)
rf.fit(X_train_split, y_train_split)

# Score the held-out validation rows
y_pred_rf = rf.predict(X_val_split)
evaluate_model(y_val_split, y_pred_rf, "Random Forest")

XGBoost

In [None]:
# Fit a gradient-boosted classifier with default settings (fit returns the estimator)
xgboost = xgb.XGBClassifier().fit(X_train_split, y_train_split)

# Score the held-out validation rows
y_pred_xgboost = xgboost.predict(X_val_split)
evaluate_model(
    y_val_split,
    y_pred_xgboost,
    "XGBoost",
)

Result comparison

In [None]:
# Validation accuracy of each fitted model
logreg_accuracy = accuracy_score(y_val_split, y_pred_logreg)
rf_accuracy = accuracy_score(y_val_split, y_pred_rf)
xgboost_accuracy = accuracy_score(y_val_split, y_pred_xgboost)

# Parallel lists for plotting: model labels and their accuracies in percent
model_names = ["Logistic Regression", "Random Forest", "XGBoost"]
accuracies = [
    score * 100
    for score in (logreg_accuracy, rf_accuracy, xgboost_accuracy)
]

# Bar chart comparing the three models
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=model_names, y=accuracies, palette='viridis', ax=ax)
ax.set_title('Accuracy of Each Algorithm @ Random State = 50')
ax.set_xlabel('Algorithm')
ax.set_ylabel('Accuracy %')
# The y-axis starts at 60% so the differences between models are visible
ax.set_ylim(60, 100)
plt.show()
