In [None]:
# Download the Titanic dataset through kagglehub and report where it landed.
import kagglehub

DATASET_HANDLE = "yasserh/titanic-dataset"

# `path` is reused by the following cell to list and copy the downloaded files.
path = kagglehub.dataset_download(DATASET_HANDLE)
print(f"Dataset downloaded to: {path}")

In [None]:
import os
import shutil

# Locate the CSV inside the downloaded dataset directory and copy it into the
# project's raw-data folder.
files = os.listdir(path)
print(f"Files in the dataset directory: {files}")

# os.listdir returns entries in arbitrary order, so sort the candidates to make
# the choice deterministic, and fail with a clear message when no CSV exists
# instead of an opaque IndexError from `[...][0]`.
csv_candidates = sorted(file for file in files if file.endswith('.csv'))
if not csv_candidates:
    raise FileNotFoundError(f"No CSV file found in dataset directory: {path}")
csv_file = csv_candidates[0]

DATA_DIR = '../data/raw'

# Create the destination folder if it is not there yet.
os.makedirs(DATA_DIR, exist_ok=True)

source_csv_path = os.path.join(path, csv_file)
print(f"Source CSV file path: {source_csv_path}\n")

# `target_csv_path` is read by the next cell to load the data.
target_csv_path = os.path.join(DATA_DIR, csv_file)
print(f"CSV file path: {target_csv_path}")

shutil.copy(source_csv_path, target_csv_path)


In [None]:
import pandas as pd
# Load the raw CSV from the path built in the previous cell.
data = pd.read_csv(target_csv_path)
# DataFrame.info() writes its summary to stdout and returns None, so call it
# directly; wrapping it in print() emitted an extra "None" line.
data.info()

In [None]:
# Handle missing values
print(data.isna().sum())  # Per-column count of missing values before imputation

# Assign the filled Series back to the column. Calling
# `data[col].fillna(..., inplace=True)` operates on an intermediate object:
# pandas 2.x emits a FutureWarning for it, and with Copy-on-Write enabled the
# original DataFrame is left unchanged.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])  # mode = most frequent value
data['Age'] = data['Age'].fillna(data['Age'].median())  # median age


In [None]:
# Encode categorical variables as integers and remove the free-text columns.
categorical_cols = data.select_dtypes(include='object').columns
print(f"Categorical columns: {categorical_cols.tolist()}")

sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

data['Sex'] = data['Sex'].map(sex_codes).astype(int)
data['Embarked'] = data['Embarked'].map(embarked_codes).astype(int)

# 'Cabin' becomes a presence flag: 1 where a value is recorded, 0 where it is missing.
data['Cabin'] = data['Cabin'].notna().astype(int)

# 'Ticket' and 'Name' are dropped from the frame.
data.drop(columns=['Ticket', 'Name'], inplace=True)

In [None]:
# Feature engineering: family size, travelling-alone flag, and binned age.

# The passenger counts as one member of their own family.
data['FamilySize'] = 1 + data['SibSp'] + data['Parch']

# IsAlone is 1 unless the family size exceeds one.
has_company = data['FamilySize'].gt(1)
data['IsAlone'] = (~has_company).astype('int64')

# Bucket ages into five right-closed intervals labelled 0..4.
age_bin_edges = [0, 12, 20, 40, 60, 80]
age_bin_labels = [0, 1, 2, 3, 4]
data['AgeGroup'] = pd.cut(data['Age'], bins=age_bin_edges, labels=age_bin_labels)

print(data.head())


In [None]:
# Standardize a chosen subset of the numeric columns with StandardScaler.
from sklearn.preprocessing import StandardScaler

# Columns that get standardized; every other column is left as-is.
numeric_features_to_scale = ['Age', 'Fare', 'SibSp', 'Parch']

all_numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(f"Numerical features: {all_numeric_columns}")
print(f"Numerical features to be scaled: {numeric_features_to_scale}")

# Fit on the selected columns and write the standardized values back in place.
scaler = StandardScaler()
scaled_block = scaler.fit_transform(data[numeric_features_to_scale])
data[numeric_features_to_scale] = scaled_block

print(data.head())

In [None]:
# EDA (Exploratory Data Analysis): visualize how the columns correlate with each other.
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation of the columns currently in `data`.
correlation_matrix = data.corr()

# Annotated heatmap of the correlation matrix on an explicit 10x8 figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# Readable survival labels, used as the hue of the pairplot below.
survival_names = {0: 'Did Not Survive', 1: 'Survived'}
data["Survived_Label"] = data["Survived"].map(survival_names)

# Pairplot of the frame, colored by survival outcome.
sns.pairplot(data, hue='Survived_Label')
plt.suptitle("Pairplot of Titanic Dataset", y=1.02)
plt.show()


In [None]:
# Judging from the plots above, these columns either add nothing to the analysis
# or are redundant because other features already encode them, so drop them.
columns_to_remove = ['Survived_Label', 'SibSp', 'Parch', 'AgeGroup', 'PassengerId']
data.drop(columns=columns_to_remove, inplace=True)

# Recompute and redraw the correlation heatmap for the remaining columns.
correlation_matrix = data.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()




In [None]:
# Applying log transformation to 'Fare', 'Age', and 'FamilySize' to reduce skewness.
#
# 'Age' and 'Fare' were standardized in the scaling cell, so at this point they
# are centered on zero. Applying `log1p(x) if x > 0 else 0` to those values
# collapsed every below-average passenger to 0 and only log-scaled the rest.
# Instead, recover the original-scale values from the fitted scaler, take the
# log there, and standardize the result again so the columns stay scaled.
#
# NOTE: this cell is not idempotent -- it assumes 'Age' and 'Fare' still hold the
# output of the scaling cell. Re-run the notebook from the top rather than
# re-running this cell on its own.
import numpy as np

for column in ('Age', 'Fare'):
    position = numeric_features_to_scale.index(column)
    # Undo the standardization: raw = z * scale + mean.
    raw_values = data[column] * scaler.scale_[position] + scaler.mean_[position]
    # The clip guards against tiny negatives caused by floating-point round-off.
    logged = np.log1p(raw_values.clip(lower=0))
    # Population std (ddof=0) matches what StandardScaler uses.
    data[column] = (logged - logged.mean()) / logged.std(ddof=0)

# 'FamilySize' was never scaled, so the log applies directly.
# Non-positive values map to 0, as before.
data['FamilySize'] = np.log1p(data['FamilySize'].clip(lower=0))



In [None]:
# Save the preprocessed data
DATA_DIR = '../datasets/'
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (mirrors the raw-data cell above).
os.makedirs(DATA_DIR, exist_ok=True)
preprocessed_path = os.path.join(DATA_DIR, 'titanic_preprocessed.csv')
data.to_csv(preprocessed_path, index=False)  # index=False: do not write the row index
print(f"Preprocessed data saved to: {preprocessed_path}") # Access the preprocessed data path