In [79]:
import kagglehub
path = kagglehub.dataset_download("yasserh/titanic-dataset")
print(f"Dataset downloaded to: {path}")

Dataset downloaded to: /home/manousos/.cache/kagglehub/datasets/yasserh/titanic-dataset/versions/1


In [80]:
import os
files = os.listdir(path)
print(f"Files in the dataset directory: {files}")       

csv_file = [file for file in files if file.endswith('.csv')][0]
csv_path = os.path.join(path, csv_file)
print(f"CSV file path: {csv_path}")

Files in the dataset directory: ['Titanic-Dataset.csv']
CSV file path: /home/manousos/.cache/kagglehub/datasets/yasserh/titanic-dataset/versions/1/Titanic-Dataset.csv


In [81]:
import pandas as pd

data = pd.read_csv(csv_path)
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [82]:
# Handle missing values
print(data.isna().sum()) # Check for missing values
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True) # Fill missing 'Embarked' with mode (most frequent value)
data['Age'].fillna(data['Age'].median(), inplace=True) # Fill missing 'Age' with median age


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True) # Fill missing 'Embarked' with mode (most frequent value)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True) # Fill missing 'Age' with median age


In [83]:
# Encode categorical variables
categorical_cols = data.select_dtypes(include='object').columns
print(f"Categorical columns: {categorical_cols.tolist()}")

data['Sex'] = data['Sex'].map({'male': 0, 'female': 1}).astype(int)
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
data['Cabin'] = data['Cabin'].notnull().astype(int) # Convert 'Cabin' to binary feature: 0 if missing, 1 if present
data.drop('Ticket', axis=1, inplace=True)
data.drop('Name', axis=1, inplace=True)

Categorical columns: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [84]:
# Feature engineering:
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
data['IsAlone'] = 1
data.loc[data['FamilySize'] > 1, 'IsAlone'] = 0
data['AgeGroup'] = pd.cut(data['Age'], bins=[0, 12, 20, 40, 60, 80], labels=[0, 1, 2, 3, 4])
print(data.head())


   PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Cabin  \
0            1         0       3    0  22.0      1      0   7.2500      0   
1            2         1       1    1  38.0      1      0  71.2833      1   
2            3         1       3    1  26.0      0      0   7.9250      0   
3            4         1       1    1  35.0      1      0  53.1000      1   
4            5         0       3    0  35.0      0      0   8.0500      0   

   Embarked  FamilySize  IsAlone AgeGroup  
0         0           2        0        2  
1         1           2        0        2  
2         0           1        1        2  
3         0           2        0        2  
4         0           1        1        2  


In [85]:
# Scale only continuous numerical features
numeric_features_to_scale = ['Age', 'Fare', 'SibSp', 'Parch']
print(f"Numerical features: {data.select_dtypes(include=['int64', 'float64']).columns.tolist()}")
print(f"Numerical features to be scaled: {numeric_features_to_scale}")

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler() # StandardScaler for standardization
data[numeric_features_to_scale] = scaler.fit_transform(data[numeric_features_to_scale])
print(data.head())

Numerical features: ['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'FamilySize', 'IsAlone']
Numerical features to be scaled: ['Age', 'Fare', 'SibSp', 'Parch']
   PassengerId  Survived  Pclass  Sex       Age     SibSp     Parch      Fare  \
0            1         0       3    0 -0.565736  0.432793 -0.473674 -0.502445   
1            2         1       1    1  0.663861  0.432793 -0.473674  0.786845   
2            3         1       3    1 -0.258337 -0.474545 -0.473674 -0.488854   
3            4         1       1    1  0.433312  0.432793 -0.473674  0.420730   
4            5         0       3    0  0.433312 -0.474545 -0.473674 -0.486337   

   Cabin  Embarked  FamilySize  IsAlone AgeGroup  
0      0         0           2        0        2  
1      1         1           2        0        2  
2      0         0           1        1        2  
3      1         0           2        0        2  
4      0         0           1        1        