In [3]:
import pandas as pd
import re  # Import regex module for safer pattern matching

# Location of the Titanic CSV; a relative path, so it is resolved against
# the current working directory
titanic_file_path = "titanic_dataset.csv"

# Read the raw passenger records into a DataFrame
df = pd.read_csv(filepath_or_buffer=titanic_file_path)

# Captures the word that sits between a space and a period, e.g. " Mr."
_TITLE_PATTERN = re.compile(r" ([A-Za-z]+)\.")

# Titles kept as their own category; everything else is bucketed as "Other"
_COMMON_TITLES = ("Mr", "Miss", "Mrs", "Master")


def _extract_title(name):
    """Return the common title found in *name*, or "Other" otherwise."""
    # Missing names arrive as NaN (a float), which the regex cannot search
    if not isinstance(name, str):
        return "Other"
    match = _TITLE_PATTERN.search(name)
    if match is None:
        return "Other"
    title = match.group(1)
    return title if title in _COMMON_TITLES else "Other"


# Function to clean and engineer features
def clean_and_engineer_features(data: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *data* with engineered feature columns.

    The input frame is not modified. The steps are:

    * fill missing ``Age`` and ``Fare`` with the column median and missing
      ``Embarked`` with the most frequent value;
    * drop ``Cabin`` (if present);
    * map ``Sex`` to 1 for ``"male"`` and 0 for ``"female"`` (any other
      value becomes NaN);
    * one-hot encode ``Embarked``, dropping the first level;
    * add ``FamilySize`` = ``SibSp`` + ``Parch`` + 1;
    * add ``Title`` parsed from ``Name``, limited to Mr/Miss/Mrs/Master
      with everything else (including missing names) set to ``"Other"``;
    * drop ``Name``, ``Ticket`` and ``PassengerId`` (if present).

    Raises:
        KeyError: if ``Age``, ``Embarked``, ``Fare``, ``Sex``, ``SibSp``,
            ``Parch`` or ``Name`` is absent from *data*.
    """
    data = data.copy()  # Work on a copy so the caller's frame is untouched

    # Fill missing numeric values with the column median
    data["Age"] = data["Age"].fillna(data["Age"].median())
    data["Fare"] = data["Fare"].fillna(data["Fare"].median())

    # Fill missing 'Embarked' with the most common value. mode() is empty
    # when every value is missing, so there is nothing to fill with then.
    embarked_modes = data["Embarked"].mode()
    if not embarked_modes.empty:
        data["Embarked"] = data["Embarked"].fillna(embarked_modes.iloc[0])

    # Drop 'Cabin' column due to too many missing values
    data = data.drop(columns=["Cabin"], errors="ignore")

    # Convert 'Sex' to numerical (Male=1, Female=0)
    data["Sex"] = data["Sex"].map({"male": 1, "female": 0})

    # Convert 'Embarked' to numerical (One-Hot Encoding)
    data = pd.get_dummies(data, columns=["Embarked"], drop_first=True)

    # Create a new feature: Family Size (SibSp + Parch + 1)
    data["FamilySize"] = data["SibSp"] + data["Parch"] + 1

    # Create a new feature: Title extracted from Name; a single regex pass
    # per row, tolerant of missing names
    data["Title"] = data["Name"].map(_extract_title)

    # Drop unnecessary columns
    data = data.drop(columns=["Name", "Ticket", "PassengerId"], errors="ignore")

    return data

# Replace the raw frame with its cleaned, feature-engineered version
df = clean_and_engineer_features(df)

# Report completion and preview the first rows of the result
print(
    "Feature Engineering Completed. All feature columns cleaned and ready!\n",
    df.head(),
    sep="\n",
)


Feature Engineering Completed. All feature columns cleaned and ready!

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked_Q  Embarked_S  \
0         0       3    1  34.5      0      0   7.8292        True       False   
1         1       3    0  47.0      1      0   7.0000       False        True   
2         0       2    1  62.0      0      0   9.6875        True       False   
3         0       3    1  27.0      0      0   8.6625       False        True   
4         1       3    0  22.0      1      1  12.2875       False        True   

   FamilySize Title  
0           1    Mr  
1           2   Mrs  
2           1    Mr  
3           1    Mr  
4           3   Mrs  
