In [2]:
import pandas as pd
import re  
# Matches the first " Word." token in a name, e.g. " Mr." in "Braund, Mr. Owen".
_TITLE_PATTERN = re.compile(r' ([A-Za-z]+)\.')
# Titles kept as-is; every other extracted title is collapsed into "Other".
_COMMON_TITLES = ["Mr", "Miss", "Mrs", "Master"]


def _extract_title(name):
    """Return the title captured from ``name``, or "Other" when none is found."""
    # Missing names arrive as NaN/None; treat them like names without a title
    # instead of letting re.search raise TypeError.
    if not isinstance(name, str):
        return "Other"
    match = _TITLE_PATTERN.search(name)
    return match.group(1) if match else "Other"


def clean_and_engineer_features(data):
    """Clean a passenger DataFrame and add engineered features.

    The input frame is copied first, so the caller's object is not modified.

    Steps performed:
      * fill missing ``Age`` and ``Fare`` with the column median;
      * fill missing ``Embarked`` with the most frequent value (skipped when
        the column has no non-missing value at all);
      * drop ``Cabin`` if present;
      * map ``Sex`` to 1 for "male" and 0 for "female" (any other value
        becomes NaN, as ``Series.map`` does for unmapped keys);
      * one-hot encode ``Embarked`` with the first category dropped;
      * add ``FamilySize`` = ``SibSp`` + ``Parch`` + 1;
      * add ``Title`` extracted from ``Name``, restricted to
        Mr/Miss/Mrs/Master with everything else set to "Other";
      * drop ``Name``, ``Ticket`` and ``PassengerId`` if present.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Age``, ``Embarked``, ``Fare``, ``Sex``, ``SibSp``,
        ``Parch`` and ``Name``. ``Cabin``, ``Ticket`` and ``PassengerId``
        are optional.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied. ``Title`` is
        left as a string column (it is not encoded here).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    data = data.copy()

    # Impute missing values.
    data["Age"] = data["Age"].fillna(data["Age"].median())
    embarked_mode = data["Embarked"].mode()
    if not embarked_mode.empty:  # mode() is empty when every value is missing
        data["Embarked"] = data["Embarked"].fillna(embarked_mode.iloc[0])
    data["Fare"] = data["Fare"].fillna(data["Fare"].median())
    data = data.drop(columns=["Cabin"], errors='ignore')

    # Encode categorical columns.
    data["Sex"] = data["Sex"].map({"male": 1, "female": 0})
    data = pd.get_dummies(data, columns=["Embarked"], drop_first=True)

    # Engineered features.
    data["FamilySize"] = data["SibSp"] + data["Parch"] + 1
    titles = data["Name"].apply(_extract_title)
    data["Title"] = titles.where(titles.isin(_COMMON_TITLES), "Other")

    # Identifier-like columns are not used as features.
    data = data.drop(columns=["Name", "Ticket", "PassengerId"], errors='ignore')
    return data


In [4]:
from sklearn.preprocessing import StandardScaler
# Load the raw passenger data and run it through the cleaning/feature step.
titanic_file_path = "titanic_dataset.csv"
df = pd.read_csv(titanic_file_path)
df = clean_and_engineer_features(df)

# Only numeric columns are standardized; the string "Title" column is not
# selected here and therefore does not appear in df_scaled.
numeric_features = df.select_dtypes(include=["number"])
# NOTE(review): the selection includes the "Survived" target column, so the
# label is standardized together with the features. Separate it out before
# fitting a model on df_scaled -- confirm how later cells use this frame.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(numeric_features),
    columns=numeric_features.columns,
    index=numeric_features.index,  # keep rows aligned with df
)
# StandardScaler standardizes to zero mean and unit variance; values are not
# bounded to [-1, 1].
print("Feature Scaling Completed. All numeric features are now standardized (mean 0, std 1)!\n")
print(df_scaled.head())


Feature Scaling Completed. All features are now scaled from -1 to 1!

   Survived    Pclass       Sex       Age     SibSp     Parch      Fare  \
0 -0.755929  0.873482  0.755929  0.386231 -0.499470 -0.400248 -0.497413   
1  1.322876  0.873482 -1.322876  1.371370  0.616992 -0.400248 -0.512278   
2 -0.755929 -0.315819  0.755929  2.553537 -0.499470 -0.400248 -0.464100   
3 -0.755929  0.873482  0.755929 -0.204852 -0.499470 -0.400248 -0.482475   
4  1.322876  0.873482 -1.322876 -0.598908  0.616992  0.619896 -0.417492   

   FamilySize  
0   -0.553443  
1    0.105643  
2   -0.553443  
3   -0.553443  
4    0.764728  
