# Titanic - Feature Engineering: Title

Rather than simply removing the Name column (it contains too much noisy information to use directly),
we first extract a new feature from it called Title – for example: Mr, Mrs, Miss, Master, Dr...
→ This helps the model classify passengers better, because the title reflects gender, age, and social status.

In [6]:
import os
import pandas as pd


# Read the preprocessed train/test splits from the shared data directory.
path_dir = os.path.join("..", "..", "data")
preprocessed_dir = os.path.join(path_dir, "preprocessed")
train_csv_path = os.path.join(preprocessed_dir, "preprocessed_train.csv")
test_csv_path = os.path.join(preprocessed_dir, "preprocessed_test.csv")
df = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

# --- FEATURE ENGINEERING: Extract Title from Name ---
# Pattern: a space, a run of ASCII letters (captured), then a literal dot.
# Written as a raw string because "\." is not a valid str escape sequence and
# triggers a SyntaxWarning on recent Python versions when left un-raw.
_TITLE_PATTERN = r' ([A-Za-z]+)\.'

# Raw title -> normalized title. Infrequent titles collapse into 'Rare';
# spelling variants are folded into their common form. Titles that are not
# listed here are kept unchanged.
_TITLE_REPLACEMENTS = {
    **dict.fromkeys(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
         'Sir', 'Jonkheer', 'Dona'],
        'Rare',
    ),
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
}


def extract_title(df):
    """Add a normalized ``Title`` column derived from the ``Name`` column.

    The title is the first run of letters in ``Name`` that is preceded by a
    space and followed by a dot (e.g. ``"Braund, Mr. Owen"`` -> ``"Mr"``).
    Names without such a pattern get a missing value in ``Title``.

    Args:
        df: DataFrame with a string ``Name`` column. It is modified in place:
            a ``Title`` column is added (or overwritten).

    Returns:
        The same DataFrame object that was passed in, for call chaining.

    Raises:
        KeyError: If ``df`` has no ``Name`` column.
    """
    raw_titles = df['Name'].str.extract(_TITLE_PATTERN, expand=False)

    # A single mapping-based replace handles both the 'Rare' bucket and the
    # spelling variants; no key is also a value, so the result does not
    # depend on replacement order.
    df['Title'] = raw_titles.replace(_TITLE_REPLACEMENTS)

    return df

# Derive the Title feature for the training and the test frame alike.
df, df_test = extract_title(df), extract_title(df_test)

# 'Name' has served its purpose once Title exists, so remove it from both frames.
df = df.drop(columns=['Name'])
df_test = df_test.drop(columns=['Name'])

# 'Ticket' is removed as well when present; a missing column is tolerated.
df = df.drop(columns=['Ticket'], errors='ignore')
df_test = df_test.drop(columns=['Ticket'], errors='ignore')

# --- Persist the engineered datasets ---
output_dir = os.path.join(path_dir, "feature_engineered", "title")
os.makedirs(output_dir, exist_ok=True)  # no error if the folder already exists

train_out_path = os.path.join(output_dir, "title_engineered_train.csv")
test_out_path = os.path.join(output_dir, "title_engineered_test.csv")
df.to_csv(train_out_path, index=False)
df_test.to_csv(test_out_path, index=False)


# Preview the first rows of the engineered training frame.
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,1,22.0,1,0,7.25,2,Mr
1,2,1,1,0,38.0,1,0,71.2833,0,Mrs
2,3,1,3,0,26.0,0,0,7.925,2,Miss
3,4,1,1,0,35.0,1,0,53.1,2,Mrs
4,5,0,3,1,35.0,0,0,8.05,2,Mr
