In [83]:
# Import needed libraries and load cleaned data for feature engineering
import sys

# Make the project's notebooks folder importable so `auto_imports` resolves.
# sys.path.append() returns None, so the path is kept in its own variable
# rather than assigning the call's result; the membership check keeps
# re-running this cell from adding duplicate entries.
notebook_path = r'e:\Data science\Titanic dataset\notebooks'
if notebook_path not in sys.path:
    sys.path.append(notebook_path)

from auto_imports import *  # project helper; presumably provides pd and np — TODO confirm

# Raw string: the Windows path is full of backslashes that a normal string
# literal would treat as (invalid) escape sequences.
df = pd.read_csv(r'E:\Data science\Titanic dataset\data\Processed data\Data Analysis\cleaned data.csv')

### Feature engineering

This notebook extracts new features from the Titanic dataset, such as splitting the Cabin, Name, and Ticket columns into more informative components.

In [84]:
# Show first rows of dataframe to understand the structure
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ind
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,C23 C25 C27,S,train
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,C23 C25 C27,S,train
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,C23 C25 C27,S,train


#### Cabin Feature Engineering

Split the Cabin column into two new features: the number of cabin cells listed for the passenger (`cell_number_count`) and the cabin letter(s) (`Cabin_letter`).

In [85]:
# Show Cabin column values for inspection
df['Cabin']

0       C23 C25 C27
1               C85
2       C23 C25 C27
3              C123
4       C23 C25 C27
           ...     
1304    C23 C25 C27
1305           C105
1306    C23 C25 C27
1307    C23 C25 C27
1308    C23 C25 C27
Name: Cabin, Length: 1309, dtype: object

In [89]:
# Collapse repeated characters in a cabin string, keeping only first occurrences
def remove_duplicate_letters(text):
    """Return ``text`` with every repeated character dropped.

    Only the first occurrence of each character (spaces included) is kept,
    in the original order. The result is then stripped of surrounding
    whitespace and any remaining space becomes a hyphen, e.g.
    ``'C C C'`` -> ``'C'`` and ``'F G'`` -> ``'F-G'``.
    """
    # A dict keeps insertion order, so its keys are the distinct characters
    # in order of first appearance.
    unique_chars = ''.join(dict.fromkeys(text))
    return unique_chars.strip().replace(' ', '-')

# Derive two features from Cabin: how many cabin numbers are listed, and the cabin letter(s).
# Dropping the letters leaves only the numbers (e.g. 'C23 C25 C27' -> '23 25 27');
# counting the whitespace-separated pieces gives the number of cabins on the ticket.
cell_number_count = (
    df['Cabin']
    .replace(r'[a-zA-Z]', '', regex=True)
    .str.strip()
    .str.split()
    .str.len()
)
df['cell_number_count'] = cell_number_count
print(cell_number_count.head())

# Dropping the digits can leave repeated letters (e.g. 'B B B B'), so
# remove_duplicate_letters collapses them to a single letter per distinct value.
df['Cabin_letter'] = (
    df['Cabin']
    .replace(r'[0-9]', '', regex=True)
    .apply(remove_duplicate_letters)
)

0    3
1    1
2    3
3    1
4    3
Name: Cabin, dtype: int64


In [91]:
# Show unique cabin letters to check extraction
df['Cabin_letter'].unique() 

array(['C', 'E', 'G', 'D', 'A', 'B', 'F', 'F-G', 'F-E', 'T'], dtype=object)

In [93]:
# Show the distinct values of cell_number_count (how many cabin numbers each passenger has) to check extraction
df['cell_number_count'].unique()

array([3, 1, 2, 0, 4])

In [95]:
# Count missing values in the cell_number_count column
df['cell_number_count'].isna().sum()

np.int64(0)

In [96]:
# Cabin has been replaced by cell_number_count and Cabin_letter, so remove the original column
df.drop(columns='Cabin', inplace=True)

In [97]:
# Show first rows after cabin processing
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,ind,cell_number_count,Cabin_letter
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,train,3,C
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,train,1,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,train,3,C
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,train,1,C
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,train,3,C


#### Name Feature Engineering

Extract family name and title from the Name column, then drop the original Name column.

In [98]:
# Show first name value for inspection
df['Name'][0]

'Braund, Mr. Owen Harris'

- Names follow the pattern `Family name, Title. Given names`: the family name is the text before the comma, and the title is the text between the comma and the first period.
- Check for names with parentheses, which may indicate maiden names or nicknames.

#### Title Extraction

Extract titles from the Name column for further analysis.

In [102]:
# Pull the title out of each name and show how often each one occurs.
# Splitting on ',' or '.' turns 'Braund, Mr. Owen Harris' into
# ['Braund', ' Mr', ' Owen Harris']; the piece at position 1 is the title.
titles = (
    df['Name']
    .str.split(r'[,.]')
    .str[1]
    .str.strip()
)
titles.value_counts()

Name
Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Major             2
Mlle              2
Ms                2
Mme               1
Don               1
Sir               1
Lady              1
Capt              1
the Countess      1
Jonkheer          1
Dona              1
Name: count, dtype: int64

In [109]:
# There are many titles, so fold the rare ones into a single 'Others' category.
# The cutoff is the count of the least frequent title we want to keep:
# 'Master' occurs 61 times, so every title seen fewer than 61 times is grouped.
MIN_TITLE_COUNT = 61
title_counts = titles.value_counts()  # computed once and reused for the filter
others = title_counts[title_counts < MIN_TITLE_COUNT]
titles = titles.replace(others.index, 'Others')  # replace every title below the cutoff

titles.value_counts()

Name
Mr        757
Miss      260
Mrs       197
Master     61
Others     34
Name: count, dtype: int64

In [None]:
# Extract Title from Name
df['Title'] = titles

#### Extracting family size and whether the passenger is alone

In [None]:
# Family size is the sum of Parch and SibSp; the passenger is not counted,
# so a value of 0 means the passenger has no relatives recorded.
familysize = df['Parch'].add(df['SibSp'])
df['FamilySize'] = familysize

# Flag passengers with no recorded relatives: 1 = alone, 0 = not alone
df['Is Alone'] = df['FamilySize'].eq(0).astype(int)

In [None]:
# Title has been extracted, so the original Name column is no longer needed
df.drop(columns='Name', inplace=True)

In [None]:
# Show first rows after name processing
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,ind,cell_number,Cabin_letter,Title,FamilySize,Is Alone
0,1,0.0,3,male,22.0,1,0,A/5 21171,7.25,S,train,23-25-27,C,Mr,1,0
1,2,1.0,1,female,38.0,1,0,PC 17599,71.2833,C,train,85,C,Mrs,1,0
2,3,1.0,3,female,26.0,0,0,STON/O2. 3101282,7.925,S,train,23-25-27,C,Miss,0,1
3,4,1.0,1,female,35.0,1,0,113803,53.1,S,train,123,C,Mrs,1,0
4,5,0.0,3,male,35.0,0,0,373450,8.05,S,train,23-25-27,C,Mr,0,1


#### Ticket Feature Engineering

Extract special agent/route, ticket batch, and shared ticket indicators from the Ticket column.

In [None]:
# Split Ticket column for further processing
df['Ticket'].str.split()

0                [A/5, 21171]
1                 [PC, 17599]
2         [STON/O2., 3101282]
3                    [113803]
4                    [373450]
                ...          
1304             [A.5., 3236]
1305              [PC, 17758]
1306    [SOTON/O.Q., 3101262]
1307                 [359309]
1308                   [2668]
Name: Ticket, Length: 1309, dtype: object

In [None]:
# Extract special agent/route from Ticket.
# The pattern captures a leading run of letters, '/' and '.', plus any digits
# directly after it (e.g. 'A/5 21171' -> 'A/5'). Tickets that start with a
# digit have no such prefix and are labelled 'Non-special'. Dots are then
# stripped so that e.g. 'STON/O2.' becomes 'STON/O2'.
# NOTE(review): case and '/' are not normalised, so variants that look like the
# same prefix stay as separate categories — confirm whether that is intended.
clean_agents = df['Ticket'].str.extract(r'^([A-Za-z/.]+\d*)', expand=False).fillna('Non-special').replace('[.]' , '' , regex=True)

In [None]:
# Show unique special agent/route values
clean_agents.unique()

# NOTE(review): the prefixes are extracted, but several spellings that look like the same route remain separate (e.g. 'SC/Paris' vs 'SC/PARIS', 'A/5' vs 'A5', 'A/4' vs 'A4') — consider normalising before modelling

array(['A/5', 'PC', 'STON/O2', 'Non-special', 'PP', 'CA', 'SC/Paris',
       'SC/A4', 'A/4', 'SP', 'SOC', 'SO/C', 'W/C', 'SOTON/OQ', 'WEP',
       'STON/O', 'A4', 'C', 'SC/PARIS', 'SOP', 'A5', 'Fa', 'LINE', 'FCC',
       'SW/PP', 'SCO/W', 'P/PP', 'SC', 'SC/AH', 'A/S', 'WE/P', 'SO/PP',
       'FC', 'SOTON/O2', 'CA/SOTON', 'SC/A3', 'STON/OQ', 'AQ/4', 'A',
       'LP', 'AQ/3'], dtype=object)

In [None]:
# Add special agent/route column
df['Special Agent/route'] = clean_agents 

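Create a TicketBatch column to indicate early or late ticket batches based on the length of the ticket number: tickets whose number has fewer than 5 characters are treated as early, all others as late.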

In [None]:
# Take the last whitespace-separated token of each ticket as its number part,
# which will be used for batch classification
ticket_digits = (
    df['Ticket']
    .str.split()
    .str[-1]
)
# Distinct token lengths, to see which lengths occur
ticket_digits.str.len().unique()

array([5, 7, 6, 4, 3, 1])

In [None]:
# Show ticket digits with length 1 for correction
ticket_digits[ticket_digits.str.len() == 1]

772     3
841     3
1077    2
1193    2
Name: Ticket, dtype: object

In [None]:
# Get the full Ticket strings for the rows whose extracted token has length 1.
# A boolean mask with .loc selects by index label; the previous .iloc lookup
# was handed index labels and was only correct while df had a default
# 0..n-1 index.
corrected_values = df.loc[ticket_digits.str.len() == 1, 'Ticket'].values
corrected_values

array(['S.O./P.P. 3', 'S.O./P.P. 3', 'S.O./P.P. 2', 'S.O./P.P. 2'],
      dtype=object)

In [None]:
# Overwrite the single-character tokens with the full ticket strings collected above
ticket_digits.loc[ticket_digits.str.len() == 1] = corrected_values

In [None]:
# Check unique lengths after correction
ticket_digits.str.len().unique()

array([ 5,  7,  6,  4,  3, 11])

In [None]:
# Show unique ticket digits
ticket_digits.unique()

array(['21171', '17599', '3101282', '113803', '373450', '330877', '17463',
       '349909', '347742', '237736', '9549', '113783', '2151', '347082',
       '350406', '248706', '382652', '244373', '345763', '2649', '239865',
       '248698', '330923', '113788', '347077', '2631', '19950', '330959',
       '349216', '17601', '17569', '335677', '24579', '17604', '113789',
       '2677', '2152', '345764', '2651', '7546', '11668', '349253',
       '2123', '330958', '23567', '370371', '14311', '2662', '349237',
       '3101295', '39886', '17572', '2926', '113509', '19947', '31026',
       '2697', '34651', '2144', '2669', '113572', '36973', '347088',
       '17605', '2661', '29395', '3464', '3101281', '315151', '33111',
       '14879', '2680', '1601', '348123', '349208', '374746', '248738',
       '364516', '345767', '345779', '330932', '113059', '14885',
       '3101278', '6608', '392086', '343275', '343276', '347466', '5734',
       '2315', '364500', '374910', '17754', '17759', '231919', '244

In [None]:
# Create TicketBatch column: 1 (early) when the ticket token has fewer than 5 characters, otherwise 0 (late)
df['TicketBatch'] = (ticket_digits.str.len() < 5).astype(int)

1: Early<br>
0: Late

Create a binary (0/1) column called SharedTicket to indicate whether a ticket is shared by more than one passenger.

In [None]:
# Create SharedTicket column: 1 if the ticket appears on more than one row, else 0.
# keep=False flags every holder of a repeated ticket; the default keep='first'
# would leave the first passenger of each shared ticket marked as 0.
df['SharedTicket'] = np.where(df['Ticket'].duplicated(keep=False) , 1 , 0)

In [None]:
# Extract the first run of consecutive digits found in Ticket (not used further).
# Despite the variable name this is not a single digit, and for prefixed tickets
# it can come from the prefix itself (e.g. 'A/5 21171' gives '5').
first_digit_ticket = df['Ticket'].str.extract(r'([0-9]+)')[0]
first_digit_ticket

0             5
1         17599
2             2
3        113803
4        373450
         ...   
1304          5
1305      17758
1306    3101262
1307     359309
1308       2668
Name: 0, Length: 1309, dtype: object

In [None]:
# Show first 5 Ticket values for inspection
df['Ticket'].head()

0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
Name: Ticket, dtype: object

In [None]:
# All ticket-based features have been derived, so remove the original Ticket column
df.drop(columns='Ticket', inplace=True)

In [None]:
# Show random sample of dataframe to inspect new features
df.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,ind,cell_number,Cabin_letter,Title,FamilySize,Is Alone,Special Agent/route,TicketBatch,SharedTicket
608,609,1.0,2,female,22.0,1,2,41.5792,C,train,23-25-27,C,Mrs,3,0,SC/Paris,1,1
746,747,0.0,3,male,16.0,1,1,20.25,S,train,23-25-27,C,Mr,2,0,CA,1,1
556,557,1.0,1,female,48.0,1,0,39.6,C,train,16,A,Lady,1,0,Non-special,0,0
666,667,0.0,2,male,25.0,0,0,13.0,S,train,23-25-27,C,Mr,0,1,Non-special,0,0
81,82,1.0,3,male,29.0,0,0,9.5,S,train,23-25-27,C,Mr,0,1,Non-special,0,0
219,220,0.0,2,male,30.0,0,0,10.5,S,train,23-25-27,C,Mr,0,1,W/C,0,0
1042,1043,,3,male,29.881138,0,0,7.8958,C,test,23-25-27,C,Mr,0,1,Non-special,0,0
1022,1023,,1,male,53.0,0,0,28.5,C,test,51,C,Col,0,1,Non-special,0,0
944,945,,1,female,28.0,3,2,263.0,S,test,23-25-27,C,Miss,5,0,Non-special,0,1
586,587,0.0,2,male,47.0,0,0,15.0,S,train,23-25-27,C,Mr,0,1,Non-special,0,0


In [None]:
# Show dataframe info to check new columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PassengerId          1309 non-null   int64  
 1   Survived             891 non-null    float64
 2   Pclass               1309 non-null   int64  
 3   Sex                  1309 non-null   object 
 4   Age                  1309 non-null   float64
 5   SibSp                1309 non-null   int64  
 6   Parch                1309 non-null   int64  
 7   Fare                 1309 non-null   float64
 8   Embarked             1307 non-null   object 
 9   ind                  1309 non-null   object 
 10  cell_number          1309 non-null   object 
 11  Cabin_letter         1309 non-null   object 
 12  Title                1309 non-null   object 
 13  FamilySize           1309 non-null   int64  
 14  Is Alone             1309 non-null   int64  
 15  Special Agent/route  1309 non-null   o

In [None]:
# Save the processed dataframe with new features.
# Raw string: the Windows path is full of backslashes that a normal string
# literal would treat as (invalid) escape sequences.
# index=False keeps the row index out of the file.
df.to_csv(r'E:\Data science\Titanic dataset\data\Processed data\Data Analysis\processed_data.csv' , index=False)

#### Summary

- Cabin, Name, and Ticket columns have been split into more informative features.
- The processed data is now ready for further analysis or modeling.