In [9]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Read the CSV named below into the shared DataFrame `data`,
# then show its first five rows as a quick sanity check.
file_path = 'DSB_Day1_Titanic_train.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
print(data.head(n=5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [10]:
# Report per-column null counts, fill the gaps in 'Age' and 'Fare' with each
# column's median, then report the null counts again.
missing_values_count = data.isna().sum()
print(" Missing total values before:  ", missing_values_count, sep="\n")

# The imputer learns one median per column and returns a plain array,
# which is written back over the two original columns.
imputer_age_fare = SimpleImputer(strategy='median')
age_fare_filled = imputer_age_fare.fit_transform(data[['Age', 'Fare']])
data[['Age', 'Fare']] = age_fare_filled

missing_values_count = data.isna().sum()
print(" Missing values after:  ", missing_values_count, sep="\n")

 Missing total values before:  
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
 Missing values after:  
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [11]:
# Count the nulls in 'Embarked', replace them with the column's most
# frequent value, and count again to confirm nothing is left missing.
missing_values_embarked = data['Embarked'].isna().sum()
print(" Missing of Embarked values before:  ", missing_values_embarked, sep="\n")

# SimpleImputer needs 2-D input, hence the double brackets on the selection.
imputer_embarked = SimpleImputer(strategy='most_frequent')
embarked_filled = imputer_embarked.fit_transform(data[['Embarked']])
data['Embarked'] = embarked_filled

missing_values_embarked = data['Embarked'].isna().sum()
print(" Missing of Embarked values after:  ", missing_values_embarked, sep="\n")

 Missing of Embarked values before:  
2
 Missing of Embarked values after:  
0


In [12]:
# Add a 'FamilySize' column equal to SibSp + Parch + 1
# (the extra 1 presumably counts the passenger themself).
print("Before adding 'FamilySize':", data[['SibSp', 'Parch']].head(), sep="\n")

data['FamilySize'] = 1 + data['SibSp'] + data['Parch']

print("\nAfter adding 'FamilySize':", data[['SibSp', 'Parch', 'FamilySize']].head(), sep="\n")

Before adding 'FamilySize':
   SibSp  Parch
0      1      0
1      1      0
2      0      0
3      1      0
4      0      0

After adding 'FamilySize':
   SibSp  Parch  FamilySize
0      1      0           2
1      1      0           2
2      0      0           1
3      1      0           2
4      0      0           1


In [13]:
# Derive a 'Title' column (e.g. Mr, Mrs, Miss) from the free-text 'Name' column.
print("\nBefore extracting 'Title':")
print(data[['Name']].head())

# The pattern captures the run of ASCII letters that sits between a space and
# a literal '.'. It is written as a raw string so the backslash reaches the
# regex engine untouched: in a normal string literal '\.' is an invalid escape
# sequence, which triggers a DeprecationWarning (SyntaxWarning on Python 3.12+).
# expand=False returns a Series rather than a one-column DataFrame; names that
# do not match the pattern yield NaN.
data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

print("\nAfter extracting 'Title':")
print(data[['Name', 'Title']].head())


Before extracting 'Title':
                                                Name
0                            Braund, Mr. Owen Harris
1  Cumings, Mrs. John Bradley (Florence Briggs Th...
2                             Heikkinen, Miss. Laina
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                           Allen, Mr. William Henry

After extracting 'Title':
                                                Name Title
0                            Braund, Mr. Owen Harris    Mr
1  Cumings, Mrs. John Bradley (Florence Briggs Th...   Mrs
2                             Heikkinen, Miss. Laina  Miss
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)   Mrs
4                           Allen, Mr. William Henry    Mr


In [14]:
# Bucket 'Age' into three labelled categories. With right=False every
# interval is closed on the left: [0, 12) -> Child, [12, 60) -> Adult,
# [60, inf) -> Senior.
bins = [0, 12, 60, np.inf]
labels = ['Child', 'Adult', 'Senior']

print("\nBefore grouping 'Age' into categories:", data[['Age']].head(), sep="\n")

data['AgeGroup'] = pd.cut(x=data['Age'], bins=bins, labels=labels, right=False)

print("\nAfter grouping 'Age' into categories:", data[['Age', 'AgeGroup']].head(), sep="\n")


Before grouping 'Age' into categories:
    Age
0  22.0
1  38.0
2  26.0
3  35.0
4  35.0

After grouping 'Age' into categories:
    Age AgeGroup
0  22.0    Adult
1  38.0    Adult
2  26.0    Adult
3  35.0    Adult
4  35.0    Adult


In [15]:
# Replace the string categories in 'Sex' and 'Embarked' with integer codes.
# Each column gets its own encoder, which stays available afterwards for
# mapping the codes back to the original labels.
print("\nBefore label encoding 'Sex' and 'Embarked':", data[['Sex', 'Embarked']].head(), sep="\n")

le_sex = LabelEncoder()
le_embarked = LabelEncoder()

sex_codes = le_sex.fit_transform(data['Sex'])
data['Sex'] = sex_codes
embarked_codes = le_embarked.fit_transform(data['Embarked'])
data['Embarked'] = embarked_codes

print("\nAfter label encoding 'Sex' and 'Embarked':", data[['Sex', 'Embarked']].head(), sep="\n")


Before label encoding 'Sex' and 'Embarked':
      Sex Embarked
0    male        S
1  female        C
2  female        S
3  female        S
4    male        S

After label encoding 'Sex' and 'Embarked':
   Sex  Embarked
0    1         2
1    0         0
2    0         2
3    0         2
4    1         2


In [16]:
# Final check: print the first five rows of the transformed frame so the
# imputed, derived and encoded columns can be inspected together.
print(data.head(n=5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    1  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0  38.0      1      0   
2                             Heikkinen, Miss. Laina    0  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    0  35.0      1      0   
4                           Allen, Mr. William Henry    1  35.0      0      0   

             Ticket     Fare Cabin  Embarked  FamilySize Title AgeGroup  
0         A/5 21171   7.2500   NaN         2           2    Mr    Adult  
1          PC 17599  71.2833   C85         0           2   Mrs    Adult  
2  STON/O2. 3101282   7.9250   NaN         2           1  Miss    Adult  
