In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the dataset
df = pd.read_csv('titanic.csv')  # Adjust filename if needed

print("Dataset shape:", df.shape)
print("\nColumn info:")
print(df.info())
print("\nMissing values:")
print(df.isnull().sum())

Dataset shape: (418, 12)

Column info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB
None

Missing values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare        

In [3]:
# Data cleaning and preprocessing

def clean_titanic_data(df):

    data = df.copy() # Creating a copy to avoid modifying the original dataset

    # Handling missing values

    # Age: Filling the 86 missing values with median age by Pclass and Sex
    age_median = data.groupby(['Pclass', 'Sex'])['Age'].median()
    for pclass in [1, 2, 3]:
        for sex in ['male', 'female']:
            mask = (data['Pclass'] == pclass) & (data['Sex'] == sex) & (data['Age'].isnull())
            if mask.sum() > 0:  # Only fill if there are missing values for this group
                median_age = age_median.get((pclass, sex))
                if pd.notna(median_age):
                    data.loc[mask, 'Age'] = median_age
                else:
                    # Fallback to overall median for this group
                    data.loc[mask, 'Age'] = data['Age'].median()

    # Fare: Filling the 1 missing value with median fare by Pclass
    if data['Fare'].isnull().sum() > 0:
        fare_median = data.groupby('Pclass')['Fare'].median()
        for pclass in [1, 2, 3]:
            mask = (data['Pclass'] == pclass) & (data['Fare'].isnull())
            if mask.sum() > 0:
                data.loc[mask, 'Fare'] = fare_median[pclass]

    # Cabin: Extracting deck information and creating binary feature for cabin availability
    # Since 327 out of 418 values are missing, we'll create features from available data
    data['HasCabin'] = data['Cabin'].notna().astype(int)

    # Extract deck letter from cabin (first character)
    data['Deck'] = data['Cabin'].str[0]
    data['Deck'].fillna('Unknown', inplace=True)



    # Feature engineering

    # Creating family size feature
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

    # Creating is_alone feature
    data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

    # Extracting the title from Name
    data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

    # Grouping rare titles
    title_mapping = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
        'Mlle': 'Miss', 'Countess': 'Rare', 'Ms': 'Miss', 'Lady': 'Rare',
        'Jonkheer': 'Rare', 'Don': 'Rare', 'Dona': 'Rare', 'Mme': 'Mrs',
        'Capt': 'Rare', 'Sir': 'Rare'
    }
    data['Title'] = data['Title'].map(title_mapping)
    data['Title'].fillna('Rare', inplace=True)

    # Creating age groups
    data['AgeGroup'] = pd.cut(data['Age'], bins=[0, 12, 18, 35, 60, 100],
                             labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])

    # Creating fare groups
    data['FareGroup'] = pd.qcut(data['Fare'], q=4, labels=['Low', 'Medium', 'High', 'Very_High'])



    # Encoding categorical variables

    # Binary encoding for Sex
    data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

    # Label encoding for Embarked
    le_embarked = LabelEncoder()
    data['Embarked'] = le_embarked.fit_transform(data['Embarked'])

    # One-hot encoding for Title
    title_dummies = pd.get_dummies(data['Title'], prefix='Title')
    data = pd.concat([data, title_dummies], axis=1)

    # One-hot encoding for AgeGroup
    age_dummies = pd.get_dummies(data['AgeGroup'], prefix='AgeGroup')
    data = pd.concat([data, age_dummies], axis=1)

    # One-hot encoding for FareGroup
    fare_dummies = pd.get_dummies(data['FareGroup'], prefix='FareGroup')
    data = pd.concat([data, fare_dummies], axis=1)

    # One-hot encoding for Deck
    deck_dummies = pd.get_dummies(data['Deck'], prefix='Deck')
    data = pd.concat([data, deck_dummies], axis=1)

    # 4. Drop unnecessary columns
    columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Title', 'AgeGroup', 'FareGroup', 'Deck']
    data = data.drop(columns=[col for col in columns_to_drop if col in data.columns])

    return data


In [4]:
# Clean data
cleaned_df = clean_titanic_data(df)

print("\nCleaned dataset shape:", cleaned_df.shape)
print("\nMissing values after cleaning:")
missing_after = cleaned_df.isnull().sum()
print(missing_after)
print(f"\nTotal missing values: {missing_after.sum()}")
print("\nFinal columns:")
print(cleaned_df.columns.tolist())

# Verify the cleaning worked for the specific missing values
print(f"\nVerification:")
print(f"Age missing before: 86, after: {cleaned_df['Age'].isnull().sum()}")
print(f"Fare missing before: 1, after: {cleaned_df['Fare'].isnull().sum()}")
print(f"Cabin missing before: 327 (now converted to HasCabin and Deck features)")



Cleaned dataset shape: (418, 33)

Missing values after cleaning:
Survived               0
Pclass                 0
Sex                    0
Age                    0
SibSp                  0
Parch                  0
Fare                   0
Embarked               0
HasCabin               0
FamilySize             0
IsAlone                0
Title_Master           0
Title_Miss             0
Title_Mr               0
Title_Mrs              0
Title_Rare             0
AgeGroup_Child         0
AgeGroup_Teen          0
AgeGroup_Adult         0
AgeGroup_Middle        0
AgeGroup_Senior        0
FareGroup_Low          0
FareGroup_Medium       0
FareGroup_High         0
FareGroup_Very_High    0
Deck_A                 0
Deck_B                 0
Deck_C                 0
Deck_D                 0
Deck_E                 0
Deck_F                 0
Deck_G                 0
Deck_Unknown           0
dtype: int64

Total missing values: 0

Final columns:
['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Deck'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Title'].fillna('Rare', inplace=True)


In [5]:
# Preparing features and target
X = cleaned_df.drop('Survived', axis=1)
y = cleaned_df['Survived']

# Spliting into train and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTrain set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Train samples: {len(X_train)} ({len(X_train)/len(X)*100:.1f}%)")
print(f"Test samples: {len(X_test)} ({len(X_test)/len(X)*100:.1f}%)")



Train set shape: (334, 32)
Test set shape: (84, 32)
Train samples: 334 (79.9%)
Test samples: 84 (20.1%)


In [6]:
# Save the cleaned datasets
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

train_data.to_csv('titanic_train_cleaned.csv', index=False)
test_data.to_csv('titanic_test_cleaned.csv', index=False)
cleaned_df.to_csv('titanic_full_cleaned.csv', index=False)

print("\nFiles saved:")
print("- titanic_train_cleaned.csv")
print("- titanic_test_cleaned.csv")
print("- titanic_full_cleaned.csv")



Files saved:
- titanic_train_cleaned.csv
- titanic_test_cleaned.csv
- titanic_full_cleaned.csv


In [7]:
# Show survival distribution
print(f"\nSurvival distribution:")
survival_counts = y.value_counts()
print(f"Not survived (0): {survival_counts[0]} ({survival_counts[0]/len(y)*100:.1f}%)")
print(f"Survived (1): {survival_counts[1]} ({survival_counts[1]/len(y)*100:.1f}%)")



Survival distribution:
Not survived (0): 266 (63.6%)
Survived (1): 152 (36.4%)


In [8]:
# Display basic statistics
print("\nDataset summary:")
print(cleaned_df.describe())


Dataset summary:
         Survived      Pclass         Sex         Age       SibSp       Parch  \
count  418.000000  418.000000  418.000000  418.000000  418.000000  418.000000   
mean     0.363636    2.265550    0.363636   29.297847    0.447368    0.392344   
std      0.481622    0.841838    0.481622   13.038856    0.896760    0.981429   
min      0.000000    1.000000    0.000000    0.170000    0.000000    0.000000   
25%      0.000000    1.000000    0.000000   22.000000    0.000000    0.000000   
50%      0.000000    3.000000    0.000000   25.000000    0.000000    0.000000   
75%      1.000000    3.000000    1.000000   36.375000    1.000000    0.000000   
max      1.000000    3.000000    1.000000   76.000000    8.000000    9.000000   

             Fare    Embarked    HasCabin  FamilySize     IsAlone  
count  418.000000  418.000000  418.000000  418.000000  418.000000  
mean    35.560845    1.401914    0.217703    1.839713    0.605263  
std     55.856972    0.854496    0.413179    1.5

In [9]:
# Correlation with survival
print("\nTop correlations with Survival:")
correlation = cleaned_df.corr()['Survived'].abs().sort_values(ascending=False)
print(correlation.head(10))


Top correlations with Survival:
Survived               1.000000
Sex                    1.000000
Title_Mr               0.877762
Title_Miss             0.638606
Title_Mrs              0.603458
IsAlone                0.244187
FareGroup_Very_High    0.192853
Fare                   0.192225
Title_Master           0.173858
FamilySize             0.161803
Name: Survived, dtype: float64
