In [5]:
import os 
import pandas as pd

# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
gender_submission_df = pd.read_csv('gender_submission.csv')

# Display the first few rows of each dataset
print("Train Dataset:")
print(train_df.head())

print("\nTest Dataset:")
print(test_df.head())

print("\nGender Submission Dataset:")
print(gender_submission_df.head())

Train Dataset:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   Na

In [6]:
# Check the structure and summary of the train dataset
print(train_df.info())
print(train_df.describe())

# Check for missing values in train dataset
print("\nMissing Values in Train Dataset:")
print(train_df.isnull().sum())

# Repeat for test dataset
print("\nTest Dataset:")
print(test_df.info())
print(test_df.describe())

print("\nMissing Values in Test Dataset:")
print(test_df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.48659

In [7]:
# Fill missing Age values with the median age
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
test_df['Age'].fillna(test_df['Age'].median(), inplace=True)

# Fill missing Embarked values with the mode (most common value)
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)

# For Cabin, we can either drop the column or create a new feature 'Has_Cabin' indicating presence of Cabin info
train_df['Has_Cabin'] = train_df['Cabin'].apply(lambda x: 0 if pd.isnull(x) else 1)
test_df['Has_Cabin'] = test_df['Cabin'].apply(lambda x: 0 if pd.isnull(x) else 1)

# Drop the Cabin column if not needed
train_df.drop(columns=['Cabin'], inplace=True)
test_df.drop(columns=['Cabin'], inplace=True)

# Check that missing values are handled
print(train_df.isnull().sum())
print(test_df.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Has_Cabin      0
dtype: int64
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           1
Embarked       0
Has_Cabin      0
dtype: int64


In [8]:
# Create a new feature FamilySize
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

# Extract titles from names
train_df['Title'] = train_df['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())
test_df['Title'] = test_df['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())

# Map the titles to a more simplified and consistent format
title_mapping = {
    "Mr": "Mr", "Miss": "Miss", "Mrs": "Mrs", 
    "Master": "Master", "Dr": "Officer", "Rev": "Officer",
    "Col": "Officer", "Major": "Officer", "Mlle": "Miss", 
    "Countess": "Royalty", "Ms": "Miss", "Lady": "Royalty", 
    "Jonkheer": "Royalty", "Don": "Royalty", "Dona": "Royalty", 
    "Mme": "Mrs", "Capt": "Officer", "Sir": "Royalty"
}
train_df['Title'] = train_df['Title'].map(title_mapping)
test_df['Title'] = test_df['Title'].map(title_mapping)

In [9]:
# Encode categorical features
train_df = pd.get_dummies(train_df, columns=['Sex', 'Embarked', 'Title'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Sex', 'Embarked', 'Title'], drop_first=True)

# Align columns in train and test datasets (to handle any discrepancies)
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

In [10]:
# Separate features and target variable from the training dataset
X_train = train_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Survived'])
y_train = train_df['Survived']

# For the test set, drop irrelevant columns and keep the PassengerId for submission
X_test = test_df.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [11]:
# Save the cleaned training data
train_df.to_csv('cleaned_train.csv', index=False)

# Save the cleaned test data
test_df.to_csv('cleaned_test.csv', index=False)

# Save features (X_train)
X_train.to_csv('X_train.csv', index=False)

# Save target variable (y_train)
y_train.to_csv('y_train.csv', index=False)

# Save test features (X_test)
X_test.to_csv('X_test.csv', index=False)


During the data cleaning process for the Titanic dataset, I dealt with missing values, created new features, and converted categories into numbers. Missing ages were filled in with the median age, and missing embarkation points were filled in with the most common one. Since many cabin numbers were missing, I created a new feature to indicate whether a cabin number was listed and then removed the original cabin column. I also made a new feature for family size by combining existing columns and pulled out titles like "Mr." or "Mrs." from the name column. Finally, I turned categories like gender and embarkation point into numbers and made sure the training and test data had the same format for modeling.