In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Load the datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
gender_submission = pd.read_csv('gender_submission.csv')


In [60]:
# Inspect columns
print("Train columns:", train.columns)
print("Test columns:", test.columns)


Train columns: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Test columns: Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [61]:
# Define feature and target variable for train dataset
X_train = train.drop(columns=['Survived'])
y_train = train['Survived']


In [62]:
# For test dataset
X_test = test.copy()


In [63]:
# Custom transformer for feature engineering
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Create new features
        X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
        X['IsAlone'] = np.where(X['FamilySize'] == 1, 1, 0)
        X['Title'] = X['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
        X['Title'] = X['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
                                         'Don', 'Dr', 'Major', 'Rev', 'Sir',
                                         'Jonkheer', 'Dona'], 'Rare')
        X['Title'] = X['Title'].replace('Mlle', 'Miss')
        X['Title'] = X['Title'].replace('Ms', 'Miss')
        X['Title'] = X['Title'].replace('Mme', 'Mrs')
        
        # Drop columns not needed anymore
        X = X.drop(columns=['Name'])
        
        return X

In [64]:
# Handle missing values and encode categorical features
numeric_features = ['Age', 'Fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ['Embarked', 'Sex', 'Pclass', 'Title']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [65]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [66]:
# Create preprocessing and feature engineering pipeline
pipeline = Pipeline(steps=[
    ('feature_engineering', FeatureEngineer()),
    ('preprocessor', preprocessor)
])

# Fit and transform the data
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

# Convert the preprocessed data back to DataFrame for better readability
X_train = pd.DataFrame(X_train, columns=pipeline.named_steps['preprocessor'].get_feature_names_out())
X_test = pd.DataFrame(X_test, columns=pipeline.named_steps['preprocessor'].get_feature_names_out())

print(X_train.head())
print(X_test.head())

   num__Age  num__Fare  cat__Embarked_C  cat__Embarked_Q  cat__Embarked_S  \
0 -0.565736  -0.502445              0.0              0.0              1.0   
1  0.663861   0.786845              1.0              0.0              0.0   
2 -0.258337  -0.488854              0.0              0.0              1.0   
3  0.433312   0.420730              0.0              0.0              1.0   
4  0.433312  -0.486337              0.0              0.0              1.0   

   cat__Sex_female  cat__Sex_male  cat__Pclass_1  cat__Pclass_2  \
0              0.0            1.0            0.0            0.0   
1              1.0            0.0            1.0            0.0   
2              1.0            0.0            0.0            0.0   
3              1.0            0.0            1.0            0.0   
4              0.0            1.0            0.0            0.0   

   cat__Pclass_3  cat__Title_Master  cat__Title_Miss  cat__Title_Mr  \
0            1.0                0.0              0.0           

In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Load the datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
gender_submission = pd.read_csv('gender_submission.csv')

# Inspect columns
print("Train columns:", train.columns)
print("Test columns:", test.columns)

# Define feature and target variable for train dataset
X_train = train.drop(columns=['Survived'])
y_train = train['Survived']

# For test dataset
X_test = test.copy()

# Custom transformer for feature engineering
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Create new features
        X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
        X['IsAlone'] = np.where(X['FamilySize'] == 1, 1, 0)
        X['Title'] = X['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
        X['Title'] = X['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
                                         'Don', 'Dr', 'Major', 'Rev', 'Sir',
                                         'Jonkheer', 'Dona'], 'Rare')
        X['Title'] = X['Title'].replace('Mlle', 'Miss')
        X['Title'] = X['Title'].replace('Ms', 'Miss')
        X['Title'] = X['Title'].replace('Mme', 'Mrs')
        
        # Drop columns not needed anymore
        X = X.drop(columns=['Name'])
        
        return X

# Handle missing values and encode categorical features
numeric_features = ['Age', 'Fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ['Embarked', 'Sex', 'Pclass', 'Title']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Create preprocessing and feature engineering pipeline
pipeline = Pipeline(steps=[
    ('feature_engineering', FeatureEngineer()),
    ('preprocessor', preprocessor)
])

# Fit and transform the data
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

# Convert the preprocessed data back to DataFrame for better readability
X_train = pd.DataFrame(X_train, columns=pipeline.named_steps['preprocessor'].get_feature_names_out())
X_test = pd.DataFrame(X_test, columns=pipeline.named_steps['preprocessor'].get_feature_names_out())

print(X_train.head())
print(X_test.head())

# The data is now ready for predictive modeling


Train columns: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Test columns: Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
   num__Age  num__Fare  cat__Embarked_C  cat__Embarked_Q  cat__Embarked_S  \
0 -0.565736  -0.502445              0.0              0.0              1.0   
1  0.663861   0.786845              1.0              0.0              0.0   
2 -0.258337  -0.488854              0.0              0.0              1.0   
3  0.433312   0.420730              0.0              0.0              1.0   
4  0.433312  -0.486337              0.0              0.0              1.0   

   cat__Sex_female  cat__Sex_male  cat__Pclass_1  cat__Pclass_2  \
0              0.0            1.0            0.0            0.0   
1              1.0            0.0            1.0            0.0   
2            