In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from aquarel import load_theme


# Using the Aquarel library with some customization for the plots
theme = load_theme("boxy_dark")
theme.set_color(figure_background_color="#181818",
                plot_background_color="#242424")
theme.set_font(family="monospace", size=9)
theme.apply_transforms()
theme.apply()


df = pd.read_csv("Data/train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [24]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=["Survived"])
Y = df["Survived"]
x_train, x_test, y_train, y_test = train_test_split(X, Y, shuffle=True, test_size=0.1, random_state=42)

In [None]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


# A custom transformer to make all the extra features
class FeatureEngineering(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.age_bins = [0, 3, 12, 20, 60, 200]
        self.age_labels = ["toddler", "child", "teen", "adult", "senior"]
        self.q25 = 7.775
        self.q50 = 8.85
        self.q75 = 24.2882
        self.fare_bins = []
        self.fare_labels = ["VeryLow", "Low", "Medium", "High"]
                
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        
        # Extracting the title
        X["Title"] = X["Name"].apply(lambda x: x.split(", ")[1].split(".")[0])
        # Grouping the titles
        X["Title"] = X["Title"].apply(lambda x: x if x in ["Mr", "Miss", "Mrs", "Master"] else "Rare")
        X["IsTitleRare"] = X["Title"].apply(lambda x: 1 if x=="Rare" else 0)
        
        # Defining a new feature - age group
        X["Age"].fillna(X["Age"].median(), inplace=True)
        X["AgeGroup"] = pd.cut(X["Age"], bins=self.age_bins, labels=self.age_labels)
        
        # Person + family members on board
        X["FamilySize"] = X["SibSp"] + X["Parch"] + 1
        
        # Calculate adjusted fare (ticket price per person)
        X["Fare"].fillna(X["Fare"].median(), inplace=True)
        X["TicketCount"] = X["Ticket"].map(X["Ticket"].value_counts())
        X["AdjustedFare"] = round(X["Fare"] / X["TicketCount"], 4)
        
        # Replacing Fares = 0 with median and adding log fare
        X["AdjustedFare"].replace(0, X["AdjustedFare"].median(), inplace=True)
        X["LogFare"] = np.log(X['AdjustedFare'])
        
        # Extracting wealth from fare
        self.fare_bins = [0, self.q25, self.q50, self.q75, X["AdjustedFare"].max()]
        X["Wealth"] = pd.cut(X["AdjustedFare"], bins=self.fare_bins, labels=self.fare_labels)
        
        # Extracting deck of residance for each passanger
        X["Deck"] = X["Cabin"].str[0]
        X["Deck"] = X["Deck"].fillna("N/A")
        
        # Check if passenger has cabin
        X["HasCabin"] = X["Deck"].apply(lambda x: 0 if x=="N/A" else 1)
        
        # Check if passenger is traveling solo
        X["IsAlone"] = X["FamilySize"].apply(lambda x: 1 if x==1 else 0)

        return X

In [26]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


num_features = ['Age', 'LogFare', 'FamilySize']
cat_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'Deck', 'AgeGroup', 'Wealth']
drop_features = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin']

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# The all-in-one preprocessing pipeline
preprocessing = Pipeline([
    ('feature_engineering', FeatureEngineering()),
    ('preprocessor', ColumnTransformer([
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
        ('dropper', 'drop', drop_features)
    ]))
])

In [27]:
x_train_transformed = preprocessing.fit_transform(x_train)
x_train_transformed

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X["Age"].fillna(X["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X["Fare"].fillna(X["Fare"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

array([[-1.56203526, -0.87782487,  0.66627464, ...,  0.        ,
         0.        ,  1.        ],
       [-1.56203526, -1.45509864,  3.10598203, ...,  0.        ,
         0.        ,  1.        ],
       [ 2.42028858,  1.22562759, -0.55357905, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.88862556,  0.10262865,  0.66627464, ...,  0.        ,
         1.        ,  0.        ],
       [-1.17911951,  1.51441295,  1.27620149, ...,  0.        ,
         0.        ,  0.        ],
       [-0.64303745,  1.46768417,  0.0563478 , ...,  0.        ,
         0.        ,  0.        ]])