# Feature Engineering

This notebook aims to work with feature from the train spaceship tatinic dataset. 

The objective is create a column pipeline with sklearn feature.

Reading the data

In [2]:
import pandas as pd
import numpy as np

data = pd.read_csv('data/spaceshit-titanic/train.csv')

X, y = data.drop('Transported', axis=1), data['Transported']


In [3]:
# Get the numeric columns
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Special fields
passenger_id_col = 'PassengerId'
cabin_col = 'Cabin'

# To drop
to_drop = ['Name']

In [4]:
X.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


Tools

To create a custom transformer:
- FunctionTransformer: Constructs a transformer from an arbitrary callable.

To apply a transformer to a specific column:
- ColumnTransformer: Applies transformers to columns of an array or pandas DataFrame.

To apply multiple transformers to the same column:
- FeatureUnion: Concatenate results of multiple transformer objects.

To create a pipeline of transformers and a final estimator:
- Pipeline: Pipeline of transforms with a final estimator.

### Tests, dev

In [5]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

one_hot = OneHotEncoder(sparse_output=False)

ordinal = OrdinalEncoder()

ex = X['HomePlanet'].loc[[6028, 6030, 6037, 6039, 6044, 6054, 6055, 6056]]

one_hot.fit_transform(ex.values.reshape(-1, 1))

ordinal.fit_transform(ex.values.reshape(-1, 1))



array([[ 1.],
       [nan],
       [ 2.],
       [ 0.],
       [ 0.],
       [nan],
       [ 0.],
       [nan]])

In [6]:
# Test the KNNImputer
from sklearn.impute import KNNImputer

knn = KNNImputer(n_neighbors=2, weights='uniform')

X_ = np.array([[ 1.],
       [np.nan],
       [ 2.],
       [ 1.],
       [ 0.],
       [np.nan],
       [ 0.],
       [np.nan]])

X_, knn.fit_transform(X_)

# Transform discrete values into numeric values

(array([[ 1.],
        [nan],
        [ 2.],
        [ 1.],
        [ 0.],
        [nan],
        [ 0.],
        [nan]]),
 array([[1. ],
        [0.8],
        [2. ],
        [1. ],
        [0. ],
        [0.8],
        [0. ],
        [0.8]]))

In [7]:
# Test the IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

interative = IterativeImputer()

X_ = np.array([[ 1.],
       [np.nan],
       [ 2.],
       [ 0.],
       [ 0.],
       [np.nan],
       [ 0.],
       [np.nan]])

X_, interative.fit_transform(X_)


(array([[ 1.],
        [nan],
        [ 2.],
        [ 0.],
        [ 0.],
        [nan],
        [ 0.],
        [nan]]),
 array([[1. ],
        [0.6],
        [2. ],
        [0. ],
        [0. ],
        [0.6],
        [0. ],
        [0.6]]))

In [8]:
# For numeric columns
# Simple imputer to median
# https://scikit-learn.org/stable/modules/impute.html#
import numpy as np
from sklearn.impute import SimpleImputer
# Pipilines
from sklearn.compose import ColumnTransformer
# Categorical columns. Try the KNN
from sklearn.impute import KNNImputer



numeric_imputer = SimpleImputer(strategy=np.median, keep_empty_features=True) #type: ignore

column_transformer = ColumnTransformer(
    [
        ('numeric_imputer', numeric_imputer, numeric_columns)
    ],
    remainder='passthrough'
)

categorical_imputer = SimpleImputer(strategy='constant', keep_empty_features=True) #type: ignore

categorical_imputer.fit_transform(X)

pd.DataFrame(categorical_imputer.fit_transform(X), columns=X.columns)[X.isnull().any(axis=1)].iloc[50:100]

# X[X['PassengerId'].str.startswith('0012')]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
222,0239_01,Mars,False,missing_value,TRAPPIST-1e,37.0,False,637.0,0.0,0.0,92.0,319.0,Diedow Resty
224,0241_01,Europa,missing_value,E/11/P,55 Cancri e,33.0,False,0.0,1249.0,0.0,4812.0,1116.0,Alas Dischod
225,0242_01,missing_value,False,F/46/S,TRAPPIST-1e,18.0,False,313.0,1.0,691.0,283.0,0.0,Almone Sté
227,0244_01,Mars,True,missing_value,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,Froos Sad
228,0244_02,Mars,True,F/47/S,55 Cancri e,14.0,missing_value,0.0,0.0,0.0,0.0,0.0,Tous Sad
233,0250_01,Earth,True,G/38/S,PSO J318.5-22,47.0,False,missing_value,0.0,0.0,0.0,0.0,Camily Kramosley
234,0251_01,missing_value,True,C/11/S,55 Cancri e,54.0,False,0.0,0.0,0.0,0.0,0.0,Diphah Amsive
239,0257_01,Mars,False,F/49/S,55 Cancri e,21.0,False,1664.0,0.0,missing_value,0.0,0.0,Errohs Berte
245,0265_01,Europa,True,D/8/S,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,missing_value,0.0,Etair Herpumble
249,0275_01,Europa,True,D/9/S,55 Cancri e,missing_value,False,0.0,0.0,0.0,0.0,0.0,Terf Obnoble


### FunctionTransformer

In [9]:
# Feature engineering of passengerid and cabin columns
from sklearn.preprocessing import FunctionTransformer
# Pipilines
from sklearn.compose import ColumnTransformer

# Function
def passenger_id_spliter(passenger_id_col: pd.Series) -> pd.DataFrame:
    '''Function to split the passenger id into two columns

    Args:
        passenger_id_col: pd.Series - The passenger id column
    
    Returns:
        pd.DataFrame - The dataframe with the two columns (Group, PassengerId)
    '''

    # Split the passenger id
    # Numpy method
    # splited = passenger_id_col.to_numpy()
    # splited = np.array([x.split('_') for x in splited])

    # Pandas method    
    df = (
        pd.DataFrame(
            passenger_id_col
            .str
            .split('_')
            .to_list(),
        columns=['Group', 'PassengerId']
        )
        .drop('PassengerId', axis=1)
    )
    return df

# Transformer
passenger_id_transformer = FunctionTransformer(func=passenger_id_spliter, 
                            feature_names_out=lambda _, __: np.array(['Group']),)

# Column transformer
column_transformer = ColumnTransformer(
    transformers=[
        ('passenger_id', passenger_id_transformer, passenger_id_col)
    ],
    remainder='drop',
    verbose_feature_names_out=False
)

# Fit and transform
column_transformer.fit(X)
column_transformer.transform(X)
# column_transformer.get_feature_names_out()
# Test the transformer


array([['0001'],
       ['0002'],
       ['0003'],
       ...,
       ['9279'],
       ['9280'],
       ['9280']], dtype=object)

In [10]:


(
    X.Cabin.str.split('/', expand=True)
    .rename(columns={0: 'Cabin1', 1: 'Cabin2'})
)

Unnamed: 0,Cabin1,Cabin2,2
0,B,0,P
1,F,0,S
2,A,0,S
3,A,0,S
4,F,1,S
...,...,...,...
8688,A,98,P
8689,G,1499,S
8690,G,1500,S
8691,E,608,S


In [11]:
def cabin_spliter(cabin_col: pd.Series) -> pd.DataFrame:
    '''Function to split the cabin into two columns

    Args:
        cabin_col: pd.Series - The cabin column
    
    Returns:
        pd.DataFrame - The dataframe with the two columns (Cabin, CabinNumber)
    '''
    print(cabin_col)
    df = (
        cabin_col
        .str
        .split('/', expand=True)
        .rename(columns={0: 'Deck', 1: 'Num', 2: 'Side'})
        .drop('Num', axis=1)
    )
    
    # df = (
    #     pd.DataFrame(
    #         cabin_col
    #         .str
    #         .split('/')
    #         .to_list(),
    #     columns=['Deck', 'Num', 'Side']
    #     )
    #     .drop('Num', axis=1)
    # )
    print(df)
    return df

cabin_transformer = FunctionTransformer(func=cabin_spliter,
    feature_names_out=lambda _, __: np.array(['Deck', 'Side'])
)

# Column transformer
column_transformer = ColumnTransformer(
    transformers=[
        ('cabin', cabin_transformer, 'Cabin')
    ],
    remainder='drop',
    verbose_feature_names_out=False
)

# Fit and transform
column_transformer.fit(X)
column_transformer.transform(X)

0          B/0/P
1          F/0/S
2          A/0/S
3          A/0/S
4          F/1/S
          ...   
8688      A/98/P
8689    G/1499/S
8690    G/1500/S
8691     E/608/S
8692     E/608/S
Name: Cabin, Length: 8693, dtype: object
     Deck Side
0       B    P
1       F    S
2       A    S
3       A    S
4       F    S
...   ...  ...
8688    A    P
8689    G    S
8690    G    S
8691    E    S
8692    E    S

[8693 rows x 2 columns]
0          B/0/P
1          F/0/S
2          A/0/S
3          A/0/S
4          F/1/S
          ...   
8688      A/98/P
8689    G/1499/S
8690    G/1500/S
8691     E/608/S
8692     E/608/S
Name: Cabin, Length: 8693, dtype: object
     Deck Side
0       B    P
1       F    S
2       A    S
3       A    S
4       F    S
...   ...  ...
8688    A    P
8689    G    S
8690    G    S
8691    E    S
8692    E    S

[8693 rows x 2 columns]


array([['B', 'P'],
       ['F', 'S'],
       ['A', 'S'],
       ...,
       ['G', 'S'],
       ['E', 'S'],
       ['E', 'S']], dtype=object)

In [12]:
X.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name'],
      dtype='object')

### Analysis of KNNImputer

In [13]:
# Get a dataset with right encoders to make imputer possible
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

columns_to_encode = ['VIP', 'CryoSleep', 'HomePlanet', 'Destination']
text_columns = X.select_dtypes(include=['object']).columns

transfomer = ColumnTransformer(
    transformers=[
        ('name_drop', 'drop', 'Name'),
        ('encoder', OrdinalEncoder(), columns_to_encode),
        ('text_columns', 'drop', text_columns)
    ],
    remainder='passthrough',
    force_int_remainder_cols=False, #type: ignore
    verbose_feature_names_out=False
)

# Fit and transform
transfomer.fit(X)

X_encoded = transfomer.transform(X)

# Get the column names
columns = transfomer.get_feature_names_out()

# Create a dataframe
X_encoded = pd.DataFrame(X_encoded, columns=columns)

X_encoded

Unnamed: 0,VIP,CryoSleep,HomePlanet,Destination,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0.0,0.0,1.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,2.0,24.0,109.0,9.0,25.0,549.0,44.0
2,1.0,0.0,1.0,2.0,58.0,43.0,3576.0,0.0,6715.0,49.0
3,0.0,0.0,1.0,2.0,33.0,0.0,1283.0,371.0,3329.0,193.0
4,0.0,0.0,0.0,2.0,16.0,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...,...,...,...,...
8688,1.0,0.0,1.0,0.0,41.0,0.0,6819.0,0.0,1643.0,74.0
8689,0.0,1.0,0.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0
8690,0.0,0.0,0.0,2.0,26.0,0.0,0.0,1872.0,1.0,0.0
8691,0.0,0.0,1.0,0.0,32.0,0.0,1049.0,0.0,353.0,3235.0


In [14]:
with_missing_values = X_encoded[(X.iloc[:, :4].isnull().sum(axis=1) > 0)]

from sklearn.impute import KNNImputer
knn = KNNImputer(n_neighbors=10, weights='uniform')

# Checking the missing values
X_encoded.loc[with_missing_values.index]

# Fit and transform
knn.fit(X_encoded)

X_encoded_without = knn.transform(X_encoded)
X_encoded_without = pd.DataFrame(X_encoded_without, columns=knn.get_feature_names_out())

# Checking the missing values
X_encoded_without.loc[with_missing_values.index]

Unnamed: 0,VIP,CryoSleep,HomePlanet,Destination,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
15,0.0,0.0,0.0,2.0,31.0,32.0,0.0,876.0,0.0,0.0
59,0.0,1.0,1.1,2.0,33.0,0.0,0.0,0.0,0.0,0.0
92,0.0,0.8,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0
93,0.0,1.0,2.0,2.0,31.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.6,0.0,2.0,27.0,0.0,0.0,570.0,2.0,131.0
...,...,...,...,...,...,...,...,...,...,...
8666,0.0,0.0,0.4,0.0,38.0,28.0,1208.0,973.0,207.0,0.0
8674,0.0,0.0,0.3,2.0,13.0,39.0,0.0,1085.0,24.0,0.0
8675,0.0,0.0,0.0,2.0,44.0,1030.0,1015.0,0.0,11.0,91.4
8684,0.0,1.0,1.4,2.0,23.0,0.0,0.0,0.0,0.0,0.0


In [15]:
X_encoded.loc[with_missing_values.index]

Unnamed: 0,VIP,CryoSleep,HomePlanet,Destination,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
15,0.0,0.0,0.0,2.0,31.0,32.0,0.0,876.0,0.0,0.0
59,0.0,1.0,,2.0,33.0,0.0,0.0,,0.0,0.0
92,0.0,,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0
93,0.0,1.0,2.0,2.0,31.0,0.0,0.0,0.0,0.0,0.0
98,0.0,,0.0,2.0,27.0,0.0,0.0,570.0,2.0,131.0
...,...,...,...,...,...,...,...,...,...,...
8666,,0.0,,0.0,38.0,28.0,1208.0,973.0,207.0,0.0
8674,0.0,0.0,,2.0,13.0,39.0,0.0,1085.0,24.0,0.0
8675,0.0,,0.0,2.0,44.0,1030.0,1015.0,0.0,11.0,
8684,0.0,1.0,,2.0,23.0,0.0,0.0,0.0,0.0,0.0


In [16]:
round(X_encoded.describe(), 2)

Unnamed: 0,VIP,CryoSleep,HomePlanet,Destination,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8490.0,8476.0,8492.0,8511.0,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,0.02,0.36,0.67,1.48,28.83,224.69,458.08,173.73,311.14,304.85
std,0.15,0.48,0.8,0.82,14.49,666.72,1611.49,604.7,1136.71,1145.72
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,19.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,2.0,27.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,2.0,38.0,47.0,76.0,27.0,59.0,46.0
max,1.0,1.0,2.0,2.0,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [17]:
round(X_encoded_without.describe(), 2)

Unnamed: 0,VIP,CryoSleep,HomePlanet,Destination,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0
mean,0.02,0.36,0.67,1.48,28.77,223.0,452.84,172.41,308.39,301.59
std,0.15,0.48,0.79,0.81,14.38,660.95,1596.24,598.3,1126.0,1134.4
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,19.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,2.0,27.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,1.0,2.0,37.0,54.0,82.0,31.0,65.0,52.0
max,1.0,1.0,2.0,2.0,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [18]:
X_encoded.VIP.value_counts(), X_encoded_without.VIP.value_counts(), \
X_encoded.CryoSleep.value_counts(), X_encoded_without.CryoSleep.value_counts(), \
X_encoded.HomePlanet.value_counts(), X_encoded_without.HomePlanet.value_counts()

(VIP
 0.0    8291
 1.0     199
 Name: count, dtype: int64,
 VIP
 0.0    8470
 1.0     199
 0.1      18
 0.2       6
 Name: count, dtype: int64,
 CryoSleep
 0.0    5439
 1.0    3037
 Name: count, dtype: int64,
 CryoSleep
 0.0    5465
 1.0    3086
 0.1      41
 0.9      40
 0.8      19
 0.3      13
 0.7      11
 0.4       7
 0.6       6
 0.2       3
 0.5       2
 Name: count, dtype: int64,
 HomePlanet
 0.0    4602
 1.0    2131
 2.0    1759
 Name: count, dtype: int64,
 HomePlanet
 0.0    4611
 1.0    2152
 2.0    1759
 0.8      25
 0.9      20
 0.5      20
 0.7      19
 0.2      16
 0.3      13
 0.4      13
 0.1      10
 0.6      10
 1.1       7
 1.4       7
 1.3       6
 1.5       2
 1.2       2
 1.7       1
 Name: count, dtype: int64)

In [19]:
# About CryoSleep
X_encoded_without[X_encoded_without.CryoSleep == 0.4]

Unnamed: 0,VIP,CryoSleep,HomePlanet,Destination,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
2189,0.0,0.4,0.0,0.0,28.0,613.0,22.0,0.0,0.0,0.0
2231,0.0,0.4,0.0,2.0,24.0,0.0,0.0,334.2,59.0,4120.0
3807,0.0,0.4,1.0,0.0,18.0,0.0,30.0,0.0,67.0,2790.0
4768,0.0,0.4,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
6428,0.0,0.4,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
7218,0.0,0.4,0.8,2.0,24.0,0.0,82.0,0.0,1624.0,77.0
8420,0.0,0.4,0.0,1.0,27.0,32.0,8.0,5.0,588.0,18.0


### Putting all together

In [60]:
import mlflow

mlflow.set_tracking_uri('http://localhost:8000')

# mlflow.create_experiment('spaceshit-titanic')

mlflow.set_experiment('spaceshit-titanic')

<Experiment: artifact_location='mlflow-artifacts:/272950282396396239', creation_time=1733566581603, experiment_id='272950282396396239', last_update_time=1733566581603, lifecycle_stage='active', name='spaceshit-titanic', tags={}>

In [20]:
X.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


In [64]:


# Get a dataset with right encoders to make imputer possible
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.pipeline   import Pipeline 

# Defining the columns paths
columns_to_encode = ['VIP', 'CryoSleep', 'HomePlanet', 'Destination', 'Group', 'Deck', 'Side']
# columns_to_encode_index = 
text_columns = X.select_dtypes(include=['object']).columns

# Transformer
knn_imputer = KNNImputer(n_neighbors=10, weights='uniform')
encoder = OrdinalEncoder()
one_hot = OneHotEncoder()

# FunctionTransformers
# Function
def passenger_id_spliter(passenger_id_col: pd.Series) -> pd.DataFrame:
    '''Function to split the passenger id into two columns

    Args:
        passenger_id_col: pd.Series - The passenger id column
    
    Returns:
        pd.DataFrame - The dataframe with the two columns (Group, PassengerId)
    '''
    df = (
        pd.DataFrame(
            passenger_id_col
            .str
            .split('_')
            .to_list(),
        columns=['Group', 'PassengerId']
        )
        .drop('PassengerId', axis=1)
    )
    return df

def cabin_spliter(cabin_col: pd.Series) -> pd.DataFrame:
    '''Function to split the cabin into two columns

    Args:
        cabin_col: pd.Series - The cabin column
    
    Returns:
        pd.DataFrame - The dataframe with the two columns (Cabin, CabinNumber)
    '''
    df = (
        cabin_col
        .str
        .split('/', expand=True)
        .rename(columns={0: 'Deck', 1: 'Num', 2: 'Side'})
        # .drop('Num', axis=1)
    )
    
    return df

# Transformer
passenger_id_transformer = FunctionTransformer(func=passenger_id_spliter,
    feature_names_out=lambda _, __: np.array(['Group'])
)

cabin_transformer = FunctionTransformer(func=cabin_spliter,
    feature_names_out=lambda _, __: np.array(['Deck', 'Num', 'Side'])
)

# Transformer pipeline
transfomer = ColumnTransformer(
    transformers=[
        ('name_drop', 'drop', 'Name'),
        ('passenger_id', passenger_id_transformer, 'PassengerId'),
        ('cabin', cabin_transformer, 'Cabin'),
        # ('encoder', encoder, columns_to_encode)
    ],
    remainder='passthrough',
    force_int_remainder_cols=False, #type: ignore
    verbose_feature_names_out=True
)

# columns_to_encode = [0,1]

encoder_transfomer = ColumnTransformer(
    transformers=[
        ('encoder', encoder, columns_to_encode)
    ],
    remainder='passthrough',
    force_int_remainder_cols=False, #type: ignore
    verbose_feature_names_out=False
)

pipe = Pipeline([
    ('transformer', transfomer),
    ('encoder', encoder_transfomer)
    # ('imputer', knn_imputer)
])

import datetime
with mlflow.start_run(run_name=f"spaceshit-titanic-{datetime.datetime.now()}"):
    mlflow.log_param('n_neighbors', 10)
    # # Fit and transform
    transfomer.fit(X)

    X_transformed_ = transfomer.transform(X)

    # Get the column names
    columns = transfomer.get_feature_names_out()

    # Create a dataframe
    X_transformed = pd.DataFrame(X_transformed_, columns=columns)


X_transformed_

# pipe.fit_transform(X)

🏃 View run spaceshit-titanic-2024-12-07 11:49:35.437796 at: http://localhost:8000/#/experiments/272950282396396239/runs/0e486d59ff2f4034bb885d83ba415d21
🧪 View experiment at: http://localhost:8000/#/experiments/272950282396396239


array([['0001', 'B', '0', ..., 0.0, 0.0, 0.0],
       ['0002', 'F', '0', ..., 25.0, 549.0, 44.0],
       ['0003', 'A', '0', ..., 0.0, 6715.0, 49.0],
       ...,
       ['9279', 'G', '1500', ..., 1872.0, 1.0, 0.0],
       ['9280', 'E', '608', ..., 0.0, 353.0, 3235.0],
       ['9280', 'E', '608', ..., 0.0, 0.0, 12.0]], dtype=object)

In [56]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)

pca.fit(X.select_dtypes(include=['int64', 'float64']).fillna(0))

