# In [None]:
import pandas as pd    #load the data, statistics
import seaborn as sns   #visualize the data

from sklearn.preprocessing import LabelEncoder, OneHotEncoder #nominal, categorical
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


# Load the training data. The code below reads the Age, Sex, Pclass and
# Survived columns and discards the other listed columns.
train_df = pd.read_csv('input/train.csv')

# Binary age-bucket indicators. A comparison against a missing Age (NaN) is
# False, so rows with unknown age get 0 in both columns.
train_df['<10 yrs'] = (train_df['Age'] < 10).astype(int)
train_df['>60 yrs'] = (train_df['Age'] > 60).astype(int)

# Drop the columns that are not used as features or target.
selected_columns = train_df.drop(
    columns=['Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
)
print(selected_columns)

# LabelEncoder assigns integer codes 0..n-1 in sorted label order, so with the
# labels 'female' and 'male' the encoding is female -> 0, male -> 1.
# NOTE(review): assumes Sex only holds those two labels -- confirm against data.
labelEncoder = LabelEncoder()
selected_columns['Sex'] = labelEncoder.fit_transform(selected_columns['Sex'])

# Define the features (X) and target (y)
X = selected_columns[['Sex', 'Pclass', '<10 yrs', '>60 yrs']]
y = selected_columns['Survived']

# Stratify on the target so both splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

# One-hot encode Pclass (first level dropped); every other column is passed
# through unchanged. sparse_threshold=0 forces a dense ndarray: with the
# default threshold (0.3) the result may come back as a SciPy sparse matrix
# when the data is mostly zeros, which pd.DataFrame(...) below cannot consume.
preprocessor = ColumnTransformer(
    transformers=[
        ('pclass', OneHotEncoder(drop='first'), ['Pclass']),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

# Learn the encoding from the training split only, then apply it to the full
# feature table so X_transformed keeps one row per row of X.
preprocessor.fit(X_train)
X_transformed = preprocessor.transform(X)

# Output column order is: the encoded Pclass columns first, then the
# passthrough columns in their original order within X.
pclass_feature_names = preprocessor.named_transformers_['pclass'].get_feature_names_out(['Pclass'])
passthrough_feature_names = [col for col in X.columns if col != 'Pclass']
all_feature_names = list(pclass_feature_names) + passthrough_feature_names

# Convert the transformed data back into a DataFrame, keeping X's index so
# rows stay aligned with y.
X_transformed_df = pd.DataFrame(X_transformed, columns=all_feature_names, index=X.index)
print(X_transformed_df)