In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import tensorflow as tf

In [None]:
# Load seaborn's "titanic" example dataset and take a first look at it.
df = sns.load_dataset("titanic")
print("First 5 rows of dataset:\n", df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nMissing Values per column:\n", df.isnull().sum())


First 5 rows of dataset:
    survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       ------------

In [None]:
# Name of the label column we want to predict.
TARGET = "survived"
# `alive` holds the very same label spelled as "yes"/"no". Keeping it among the
# predictors would leak the target into X, so it is removed together with the
# target itself.
LEAKAGE_COLUMNS = ["alive"]

# X: predictor columns only; y: the 0/1 survival label.
X = df.drop(columns=[TARGET, *LEAKAGE_COLUMNS])
y = df[TARGET]


In [None]:
# Group the predictor columns by dtype so that each group can be routed to its
# own preprocessing pipeline.
# Note: boolean columns match neither dtype selector below, so they end up in
# neither list.
categorical_dtypes = ["object", "category"]
numerical_dtypes = ["int64", "float64"]

categorical_features = list(X.select_dtypes(include=categorical_dtypes).columns)
numerical_features = list(X.select_dtypes(include=numerical_dtypes).columns)

print("\nCategorical Features:", categorical_features)
print("Numerical Features:", numerical_features)



Categorical Features: ['sex', 'embarked', 'class', 'who', 'deck', 'embark_town', 'alive']
Numerical Features: ['pclass', 'age', 'sibsp', 'parch', 'fare']


In [None]:
# Numeric columns: fill missing values with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [None]:
# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown="ignore" makes categories that were not seen
# during fit encode as all zeros instead of raising at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipeline = Pipeline(steps=categorical_steps)


In [None]:
# Route each column group to its pipeline. Columns listed in neither group are
# discarded; remainder="drop" is the ColumnTransformer default and is spelled
# out here only to make that visible.
column_routes = [
    ("num", num_pipeline, numerical_features),
    ("cat", cat_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder="drop")

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses a
# fixed random_state so that it is reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("\nShapes before preprocessing:")
print("X_train:", X_train.shape, "X_test:", X_test.shape)



Shapes before preprocessing:
X_train: (712, 14) X_test: (179, 14)


In [None]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformer to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("\nShapes after preprocessing:")
for label, matrix in (
    ("X_train_processed:", X_train_processed),
    ("X_test_processed:", X_test_processed),
):
    print(label, matrix.shape)


Shapes after preprocessing:
X_train_processed: (712, 28)
X_test_processed: (179, 28)


In [None]:
def _densify(matrix):
    """Return `matrix` as a dense array when it exposes `.toarray()`, otherwise unchanged."""
    if hasattr(matrix, "toarray"):
        return matrix.toarray()
    return matrix


# Features become float32 tensors, labels become int32 tensors.
X_train_tensor = tf.convert_to_tensor(_densify(X_train_processed), dtype=tf.float32)
X_test_tensor = tf.convert_to_tensor(_densify(X_test_processed), dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train.values, dtype=tf.int32)
y_test_tensor = tf.convert_to_tensor(y_test.values, dtype=tf.int32)

print("\nTensor shapes:")
print("X_train_tensor:", X_train_tensor.shape)
print("y_train_tensor:", y_train_tensor.shape)


Tensor shapes:
X_train_tensor: (712, 28)
y_train_tensor: (712,)
