# Data Preprocessing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Importing the dataset

In [None]:
# Read the labelled training data, then split it into the target column
# ("Survived") and the remaining feature columns.
dataset_train = pd.read_csv("train.csv")
y_train = dataset_train["Survived"]
X_train = dataset_train.drop(columns=["Survived"])

# The test file is loaded whole into X_test.
X_test = pd.read_csv("test.csv")


## Identify numeric and categorical columns

In [3]:
# Column names handed to the numeric preprocessing pipeline.
numeric_features = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]

# Column names handed to the categorical preprocessing pipeline.
categorical_features = [
    "Pclass",
    "Embarked",
    "Sex",
]


## Preprocessing for numeric data

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric columns: replace missing values with the column mean, then
# standardize with StandardScaler's default settings.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)


## Preprocessing for categorical data

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns: one-hot encode (categories not seen during fit are
# ignored at transform time), then scale the resulting indicator columns.
categorical_transformer = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
        # OneHotEncoder returns a sparse matrix by default, and StandardScaler
        # refuses to center sparse input (it raises ValueError at fit time).
        # with_mean=False scales to unit variance without centering, which
        # keeps the matrix sparse and lets the pipeline fit.
        ("scaler", StandardScaler(with_mean=False)),
    ]
)


## Combine preprocessors in a column transformer

In [6]:
from sklearn.compose import ColumnTransformer

# Apply the numeric pipeline to the numeric columns and the categorical
# pipeline to the categorical columns. Columns named in neither list are
# dropped from the output ("drop" is ColumnTransformer's default remainder,
# spelled out here for the reader).
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)
