# Pipeline Concept
### A machine learning pipeline is a structured workflow that automates and streamlines the process of building, training, evaluating, and deploying machine learning models. It's essentially an end-to-end system that manages the flow of data into and out of a model, including steps like data collection, preprocessing, feature engineering, model training, evaluation, and deployment. 

In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [4]:
# Load seaborn's 'titanic' example dataset; the cells below select columns
# from it by name to build the feature matrix and the target.
titanic = sns.load_dataset('titanic')

In [5]:
# Select the feature columns and the target variable.
feature_cols = ['pclass', 'sex', 'age', 'fare', 'embarked']
X = titanic[feature_cols]   # model inputs
y = titanic['survived']     # label to predict

In [9]:
# Split into train and test sets: 20% of the rows are held out for testing,
# and the fixed random_state makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Group the feature columns by how they should be preprocessed.
# Note: 'pclass' is listed with the categorical columns here.
cat_feat = ['pclass', 'sex', 'embarked']
num_feat = ['age', 'fare']

In [20]:
# Build one preprocessing pipeline per feature type.

# Numerical columns: fill missing values with the column median.
median_imputer = SimpleImputer(strategy='median')
num_trans = Pipeline(steps=[('imputer', median_imputer)])

# Categorical columns: fill missing values with the most frequent value,
# then one-hot encode. handle_unknown='ignore' means categories that were not
# seen during fit do not raise an error at transform time.
mode_imputer = SimpleImputer(strategy='most_frequent')
one_hot = OneHotEncoder(handle_unknown='ignore')
cat_trans = Pipeline(steps=[('imputer', mode_imputer), ('encoder', one_hot)])

In [21]:
# Define the preprocessor: route each group of columns to its own transformer.
# Each entry is (name, transformer, columns).
column_steps = [
    ('num', num_trans, num_feat),
    ('cat', cat_trans, cat_feat),
]
prepros = ColumnTransformer(transformers=column_steps)

In [22]:
# Assemble the full pipeline: preprocessing followed by the classifier.
# The fixed random_state keeps the random forest reproducible.
forest = RandomForestClassifier(random_state=42)
pipeline = Pipeline(steps=[('preprocessor', prepros), ('classifier', forest)])

In [23]:
# Fit the pipeline on the training split only; this single call fits the
# preprocessing steps and then the classifier.
pipeline.fit(X_train, y_train)

In [24]:
# Predict on the held-out test features; the fitted pipeline applies the
# preprocessing before the classifier.
y_pred = pipeline.predict(X_test)

# Accuracy: fraction of test rows whose predicted label matches the true one.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy', accuracy)

Accuracy 0.7821229050279329


In [None]:
# Hyperparameter tuning in a Pipeline
