#### PIPELINE


`A machine learning pipeline is a series of interconnected steps that process data, transform it, train a model, evaluate its performance, and deploy it for predictions. It streamlines the workflow, automates tasks, and ensures consistency. By using pipelines, you can easily experiment, reproduce results, and scale the process efficiently.`

In [1]:
import pandas as pd

# Load the Titanic passenger data from titanic.csv (relative path) and preview the first rows.
df = pd.read_csv("titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Count the missing values in each column to see which features need cleaning.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
# Keep only the rows where Embarked is non-null, and only the four columns used below.
df = df.loc[
    df["Embarked"].notna(),
    ["Survived", "Pclass", "Sex", "Embarked"],
]
df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked
0,0,3,male,S
1,1,1,female,C
2,1,3,female,S
3,1,1,female,S
4,0,3,male,S


In [4]:
# Confirm that no missing values remain in the selected columns.
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Embarked    0
dtype: int64

In [5]:
# X has to be 2-dimensional even when it holds a single feature, so it is selected
# with a list of column names (giving a DataFrame); y may be 1-dimensional (a Series).
X = df[["Pclass"]]
y = df["Survived"]

In [6]:
# Check that X is 2-dimensional: (rows, features).
X.shape

(889, 1)

In [7]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier, created with its default settings.
logreg = LogisticRegression()

In [8]:
from sklearn.model_selection import cross_val_score

# Mean accuracy over 5 cross-validation folds, using Pclass as the only feature.
cross_val_score(logreg,X,y,cv=5,scoring="accuracy").mean()

0.6783406335301212

`Dummy Encoding or One Hot Encoding`

In [9]:
# Sex and Embarked hold string categories, so they have to be encoded as numbers before modelling.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked
0,0,3,male,S
1,1,1,female,C
2,1,3,female,S
3,1,1,female,S
4,0,3,male,S


In [10]:
from sklearn.preprocessing import OneHotEncoder
# sparse_output=False makes the encoder return a dense array instead of a sparse matrix,
# which is easier to inspect in the notebook.
ohe = OneHotEncoder(sparse_output=False)

In [11]:
# One-hot (dummy) encoding of the Embarked column: one output column per category,
# with a 1 marking the category each row belongs to.
ohe.fit_transform(df[["Embarked"]])

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [12]:
# Categories learned during fit, in the same order as the encoded output columns.
# This is better than dummy encoding in pandas (pandas.get_dummies): the fitted encoder
# remembers its categories, so the same encoding can be re-applied to new data.
ohe.categories_

[array(['C', 'Q', 'S'], dtype=object)]

In [13]:
# Use every column except the target (Survived) as a feature.
X = df.drop(columns="Survived")
X.head()

Unnamed: 0,Pclass,Sex,Embarked
0,3,male,S
1,1,female,C
2,3,female,S
3,1,female,S
4,3,male,S


In [14]:
from sklearn.compose import make_column_transformer

# A column transformer is used whenever features in a DataFrame need different preprocessing.
# Here we want to apply one-hot encoding to Sex and Embarked but not to Pclass.

In [15]:
# One-hot encode the two categorical features; every other column (here Pclass)
# is passed through unchanged and appended after the encoded columns.
column_trans = make_column_transformer(
    (OneHotEncoder(), ["Sex", "Embarked"]),
    remainder="passthrough",
)
column_trans.fit_transform(X)

array([[0., 1., 0., 0., 1., 3.],
       [1., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 3.],
       ...,
       [1., 0., 0., 0., 1., 3.],
       [0., 1., 1., 0., 0., 1.],
       [0., 1., 0., 1., 0., 3.]])

In [16]:
from sklearn.pipeline import make_pipeline
# make_pipeline chains steps together: the data is first preprocessed by column_trans,
# and the transformed result is then passed to logreg.

pipeline = make_pipeline(column_trans, logreg)

In [17]:
# Cross-validate the whole pipeline (encoding + model) on Pclass, Sex and Embarked;
# the preprocessing is refit on each training fold together with the model.
cross_val_score(pipeline,X,y,cv=5,scoring="accuracy").mean()

0.7727924839713071

In [18]:
# Draw 5 rows to stand in for "new" data; the fixed seed makes the draw reproducible.
X_new = X.sample(n=5, random_state=99)
X_new

Unnamed: 0,Pclass,Sex,Embarked
599,1,male,C
512,1,male,S
273,1,male,C
215,1,female,C
790,3,male,Q


In [19]:
# Fit the pipeline (encoder and model) on the full dataset before predicting.
pipeline.fit(X,y)

In [20]:
# This is similar to model.predict, except that the pipeline first applies the full
# column transformation to X_new and then predicts with the fitted model.
pipeline.predict(X_new)

array([1, 0, 1, 1, 0])