## Building a Machine Learning PipeLine...

### Example:1 Applying Transformers on data columns..

In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder , StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv("datasets/salary-package1.csv")
df

Unnamed: 0,gender,cgpa,iq,salary
0,Male,2.3,Medium,68
1,Female,3.8,High,100
2,Male,3.0,Medium,75
3,Female,,Low,48
4,Female,2.3,High,97
...,...,...,...,...
95,Female,2.1,Low,40
96,Female,2.3,High,68
97,Female,3.8,High,100
98,Female,3.0,Medium,75


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   gender  100 non-null    object 
 1   cgpa    90 non-null     float64
 2   iq      100 non-null    object 
 3   salary  100 non-null    int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 3.2+ KB


In [4]:
df.isnull().sum()

gender     0
cgpa      10
iq         0
salary     0
dtype: int64

### Train Test Split...

In [5]:
df.gender.unique()

array(['Male', 'Female'], dtype=object)

In [6]:
df.iq.unique()

array(['Medium', 'High', 'Low'], dtype=object)

In [9]:
X = df.drop("salary", axis=1)
y = df['salary']
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.2, shuffle=False)
X_train.head()

Unnamed: 0,gender,cgpa,iq
0,Male,2.3,Medium
1,Female,3.8,High
2,Male,3.0,Medium
3,Female,,Low
4,Female,2.3,High


### SimpleImputer on CGPA.

In [15]:
si = SimpleImputer(missing_values=np.nan ,strategy='mean')
si.fit(X_train[['cgpa']])
X_train['cgpa'] = si.transform(X_train[['cgpa']])
X_test['cgpa'] = si.transform(X_test[['cgpa']])
X_train.head()

Unnamed: 0,gender,cgpa,iq
0,Male,2.3,Medium
1,Female,3.8,High
2,Male,3.0,Medium
3,Female,2.884507,Low
4,Female,2.3,High


In [18]:
X_train.isnull().sum()

gender    0
cgpa      0
iq        0
dtype: int64

### OneHotEncoding on Gender.

In [20]:
ohe = OneHotEncoder(drop='first', sparse=False, dtype=np.int32)
ohe.fit(X_train[['gender']])
X_train['gender'] = ohe.transform(X_train[['gender']])
X_test['gender'] = ohe.transform(X_test[['gender']])
X_train.head()



Unnamed: 0,gender,cgpa,iq
0,1,2.3,Medium
1,0,3.8,High
2,1,3.0,Medium
3,0,2.884507,Low
4,0,2.3,High


### OrdinalEncoding on IQ.

In [21]:
oe = OrdinalEncoder(categories=[['Low', 'Medium', 'High']], dtype=np.int8)
oe.fit(X_train[['iq']])
X_train['iq'] = oe.transform(X_train[['iq']])
X_test['iq'] = oe.transform(X_test[['iq']])
X_train.head()

Unnamed: 0,gender,cgpa,iq
0,1,2.3,1
1,0,3.8,2
2,1,3.0,1
3,0,2.884507,0
4,0,2.3,2


### Model Creation..

In [23]:
lr = LinearRegression()
lr.fit(X_train, y_train)

### Prediction...

In [24]:
predict_value = np.array(['Male', 3.5, 'High']).reshape(1,3)
transformed_predict_value = np.array([1, 3.5, 2]).reshape(1,3)
lr.predict(transformed_predict_value)



array([73.12721804])

### Example:2 Using ColumnTransformer on examle:1...

In [36]:
df.head()
X = df.drop("salary", axis=1)
y = df['salary']
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.2, shuffle=False)
X_train.head()

Unnamed: 0,gender,cgpa,iq
0,Male,2.3,Medium
1,Female,3.8,High
2,Male,3.0,Medium
3,Female,,Low
4,Female,2.3,High


In [37]:
col_trans = ColumnTransformer(
                            transformers=[('OHE', OneHotEncoder(drop='first', sparse=False), ['gender']),
                                          ('SI', SimpleImputer(missing_values=np.nan ,strategy='mean'), ['cgpa']),
                                          ('OE', OrdinalEncoder(categories=[['Low', 'Medium', 'High']]) ,['iq'])
                                         ],
                            remainder='passthrough')

In [38]:
col_trans.fit(X_train)



In [40]:
arr_xTrain = col_trans.transform(X_train)
arr_Xtest = col_trans.transform(X_test)
arr_xTrain

array([[1.        , 2.3       , 1.        ],
       [0.        , 3.8       , 2.        ],
       [1.        , 3.        , 1.        ],
       [0.        , 2.88450704, 0.        ],
       [0.        , 2.3       , 2.        ],
       [1.        , 3.1       , 0.        ],
       [1.        , 2.4       , 2.        ],
       [0.        , 2.88450704, 1.        ],
       [0.        , 1.1       , 2.        ],
       [0.        , 3.2       , 0.        ],
       [0.        , 3.6       , 0.        ],
       [0.        , 4.        , 1.        ],
       [0.        , 3.1       , 2.        ],
       [1.        , 2.8       , 0.        ],
       [1.        , 2.5       , 0.        ],
       [1.        , 3.1       , 2.        ],
       [0.        , 2.3       , 0.        ],
       [0.        , 3.8       , 1.        ],
       [0.        , 3.        , 0.        ],
       [0.        , 2.88450704, 2.        ],
       [1.        , 2.3       , 2.        ],
       [1.        , 3.1       , 1.        ],
       [0.

In [41]:
pred_value = np.array(['Male', 3.5, 'High']).reshape(1,3)
transformed_pred_value = np.array([1, 3.5, 2]).reshape(1,3)
lr.predict(transformed_pred_value)



array([73.12721804])

## PIPELINE...

In [2]:
df1 = pd.read_csv("datasets/advertising-withmissingdata.csv")
df1

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,,45.1,10.4
2,,45.9,69.3,9.3
3,151.5,41.3,,18.5
4,180.8,10.8,58.4,12.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,,8.1,9.7
197,177.0,9.3,6.4,12.8
198,283.6,42.0,,25.5


In [3]:
df1.isnull().sum()

TV            4
radio        10
newspaper     2
sales         0
dtype: int64

In [4]:
X = df1.drop("sales", axis=1)
y = df1['sales']
X_Train, X_Test, y_Train , y_Test = train_test_split(X , y, test_size=0.2, random_state=54)

### Create Pipeline Object for train and predict..

In [21]:
pipe = Pipeline(steps=[
                    ('si', SimpleImputer(missing_values=np.nan, strategy='mean')),
                    ('ss', StandardScaler()),
                    ('lr', LinearRegression())
                ]
                )
pipe

In [31]:
pipe.fit(X_Train, y_Train)

In [32]:
pipe.predict(X_Test)

array([11.5103185 , 10.74314548, 17.06532765,  6.81320042,  9.7687584 ,
       11.81989768, 19.25928657, 10.56907364, 15.59736803, 16.11914409,
        9.73165545, 14.03714358, 16.43272133, 17.27575117,  8.87968169,
       23.35102195,  8.20884078, 13.29275029,  9.79142007, 11.38200247,
       18.68712032, 12.0133993 , 18.87632571, 20.56500684, 14.01595916,
       13.54109255, 18.17737359, 24.17518038,  9.15739432, 14.75927552,
       19.98104419,  9.76168882, 19.52101244, 21.27322187, 18.19830953,
       20.92010821,  6.60271388, 15.34461646, 20.26487212,  6.14806508])

In [33]:
pipe.score(X_Test, y_Test)

0.850615741795994

### Specific Transformation...

In [34]:
pipe['si'].fit_tranform(X_Train)

AttributeError: 'SimpleImputer' object has no attribute 'fit_tranform'

In [35]:
tv=60
radio=np.nan
newsppr = 40
test_in = np.array([[tv, radio, newsppr]])
test_in

array([[60., nan, 40.]])

In [36]:
pipe.predict(test_in)



array([10.00509133])

## ColumnTransformer in a PipeLine...

In [37]:
tips = pd.read_csv("datasets/tips-missingdata.csv")
tips

Unnamed: 0,total_bill,sex,smoker,day,time,size,tip
0,16.99,Female,No,Sun,Dinner,2,
1,,Male,,Sun,Dinner,3,1.66
2,21.01,Male,No,Sun,Dinner,3,3.50
3,23.68,,No,Sun,Dinner,2,3.31
4,24.59,Female,No,Sun,Dinner,4,3.61
...,...,...,...,...,...,...,...
239,29.03,Male,No,Sat,Dinner,3,5.92
240,27.18,Female,Yes,Sat,Dinner,2,2.00
241,22.67,Male,Yes,Sat,Dinner,2,2.00
242,,Male,No,Sat,Dinner,2,1.75


In [38]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  241 non-null    float64
 1   sex         242 non-null    object 
 2   smoker      243 non-null    object 
 3   day         244 non-null    object 
 4   time        244 non-null    object 
 5   size        244 non-null    int64  
 6   tip         243 non-null    float64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [39]:
tips.day.unique()

array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)

In [40]:
tips.isnull().sum() # for rows having missing output label we just drop them

total_bill    3
sex           2
smoker        1
day           0
time          0
size          0
tip           1
dtype: int64

### Dropping missing Output Label Rows...

In [41]:
tips.dropna(axis=0, how='any', subset=['tip'], inplace=True)
tips.isnull().sum() 

total_bill    3
sex           2
smoker        1
day           0
time          0
size          0
tip           0
dtype: int64

In [42]:
tipX = tips.drop('tip', axis=1)
tipy = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(tipX, tipy, test_size=0.2, random_state=54)


### Model Pipeline...

In [43]:
categorical_transformer = Pipeline(steps=
                                    [
                                        ('si', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                                        ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore'))
                                    ]
                                   )
numeric_transformer = Pipeline(
                                    [
                                        ('si', SimpleImputer(missing_values=np.nan, strategy='mean'))
                                    ]
                                   )

#### Combine above pipelines using Column Transformer

In [44]:
preprocessor = ColumnTransformer(transformers=
                                     [
                                         ('categorical', categorical_transformer, [1,2,3,4]),
                                         ('numeric', numeric_transformer, [0,5])
                                     ]
                                 )

In [45]:
model_pipe = Pipeline(steps=
                      [
                          ('preprocessor', preprocessor),
                          ('standard', StandardScaler()),
                          ('Estimator', LinearRegression())
                      ]
                     )
model_pipe

### Train and Evaluate PipeLine Object...

In [46]:
model_pipe.fit(X_train, y_train)

In [47]:
model_pipe.score(X_test, y_test)

0.3919649062043291

In [48]:
model_pipe.predict(np.array([[100, 'Male', 'Yes','Sun', 'Dinner',4]]))



array([10.34508622])

### Cross Validate...

In [50]:
cross_val_score(model_pipe, X_train, y_train, cv=5, scoring='r2')

array([0.33120287, 0.3581081 , 0.57774671, 0.32420228, 0.47500525])

In [51]:
cross_val_score(model_pipe, X_train, y_train, cv=5, scoring='r2').mean()

0.4132530407042143