<a href="https://colab.research.google.com/github/MadeehaTassadaq/MadeehaTassadaq/blob/main/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [133]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [134]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [135]:
df=pd.read_csv('/content/customer.csv')

In [136]:
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [137]:
# train test split the data
X_train,X_test,y_train,y_test=train_test_split(df.drop(columns=['purchased'],axis=1),df['purchased'],test_size=0.2,random_state=42)


In [138]:
X_train.head()

Unnamed: 0,age,gender,review,education
12,51,Male,Poor,School
4,16,Female,Average,UG
37,94,Male,Average,PG
8,65,Female,Average,UG
3,72,Female,Good,PG


In [139]:
# one hot encode the gender column
trf1=ColumnTransformer([
    ('ohe',OneHotEncoder(sparse=False,drop='first',dtype=np.int32),[1])
],remainder='passthrough')


In [140]:
# ordinal encode the review and education
trf2=ColumnTransformer([
    ('ordinal',OrdinalEncoder(dtype=np.int32),[2,3])
],remainder='passthrough')

In [141]:
# feature selection
trf3=SelectKBest(chi2,k=2)

In [142]:
# apply algorithmn
trf4=DecisionTreeClassifier()

In [143]:
from os import pipe
pipe=Pipeline(
    [('trf1',trf1),
        ('trf2',trf2),
        ('trf3',trf3),
        ('trf4',trf4)
    ]
)

In [144]:
# display pipeline
from sklearn import set_config
set_config(display='diagram')

In [145]:
# alternate syntax
pipe=make_pipeline(trf1,trf2,trf3,trf4)

In [146]:
# train the pipeline
pipe.fit(X_train,y_train)

In [147]:
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe',
                                  OneHotEncoder(drop='first',
                                                dtype=<class 'numpy.int32'>,
                                                sparse=False),
                                  [1])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ordinal',
                                  OrdinalEncoder(dtype=<class 'numpy.int32'>),
                                  [2, 3])]),
 'selectkbest': SelectKBest(k=2, score_func=<function chi2 at 0x7a5402f76170>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [148]:
pipe.named_steps['columntransformer-1'].transformers_[0][1].categories_

[array(['Female', 'Male'], dtype=object)]

In [149]:
pipe.named_steps['columntransformer-2'].transformers_[0][1].categories_

[array(['Average', 'Good', 'Poor'], dtype=object),
 array(['PG', 'School', 'UG'], dtype=object)]

In [150]:
y_pred=pipe.predict(X_test)

In [151]:
y_pred

array(['Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes'],
      dtype=object)

In [152]:
accuracy_score(y_test,y_pred)

0.3

### Cross validation using Pipeline

In [153]:
from sklearn.model_selection import cross_val_score

In [154]:
cross_val_score(pipe,X_train,y_train,cv=5,scoring='accuracy').mean()

0.6

### Grid Search CV

In [155]:
# gridSearchCV
from sklearn.model_selection import GridSearchCV
params={
    'trf4__max_depth':[1,2,3,4,5,None]
}
grid=GridSearchCV(pipe,params,cv=5,scoring='accuracy')
grid.fit(X_train,y_train)

ValueError: Invalid parameter 'trf4' for estimator Pipeline(steps=[('columntransformer-1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohe',
                                                  OneHotEncoder(drop='first',
                                                                dtype=<class 'numpy.int32'>,
                                                                sparse=False),
                                                  [1])])),
                ('columntransformer-2',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ordinal',
                                                  OrdinalEncoder(dtype=<class 'numpy.int32'>),
                                                  [2, 3])])),
                ('selectkbest',
                 SelectKBest(k=2,
                             score_func=<function chi2 at 0x7a5402f76170>)),
                ('decisiontreeclassifier', DecisionTreeClassifier())]). Valid parameters are: ['memory', 'steps', 'verbose'].

In [None]:
grid.best_score_

In [None]:
grid.best_params_

### Exporting the pipeline

In [None]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))