In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

#Data Cleaning
from sklearn.feature_extraction.text import CountVectorizer

#Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

#Splitting Data
from sklearn.model_selection import train_test_split

#modeling
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from xgboost.sklearn import XGBClassifier

# RAW DATASET

In [3]:
data=pd.read_excel('coffee-house-satisfactory-survey.xlsx')
pd.set_option('display.max_columns',None)#to display all columns

In [4]:
data.head()

Unnamed: 0,Timestamp,1. Your Gender,2. Your Age,3. Are you currently....?,4. What is your annual income?,5. How often do you visit Coffee House?,6. How do you usually enjoy Coffee House?,7. How much time do you normally spend during your visit?,8. The nearest Coffee House's outlet to you is...?,9. Do you have Coffee House membership card?,10. What do you most frequently purchase at Coffee House?,"11. On average, how much would you spend at Coffee House per visit?","12. How would you rate the quality of Coffee House compared to other brands (Coffee Bean, Old Town White Coffee..) to be:",13. How would you rate the price range at Coffee House?,14. How important are sales and promotions in your purchase decision?,"15. How would you rate the ambiance at Coffee House? (lighting, music, etc...)",16. You rate the WiFi quality at Coffee House as..,"17. How would you rate the service at Coffee House? (Promptness, friendliness, etc..)",18. How likely you will choose Coffee House for doing business meetings or hangout with friends?,19. How do you come to hear of promotions at Coffee House? Check all that apply.,20. Will you continue buying Coffee House?
0,2019/10/01 12:38:43 PM GMT+8,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Dine in,Between 30 minutes to 1 hour,within 1km,Yes,Coffee,Less than RM20,4,3,5,5,4,4,3,Starbucks Website/Apps;Social Media;Emails;Dea...,Yes
1,2019/10/01 12:38:54 PM GMT+8,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Take away,Below 30 minutes,1km - 3km,Yes,Cold drinks;Pastries,Less than RM20,4,3,4,4,4,5,2,Social Media;In Store displays,Yes
2,2019/10/01 12:38:56 PM GMT+8,Male,From 20 to 29,Employed,"Less than RM25,000",Monthly,Dine in,Between 30 minutes to 1 hour,more than 3km,Yes,Coffee,Less than RM20,4,3,4,4,4,4,3,In Store displays;Billboards,Yes
3,2019/10/01 12:39:08 PM GMT+8,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Take away,Below 30 minutes,more than 3km,No,Coffee,Less than RM20,2,1,4,3,3,3,3,Through friends and word of mouth,No
4,2019/10/01 12:39:20 PM GMT+8,Male,From 20 to 29,Student,"Less than RM25,000",Monthly,Take away,Between 30 minutes to 1 hour,1km - 3km,No,Coffee;Sandwiches,Around RM20 - RM40,3,3,4,2,2,3,3,Starbucks Website/Apps;Social Media,Yes


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 21 columns):
 #   Column                                                                                                                     Non-Null Count  Dtype 
---  ------                                                                                                                     --------------  ----- 
 0   Timestamp                                                                                                                  122 non-null    object
 1   1. Your Gender                                                                                                             122 non-null    object
 2   2. Your Age                                                                                                                122 non-null    object
 3   3. Are you currently....?                                                                                                  122 non-null

# DATA CLEANING

In [6]:
df=data.copy()

Drop timestamp column since it seems irrelevent

In [7]:
df.drop('Timestamp',axis=1,inplace=True)

In [8]:
# renaming the columns inorder to make the header readable

In [9]:
df.columns=df.columns.str.replace("'","").str.replace(" ","")

In [10]:
df.rename(columns={'1.YourGender':'Gender','2.YourAge':'Age','3.Areyoucurrently....?':'occupation','4.Whatisyourannualincome?':'Income','5.HowoftendoyouvisitCoffeeHouse?':'Visit_Frequency','6.HowdoyouusuallyenjoyCoffeeHouse?':'Service_Type','7.Howmuchtimedoyounormallyspendduringyourvisit?':'Time_spend','8.ThenearestCoffeeHousesoutlettoyouis...?':'Distance','9.DoyouhaveCoffeeHousemembershipcard?':'Membership','10.WhatdoyoumostfrequentlypurchaseatCoffeeHouse?':'Product_consumed','11.Onaverage,howmuchwouldyouspendatCoffeeHousepervisit?':'Hours_spended_Per_Visit','12.HowwouldyouratethequalityofCoffeeHousecomparedtootherbrands(CoffeeBean,OldTownWhiteCoffee..)tobe:':'Quality_Rating','13.HowwouldyouratethepricerangeatCoffeeHouse?':'price_range_rating','14.Howimportantaresalesandpromotionsinyourpurchasedecision?':'sales','15.HowwouldyouratetheambianceatCoffeeHouse?(lighting,music,etc...)':'Ambiance_Rating','16.YouratetheWiFiqualityatCoffeeHouseas..':'Wifi_Rating','17.HowwouldyouratetheserviceatCoffeeHouse?(Promptness,friendliness,etc..)':'Service_Rating','18.HowlikelyyouwillchooseCoffeeHousefordoingbusinessmeetingsorhangoutwithfriends?':'Preference','19.HowdoyoucometohearofpromotionsatCoffeeHouse?Checkallthatapply.':'Promotion Source','20.WillyoucontinuebuyingCoffeeHouse?':'Future Purchases','Hours_spended':'Hours_spended_Per_Visit'},inplace=True)

In [11]:
df.columns

Index(['Gender', 'Age', 'occupation', 'Income', 'Visit_Frequency',
       'Service_Type', 'Time_spend', 'Distance', 'Membership',
       'Product_consumed', 'Hours_spended_Per_Visit', 'Quality_Rating',
       'price_range_rating', 'sales', 'Ambiance_Rating', 'Wifi_Rating',
       'Service_Rating', 'Preference', 'Promotion Source', 'Future Purchases'],
      dtype='object')

In [12]:
#checking for columns with null value
df.columns[df.isnull().any()]

Index(['Service_Type', 'Promotion Source'], dtype='object')

In [13]:
df[df['Service_Type'].isnull()]

Unnamed: 0,Gender,Age,occupation,Income,Visit_Frequency,Service_Type,Time_spend,Distance,Membership,Product_consumed,Hours_spended_Per_Visit,Quality_Rating,price_range_rating,sales,Ambiance_Rating,Wifi_Rating,Service_Rating,Preference,Promotion Source,Future Purchases
81,Male,From 20 to 29,Employed,"Less than RM25,000",Never,,Below 30 minutes,more than 3km,No,Never buy any,Zero,1,1,1,3,3,3,3,,No


In [14]:
df[df['Promotion Source'].isnull()]

Unnamed: 0,Gender,Age,occupation,Income,Visit_Frequency,Service_Type,Time_spend,Distance,Membership,Product_consumed,Hours_spended_Per_Visit,Quality_Rating,price_range_rating,sales,Ambiance_Rating,Wifi_Rating,Service_Rating,Preference,Promotion Source,Future Purchases
81,Male,From 20 to 29,Employed,"Less than RM25,000",Never,,Below 30 minutes,more than 3km,No,Never buy any,Zero,1,1,1,3,3,3,3,,No


In [15]:
desired_column_order=['Gender', 'Age', 'occupation', 'Income', 'Visit_Frequency',
       'Service_Type', 'Time_spend', 'Distance', 'Membership',
       'Product_consumed', 'Hours_spended_Per_Visit', 'Quality_Rating',
       'price_range_rating', 'sales', 'Ambiance_Rating', 'Wifi_Rating',
       'Service_Rating', 'Preference', 'Promotion Source', 'Future Purchases'
       ]

In [16]:
df=df[desired_column_order]

In [17]:
df.head()

Unnamed: 0,Gender,Age,occupation,Income,Visit_Frequency,Service_Type,Time_spend,Distance,Membership,Product_consumed,Hours_spended_Per_Visit,Quality_Rating,price_range_rating,sales,Ambiance_Rating,Wifi_Rating,Service_Rating,Preference,Promotion Source,Future Purchases
0,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Dine in,Between 30 minutes to 1 hour,within 1km,Yes,Coffee,Less than RM20,4,3,5,5,4,4,3,Starbucks Website/Apps;Social Media;Emails;Dea...,Yes
1,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Take away,Below 30 minutes,1km - 3km,Yes,Cold drinks;Pastries,Less than RM20,4,3,4,4,4,5,2,Social Media;In Store displays,Yes
2,Male,From 20 to 29,Employed,"Less than RM25,000",Monthly,Dine in,Between 30 minutes to 1 hour,more than 3km,Yes,Coffee,Less than RM20,4,3,4,4,4,4,3,In Store displays;Billboards,Yes
3,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Take away,Below 30 minutes,more than 3km,No,Coffee,Less than RM20,2,1,4,3,3,3,3,Through friends and word of mouth,No
4,Male,From 20 to 29,Student,"Less than RM25,000",Monthly,Take away,Between 30 minutes to 1 hour,1km - 3km,No,Coffee;Sandwiches,Around RM20 - RM40,3,3,4,2,2,3,3,Starbucks Website/Apps;Social Media,Yes


In [17]:
df['Service_Type'].unique()

array(['Dine in', 'Take away', 'Drive-thru', 'never', 'Never buy', nan,
       'I dont like coffee', 'Never'], dtype=object)

In [18]:
df['Service_Type'].replace(['never','Never','I dont like coffee','Never buy'],'Never buy',inplace=True)

In [19]:
df['Service_Type'].value_counts()

Take away     49
Dine in       46
Drive-thru    20
Never buy      6
Name: Service_Type, dtype: int64

Product_consumed

In [20]:
df['Product_consumed'].value_counts()

Coffee                                           65
Cold drinks                                      25
Coffee;Cold drinks                                6
Coffee;Pastries                                   6
Coffee;Sandwiches                                 3
Cold drinks;Juices;Pastries                       2
Pastries                                          2
Jaws chip                                         1
Nothing                                           1
never                                             1
Cold drinks;Never                                 1
cake                                              1
Coffee;Cold drinks;Pastries;Sandwiches            1
Never buy any                                     1
Never                                             1
Cold drinks;Pastries                              1
Coffee;Pastries;Sandwiches                        1
Coffee;Juices;Pastries;Sandwiches                 1
Cold drinks;Pastries;Sandwiches                   1
Coffee;Cold 

In [21]:
cv=CountVectorizer(tokenizer=lambda x:x.split(';'))
product=cv.fit_transform(df['Product_consumed'])
print(cv.get_feature_names())
consumed_product=pd.DataFrame(product.toarray(),columns=cv.get_feature_names())

['cake', 'coffee', 'cold drinks', 'jaws chip', 'juices', 'never', 'never buy any', 'nothing', 'pastries', 'sandwiches']


In [22]:
consumed_product=pd.DataFrame(product.toarray(),columns=cv.get_feature_names())

In [23]:
consumed_product

Unnamed: 0,cake,coffee,cold drinks,jaws chip,juices,never,never buy any,nothing,pastries,sandwiches
0,0,1,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
117,0,1,0,0,0,0,0,0,0,0
118,0,1,1,0,1,0,0,0,1,1
119,0,1,1,0,0,0,0,0,0,0
120,0,1,0,0,0,0,0,0,0,0


In [24]:
consumed_product['Product Never buy']=consumed_product.iloc[:,-6:-3].sum(axis=1)
consumed_product.drop(['never','never buy any','nothing'],inplace=True,axis=1)

In [25]:
df=pd.concat([df,consumed_product],axis=1)

In [26]:
df.drop('Product_consumed',inplace=True,axis=1)

Promo

In [27]:
df['Promotion Source'].value_counts()

Social Media                                                                                                                                   31
Social Media;Through friends and word of mouth                                                                                                 14
Through friends and word of mouth                                                                                                               9
Starbucks Website/Apps;Social Media                                                                                                             9
In Store displays                                                                                                                               7
Starbucks Website/Apps;Social Media;Emails;Through friends and word of mouth                                                                    6
Starbucks Website/Apps;Social Media;Through friends and word of mouth                                                       

In [28]:
df['Promotion Source'].fillna('Social Media', inplace = True)

In [29]:
cv=CountVectorizer(tokenizer=lambda x:x.split(';'))
promo=cv.fit_transform(df['Promotion Source'])
print(cv.get_feature_names())

['application offer', 'billboards', 'deal sites (fave, iprice, etc...)', 'emails', 'in store displays', 'never hear', 'social media', 'starbucks website/apps', 'through friends and word of mouth']


In [30]:
promo=pd.DataFrame(promo.toarray(),columns=cv.get_feature_names())


In [31]:
promo.rename({'application offer':'promo_application','billboards':'promo_billboards','deal sites (fave, iprice, etc...)':'promo_dealsites','emails':'promo_emails','in store displays':'promo_displays','never hear':'promo_never_heard','social media':'promo_social_media','starbucks website/apps':'promo_website_apps','through friends and word of mouth':'promo_word_of_mouth'},inplace=True,axis=1)

In [32]:
promo

Unnamed: 0,promo_application,promo_billboards,promo_dealsites,promo_emails,promo_displays,promo_never_heard,promo_social_media,promo_website_apps,promo_word_of_mouth
0,0,0,1,1,0,0,1,1,0
1,0,0,0,0,1,0,1,0,0
2,0,1,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...
117,0,0,0,0,0,0,1,1,0
118,0,1,1,1,1,0,1,1,1
119,0,0,0,0,1,0,1,0,1
120,0,1,0,0,0,0,1,0,1


In [33]:
df=pd.concat([df,promo],axis=1)

In [34]:
df.drop('Promotion Source',inplace=True,axis=1)

In [35]:
df

Unnamed: 0,Gender,Age,occupation,Income,Visit_Frequency,Service_Type,Time_spend,Distance,Membership,Hours_spended_Per_Visit,Quality_Rating,price_range_rating,sales,Ambiance_Rating,Wifi_Rating,Service_Rating,Preference,Future Purchases,cake,coffee,cold drinks,jaws chip,juices,pastries,sandwiches,Product Never buy,promo_application,promo_billboards,promo_dealsites,promo_emails,promo_displays,promo_never_heard,promo_social_media,promo_website_apps,promo_word_of_mouth
0,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Dine in,Between 30 minutes to 1 hour,within 1km,Yes,Less than RM20,4,3,5,5,4,4,3,Yes,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0
1,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Take away,Below 30 minutes,1km - 3km,Yes,Less than RM20,4,3,4,4,4,5,2,Yes,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0
2,Male,From 20 to 29,Employed,"Less than RM25,000",Monthly,Dine in,Between 30 minutes to 1 hour,more than 3km,Yes,Less than RM20,4,3,4,4,4,4,3,Yes,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
3,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Take away,Below 30 minutes,more than 3km,No,Less than RM20,2,1,4,3,3,3,3,No,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,Male,From 20 to 29,Student,"Less than RM25,000",Monthly,Take away,Between 30 minutes to 1 hour,1km - 3km,No,Around RM20 - RM40,3,3,4,2,2,3,3,Yes,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,Male,40 and above,Self-employed,"RM25,000 - RM50,000",Monthly,Dine in,Between 1 hour to 2 hours,1km - 3km,Yes,Around RM20 - RM40,3,3,5,3,2,4,4,Yes,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
118,Male,From 20 to 29,Employed,"Less than RM25,000",Monthly,Dine in,Between 1 hour to 2 hours,1km - 3km,Yes,More than RM40,5,5,5,5,5,5,5,Yes,0,1,1,0,1,1,1,1,0,1,1,1,1,0,1,1,1
119,Male,From 20 to 29,Student,"Less than RM25,000",Rarely,Dine in,Between 30 minutes to 1 hour,1km - 3km,No,Less than RM20,3,2,4,3,3,3,4,No,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1
120,Female,From 20 to 29,Employed,"Less than RM25,000",Rarely,Take away,Below 30 minutes,within 1km,No,Less than RM20,4,4,4,4,4,4,4,Yes,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1


# Preprocessing

In [36]:
mode_onehot_pipe = Pipeline([
    ('encoder', SimpleImputer(strategy = 'most_frequent')),
    ('one hot', OneHotEncoder(handle_unknown = 'ignore'))])

transformer = ColumnTransformer([
    ('one hot encoder', OneHotEncoder(handle_unknown = 'ignore'), ['Gender','Age','occupation','Income','Visit_Frequency','Service_Type','Time_spend','Distance','Membership','Hours_spended_Per_Visit']),
    ('mode_onehot_pipe', mode_onehot_pipe, ['Service_Type']),
], remainder = 'passthrough')

In [38]:
df

Unnamed: 0,Gender,Age,occupation,Income,Visit_Frequency,Service_Type,Time_spend,Distance,Membership,Hours_spended_Per_Visit,Quality_Rating,price_range_rating,sales,Ambiance_Rating,Wifi_Rating,Service_Rating,Preference,Future Purchases,cake,coffee,cold drinks,jaws chip,juices,pastries,sandwiches,Product Never buy,promo_application,promo_billboards,promo_dealsites,promo_emails,promo_displays,promo_never_heard,promo_social_media,promo_website_apps,promo_word_of_mouth
0,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Dine in,Between 30 minutes to 1 hour,within 1km,Yes,Less than RM20,4,3,5,5,4,4,3,Yes,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0
1,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Take away,Below 30 minutes,1km - 3km,Yes,Less than RM20,4,3,4,4,4,5,2,Yes,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0
2,Male,From 20 to 29,Employed,"Less than RM25,000",Monthly,Dine in,Between 30 minutes to 1 hour,more than 3km,Yes,Less than RM20,4,3,4,4,4,4,3,Yes,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
3,Female,From 20 to 29,Student,"Less than RM25,000",Rarely,Take away,Below 30 minutes,more than 3km,No,Less than RM20,2,1,4,3,3,3,3,No,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,Male,From 20 to 29,Student,"Less than RM25,000",Monthly,Take away,Between 30 minutes to 1 hour,1km - 3km,No,Around RM20 - RM40,3,3,4,2,2,3,3,Yes,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,Male,40 and above,Self-employed,"RM25,000 - RM50,000",Monthly,Dine in,Between 1 hour to 2 hours,1km - 3km,Yes,Around RM20 - RM40,3,3,5,3,2,4,4,Yes,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
118,Male,From 20 to 29,Employed,"Less than RM25,000",Monthly,Dine in,Between 1 hour to 2 hours,1km - 3km,Yes,More than RM40,5,5,5,5,5,5,5,Yes,0,1,1,0,1,1,1,1,0,1,1,1,1,0,1,1,1
119,Male,From 20 to 29,Student,"Less than RM25,000",Rarely,Dine in,Between 30 minutes to 1 hour,1km - 3km,No,Less than RM20,3,2,4,3,3,3,4,No,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1
120,Female,From 20 to 29,Employed,"Less than RM25,000",Rarely,Take away,Below 30 minutes,within 1km,No,Less than RM20,4,4,4,4,4,4,4,Yes,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1


In [39]:
df['Future Purchases']=df['Future Purchases'].astype(str)

In [40]:
df['Future Purchases']

0      Yes
1      Yes
2      Yes
3       No
4      Yes
      ... 
117    Yes
118    Yes
119     No
120    Yes
121     No
Name: Future Purchases, Length: 122, dtype: object

In [41]:
df['Future Purchases']=np.where(df['Future Purchases'] == 'Yes', 1, 0)
y=df['Future Purchases']

In [42]:
df['Future Purchases'].value_counts()

1    94
0    28
Name: Future Purchases, dtype: int64

In [43]:
x=df.drop('Future Purchases',axis=1)


In [44]:
x.shape

(122, 34)

In [45]:
y.shape

(122,)

In [46]:
X_train,X_test,y_train,y_test=train_test_split(x,y, stratify = y,test_size=0.3,random_state=3434)

In [47]:
x.columns

Index(['Gender', 'Age', 'occupation', 'Income', 'Visit_Frequency',
       'Service_Type', 'Time_spend', 'Distance', 'Membership',
       'Hours_spended_Per_Visit', 'Quality_Rating', 'price_range_rating',
       'sales', 'Ambiance_Rating', 'Wifi_Rating', 'Service_Rating',
       'Preference', 'cake', 'coffee', 'cold drinks', 'jaws chip', 'juices',
       'pastries', 'sandwiches', 'Product Never buy', 'promo_application',
       'promo_billboards', 'promo_dealsites', 'promo_emails', 'promo_displays',
       'promo_never_heard', 'promo_social_media', 'promo_website_apps',
       'promo_word_of_mouth'],
      dtype='object')

In [48]:
log = LogisticRegression(random_state = 3434)
tree = DecisionTreeClassifier(random_state = 3434)
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state =3434)
svc = LinearSVC(random_state =3434)
ada = AdaBoostClassifier(random_state =3434)
grad = GradientBoostingClassifier(random_state =3434)
xgb = XGBClassifier(verbosity = 0, random_state =3434)

In [49]:
logreg_pipe = Pipeline([('transformer', transformer), ('log', log)])
tree_pipe = Pipeline([('transformer', transformer), ('tree', tree)])
knn_pipe = Pipeline([('transformer', transformer), ('knn', knn)])
rf_pipe = Pipeline([('transformer', transformer), ('rf', rf)])
svc_pipe = Pipeline([('transformer', transformer), ('svc', svc)])
ada_pipe = Pipeline([('transformer', transformer), ('ada', ada)])
grad_pipe = Pipeline([('transformer', transformer), ('grad', grad)])
xgb_pipe = Pipeline([('transformer', transformer), ('xgb', xgb)])


In [50]:
for model in [logreg_pipe, tree_pipe, knn_pipe, rf_pipe, svc_pipe, ada_pipe, grad_pipe, xgb_pipe]:
    model.fit(X_train,y_train)

score_acc = [accuracy_score(y_test, logreg_pipe.predict(X_test)),
             accuracy_score(y_test, tree_pipe.predict(X_test)),
             accuracy_score(y_test, knn_pipe.predict(X_test)),
             accuracy_score(y_test, rf_pipe.predict(X_test)),
             accuracy_score(y_test, svc_pipe.predict(X_test)),
             accuracy_score(y_test, ada_pipe.predict(X_test)),
             accuracy_score(y_test, grad_pipe.predict(X_test)),
             accuracy_score(y_test, xgb_pipe.predict(X_test))]

method_name = ['Logistic Regression', 'Decision Tree Classifier', 'KNN Classifier', 'Random Forest Classifier', 'LinearSVC', 'AdaBoost Classifier', 'Gradient Boosting Classifier', 'XGB Classifier']

acc_summary = pd.DataFrame({'method': method_name, 'accuracy score': score_acc})


In [51]:
acc_summary 

Unnamed: 0,method,accuracy score
0,Logistic Regression,0.864865
1,Decision Tree Classifier,0.756757
2,KNN Classifier,0.810811
3,Random Forest Classifier,0.783784
4,LinearSVC,0.756757
5,AdaBoost Classifier,0.891892
6,Gradient Boosting Classifier,0.810811
7,XGB Classifier,0.783784
