<h1 style="color: green;">Deployment pipeline</h1>

In [53]:
import pandas as pd
import numpy as np

import sqlalchemy

from datetime import datetime


import joblib

# for feature engineering
from feature_engine.discretisation import EqualWidthDiscretiser
from feature_engine.discretisation import ArbitraryDiscretiser
from feature_engine.encoding import RareLabelEncoder
from feature_engine.outliers import ArbitraryOutlierCapper

# for Weight of evidence
from feature_engine.encoding import WoEEncoder
from category_encoders.woe import WOEEncoder

# Scaling the values
from sklearn.preprocessing import binarize

import os

# Loading environment variables
from dotenv import load_dotenv
load_dotenv()

import warnings
warnings.filterwarnings('ignore')

# display all columns
pd.set_option('display.max_columns', None)

# Assemble the database URL from environment variables (loaded via load_dotenv above).
# NOTE(review): generic names such as `username` or `host` may already exist in the
# OS environment — confirm the .env values are the ones actually picked up.
_db_keys = ('drivername', 'username', 'password', 'host', 'port', 'database')
_db_settings = {key: os.getenv(key) for key in _db_keys}
sqlUrl = sqlalchemy.engine.URL.create(**_db_settings)

# SQLAlchemy engine built from that URL
engine = sqlalchemy.create_engine(sqlUrl)


In [2]:
# Pull the receipts of the last 7 days together with the venue name and the
# line items (left joins: one output row per item of each receipt).
#
# The cut-off uses INTERVAL arithmetic. The previous `curdate() -7` is evaluated
# by MySQL as integer arithmetic on YYYYMMDD (e.g. 20230603 - 7 = 20230596, which
# is not a valid date), so the filter misbehaves near month boundaries.
# NOTE(review): assumes a MySQL/MariaDB backend, as implied by curdate().
sql_str = """select a.Receipt_id,
                a.Venue_id,
                a.Total_Nbr_of_Items,
                a.Total_Price,
                a.Receipt_Date,
                b.Venue,
                c.Item_name,
                c.Item_Price

                from hs.receipt as a left join
                hs.venue_details as b
                on a.Venue_id = b.Venue_id

                left join hs.item as c
                on a.Receipt_id = c.Receipt_id

                where a.Receipt_date > curdate() - interval 7 day;"""

last_7_days = pd.read_sql_query(sql_str,engine)

In [3]:
# Venue_id served only as the join key; the venue name column is kept instead
last_7_days = last_7_days.drop(columns=['Venue_id'])

# Parse Receipt_Date into datetimes so the date arithmetic further down works
last_7_days['Receipt_Date'] = pd.to_datetime(last_7_days['Receipt_Date'], format="%Y-%m-%d")

# insert new shopping list data here

In [4]:
# Guest receipt to be scored. Its id continues from the newest receipt already loaded.
# NOTE(review): if the query returned no rows, max() is NaN and so is this id —
# confirm that case cannot occur.
receipt_id_g = last_7_days.Receipt_id.max() + 1
venue_g = 'Asda'
Receipt_Date_g = pd.to_datetime('2023-06-11', format="%Y-%m-%d")

# Receipt-level totals
Total_Nbr_of_Items_g = 11
Total_Price_g = 15.44

# (item name, item price) pairs; repeated purchases appear once per unit
_guest_items = [
    ('Whole chicken', 4.98),
    ('Garlic pouch', 2.79),
    ('Chopped tomatoes', 0.32),
    ('Chopped tomatoes', 0.32),
    ('Chopped tomatoes', 0.32),
    ('Lemons', 0.79),
    ('Limes', 0.79),
    ('Vimto', 2.50),
    ('Red onions', 0.67),
    ('Peppers mixed', 0.97),
    ('Colgate', 0.99),
]
Item_name_g = [item_name for item_name, _item_price in _guest_items]
Item_Price_g = [item_price for _item_name, item_price in _guest_items]

In [5]:
# Append the guest receipt to the loaded data, one row per purchased item.
# All rows are built in a single frame and concatenated once, instead of
# re-allocating the whole frame for every item.
if len(Item_name_g) != len(Item_Price_g):
    # A mismatch would pair items with the wrong price or silently drop prices
    raise ValueError(
        "Item_name_g and Item_Price_g must have the same length "
        "({} names vs {} prices)".format(len(Item_name_g), len(Item_Price_g))
    )

if len(Item_name_g) > 0:
    guest_rows = pd.DataFrame(
        {
            # receipt-level scalars are broadcast to every item row
            'Receipt_id': receipt_id_g,
            'Total_Nbr_of_Items': Total_Nbr_of_Items_g,
            'Total_Price': Total_Price_g,
            'Receipt_Date': Receipt_Date_g,
            'Venue': venue_g,
            # item-level values
            'Item_name': list(Item_name_g),
            'Item_Price': list(Item_Price_g),
        },
        # same column set and order as the data pulled from the database
        columns=last_7_days.columns,
    )
    last_7_days = pd.concat([last_7_days, guest_rows], ignore_index=True)

# continue on with deriving features

In [6]:
# Receipt-level view: keep only receipt columns and the first row of each receipt
_receipt_cols = ['Receipt_id', 'Receipt_Date', 'Total_Price', 'Total_Nbr_of_Items']
last_7_days_date_diff = last_7_days[_receipt_cols].drop_duplicates(subset=['Receipt_id'])

In [7]:
# Order receipts chronologically so each one can be compared with its predecessor
last_7_days_date_diff = last_7_days_date_diff.sort_values('Receipt_Date', ascending=True)

# Whole days since the previous receipt; the earliest receipt has none, so it gets 0
_previous_date = last_7_days_date_diff['Receipt_Date'].shift()
_gap = last_7_days_date_diff['Receipt_Date'] - _previous_date
last_7_days_date_diff['Date_diff'] = _gap.dt.days.fillna(0)
last_7_days_date_diff

Unnamed: 0,Receipt_id,Receipt_Date,Total_Price,Total_Nbr_of_Items,Date_diff
0,925,2023-06-07,35.99,1,0.0
1,926,2023-06-07,14.49,1,0.0
2,927,2023-06-08,1.4,1,1.0
3,928,2023-06-09,1.8,1,1.0
4,929,2023-06-10,3.35,2,1.0
6,930,2023-06-11,15.44,11,1.0


In [8]:
# Re-assert the datetime dtype; the .dt accessor below needs it
last_7_days_date_diff['Receipt_Date'] = pd.to_datetime(
    last_7_days_date_diff['Receipt_Date'], format='%Y-%m-%d'
)

# week_of_year key in the form "<ISO year>_<ISO week>"
_iso = last_7_days_date_diff['Receipt_Date'].dt.isocalendar()
last_7_days_date_diff['week_of_year'] = _iso['year'].map(str) + "_" + _iso['week'].map(str)
last_7_days_date_diff.head()

Unnamed: 0,Receipt_id,Receipt_Date,Total_Price,Total_Nbr_of_Items,Date_diff,week_of_year
0,925,2023-06-07,35.99,1,0.0,2023_23
1,926,2023-06-07,14.49,1,0.0,2023_23
2,927,2023-06-08,1.4,1,1.0,2023_23
3,928,2023-06-09,1.8,1,1.0,2023_23
4,929,2023-06-10,3.35,2,1.0,2023_23


In [9]:

# Weekly totals, broadcast back to every receipt of that week
_by_week = last_7_days_date_diff.groupby(['week_of_year'])
_weekly_items = _by_week['Total_Nbr_of_Items'].transform('sum')
_weekly_spend = _by_week['Total_Price'].transform('sum')

last_7_days_date_diff['Nbr_items_per_wk'] = _weekly_items
last_7_days_date_diff['Expenditure_per_wk'] = _weekly_spend

# Receipt total as a share of the week's spend (a 0-1 fraction, not multiplied by 100)
last_7_days_date_diff['Total_Exp_wk_perc'] = (
    last_7_days_date_diff['Total_Price'] / last_7_days_date_diff['Expenditure_per_wk']
)


In [10]:
# Inspect the receipt-level frame, now including the weekly totals and share
last_7_days_date_diff

Unnamed: 0,Receipt_id,Receipt_Date,Total_Price,Total_Nbr_of_Items,Date_diff,week_of_year,Nbr_items_per_wk,Expenditure_per_wk,Total_Exp_wk_perc
0,925,2023-06-07,35.99,1,0.0,2023_23,17,72.47,0.496619
1,926,2023-06-07,14.49,1,0.0,2023_23,17,72.47,0.199945
2,927,2023-06-08,1.4,1,1.0,2023_23,17,72.47,0.019318
3,928,2023-06-09,1.8,1,1.0,2023_23,17,72.47,0.024838
4,929,2023-06-10,3.35,2,1.0,2023_23,17,72.47,0.046226
6,930,2023-06-11,15.44,11,1.0,2023_23,17,72.47,0.213054


In [11]:
# Attach the receipt-level derived columns to every item row of the same receipt
_receipt_features = last_7_days_date_diff[['Receipt_id','Date_diff','week_of_year',
                                           'Expenditure_per_wk','Total_Exp_wk_perc',
                                           'Nbr_items_per_wk']]
last_7_days = last_7_days.merge(_receipt_features, on='Receipt_id', how='left')

In [12]:
# Preview the item-level rows with the receipt-level columns merged in
last_7_days.head()

Unnamed: 0,Receipt_id,Total_Nbr_of_Items,Total_Price,Receipt_Date,Venue,Item_name,Item_Price,Date_diff,week_of_year,Expenditure_per_wk,Total_Exp_wk_perc,Nbr_items_per_wk
0,925,1,35.99,2023-06-07,Amazon,Camping backpack 80l,35.99,0.0,2023_23,72.47,0.496619,17
1,926,1,14.49,2023-06-07,Amazon,Sleeping bag,14.49,0.0,2023_23,72.47,0.199945,17
2,927,1,1.4,2023-06-08,Post office,Item return stamp charge,1.4,1.0,2023_23,72.47,0.019318,17
3,928,1,1.8,2023-06-09,Tesco,Orange and mango juice 1l,1.8,1.0,2023_23,72.47,0.024838,17
4,929,2,3.35,2023-06-10,Asda,Vimto,2.5,1.0,2023_23,72.47,0.046226,17


In [13]:
# Bread indicator: keyword present in the item name and no exclusion present
nots = ['garlic','ham','garlic','ham']
ins = ['bloomer','bread']

def _is_bread(item_name):
    """Return 1 for a bread item, else 0 (case-insensitive substring match)."""
    lowered = item_name.lower()
    if any(term.lower() in lowered for term in nots):
        return 0
    return int(any(term.lower() in lowered for term in ins))

last_7_days['Bread'] = last_7_days.Item_name.apply(_is_bread)

In [14]:
# Cooked meats indicator.
# Matching is by substring, so the exclusions catch names that merely contain a
# keyword (e.g. 'ham' inside 'shampoo' or 'champagne').
not_cook = ['glass','shampoo','water','conditioner','champagne']
cooked_meats = ['chicken pasty slices twin pack','steak and kidney pasty',
                'chicken cooked','cooked chicken','roast chicken thighs',
                'mackerel','salmon','pork pies classic','ham',
                'sardines','sausages cocktail','spicy chorizo sausages',
                'sausages rolls','sausage rolls','salami','meatballs']

def _is_cooked_meat(item_name):
    """Return 1 for a cooked-meat item, else 0 (case-insensitive substring match)."""
    lowered = item_name.lower()
    wanted = any(term.lower() in lowered for term in cooked_meats)
    blocked = any(term.lower() in lowered for term in not_cook)
    return 1 if wanted and not blocked else 0

last_7_days['Cooked_meats'] = last_7_days.Item_name.apply(_is_cooked_meat)

In [15]:
# Raw meats indicator: a meat keyword without any prepared-food exclusion
not_raw = ['pasty','cooked','roast','seasoning',
           'southern','fried','meal','piece',
           'box','bake','szechuan','pies',
           'mushroom','pie','salami','rolls','cocktails',
           'chips','chorizo']
raw_meats = ['bacon','chicken','lamb','gammon','sausages',
             'sausage','pork','fish','beef','eggs']

def _is_raw_meat(item_name):
    """Return 1 for a raw-meat item, else 0 (case-insensitive substring match)."""
    lowered = item_name.lower()
    if any(term.lower() in lowered for term in not_raw):
        return 0
    return int(any(term.lower() in lowered for term in raw_meats))

last_7_days['Raw_meats'] = last_7_days.Item_name.apply(_is_raw_meat)


In [16]:
# Snack indicator
not_snack = ['diesel','james']
snacks = ['snickers','digestive','digestives',
          'chocolate','yogurt','cake','cakes',
          'snack','nuts','donuts','doughnut',
          'mikati','fudge','maltesers','twix','marmalade',
          'jam','custard']

def _is_snack(item_name):
    """Return 1 for a snack item, else 0 (case-insensitive substring match)."""
    lowered = item_name.lower()
    wanted = any(term.lower() in lowered for term in snacks)
    blocked = any(term.lower() in lowered for term in not_snack)
    return 1 if wanted and not blocked else 0

last_7_days['Snacks'] = last_7_days.Item_name.apply(_is_snack)

In [17]:
# Drinks indicator (alcoholic and non-alcoholic).
# An item is flagged when its name contains any keyword in `drinks` and none of
# the exclusions in `not_drink`; matching is a case-insensitive substring test.

not_drink = ['diesel','glass','socks','fan','heater',
             'beef','source','ironing','plaster','ham','lockets']

# Fix: a comma was missing after 'courvoisier', so Python concatenated it with
# 'wine' into the single keyword 'courvoisierwine' and neither word could match.
# NOTE(review): confirm the training notebook uses the same corrected list so the
# features fed to the stored model stay aligned with how it was trained.
drinks = ['juice','vimto','ribena','squash','tropical','liquer',
          'dr pepper','coke','alcohol','beer','rubicon','courvoisier',
          'wine','irish','port','rum','original','smoothies','water',
          'honey','cordial','whiskey','whisky']

last_7_days['Drinks'] = last_7_days.Item_name.apply(lambda sentence: 1 if any(word.lower() in sentence.lower()
                                                                for word in drinks) 
                                      and not any(word.lower() in sentence.lower() 
                                                  for word in not_drink)
                                     else 0)

In [18]:
# Vegetables indicator
not_veg = ['seed','bread','fried','black','dr','lisbon']
vegetables = ['cabbage','carrots','parsnip','greens','garlic','ginger',
              'tomatoes','onions','chillies','ngai ngai','leaf',
              'leaves','mushrooms','spinach','coriander','parsley',
              'broccoli','pumpkin','peas','peppers','cucumber','leeks',
             'brussel sprouts','mint','asparagus','beans','Soup']

def _is_vegetable(item_name):
    """Return 1 for a vegetable item, else 0 (case-insensitive substring match)."""
    lowered = item_name.lower()
    if any(term.lower() in lowered for term in not_veg):
        return 0
    return int(any(term.lower() in lowered for term in vegetables))

last_7_days['Vegetables'] = last_7_days.Item_name.apply(_is_vegetable)

In [19]:
# Fruit indicator; the exclusions contain drink-style keywords such as 'juice' and 'squash'
not_fruit = ['juice','rubicon','original','smoothies','yogurt','cordial',
             'ribena','squash','volvic','water','lockets','bucket']
fruit = ['olives','apples','mango','grape','grapes','bananas',
          'lime','lemon','strawberries','oranges']

def _is_fruit(item_name):
    """Return 1 for a fruit item, else 0 (case-insensitive substring match)."""
    lowered = item_name.lower()
    wanted = any(term.lower() in lowered for term in fruit)
    blocked = any(term.lower() in lowered for term in not_fruit)
    return 1 if wanted and not blocked else 0

last_7_days['Fruit'] = last_7_days.Item_name.apply(_is_fruit)

In [20]:
# Cooking base indicator (staples such as pasta, rice, flour, potatoes)
not_base = ['fried']
cooking_base = ['pasta','spaghetti','rice','flour','potatoe','potatoes','potato']

def _is_cooking_base(item_name):
    """Return 1 for a cooking-base item, else 0 (case-insensitive substring match)."""
    lowered = item_name.lower()
    if any(term.lower() in lowered for term in not_base):
        return 0
    return int(any(term.lower() in lowered for term in cooking_base))

last_7_days['Cooking_base'] = last_7_days.Item_name.apply(_is_cooking_base)

In [21]:
# Dairy produce indicator (no exclusion list for this category)
dairy_produce = ['cheese','brilliantly','butter','butterlicious','spread','margarine']

def _is_dairy(item_name):
    """Return 1 for a dairy item, else 0 (case-insensitive substring match)."""
    lowered = item_name.lower()
    return int(any(term.lower() in lowered for term in dairy_produce))

last_7_days['Dairy_produce'] = last_7_days.Item_name.apply(_is_dairy)

In [22]:
# Seasoning indicator (no exclusion list for this category)
seasoning = ['black pepper','salt','seasoning','spice','cinnamon','paprika']

def _is_seasoning(item_name):
    """Return 1 for a seasoning item, else 0 (case-insensitive substring match)."""
    lowered = item_name.lower()
    return int(any(term.lower() in lowered for term in seasoning))

last_7_days['Seasoning'] = last_7_days.Item_name.apply(_is_seasoning)

In [23]:
# Breakfast food indicator (no exclusion list for this category)
breakfast = ['granola','muesli','sultanas','porridge']

def _is_breakfast(item_name):
    """Return 1 for a breakfast item, else 0 (case-insensitive substring match)."""
    lowered = item_name.lower()
    return int(any(term.lower() in lowered for term in breakfast))

last_7_days['Breakfast'] = last_7_days.Item_name.apply(_is_breakfast)

In [24]:
# Education indicator (courses, books, stationery)
not_edu = ['clevo']
education = ['king richard williams','linkedin','mysql','financial','python',
             'bootcamp','web server','linux','apache','sas','pencils',
             'eraser','whs 15cm ruler','bic pen','a4','binders','feature', 
             'engineering','full stack', 'optimization','machine learning',
             'tensor flow','pytorch','statistics','hadoop','sas', 'regression',
             'bootcamp','javascript','research methodology','quantitative']

def _is_education(item_name):
    """Return 1 for an education item, else 0 (case-insensitive substring match)."""
    lowered = item_name.lower()
    wanted = any(term.lower() in lowered for term in education)
    blocked = any(term.lower() in lowered for term in not_edu)
    return 1 if wanted and not blocked else 0

last_7_days['Education'] = last_7_days.Item_name.apply(_is_education)

In [25]:
# Cosmetics and self-care indicator

not_cosmetic = ['sony']
cosmetics_and_selfcare = ['shampoo','shower','tooth','colgate','wisdom','nivea',
                          'razor','body','blades','aqueous','shave','african',
                          'perfume','brut','roll on','Roll-on','bettina bath']

def _is_cosmetic(item_name):
    """Return 1 for a cosmetics/self-care item, else 0 (case-insensitive substring match)."""
    lowered = item_name.lower()
    if any(term.lower() in lowered for term in not_cosmetic):
        return 0
    return int(any(term.lower() in lowered for term in cosmetics_and_selfcare))

last_7_days['Cosmetics_and_selfcare'] = last_7_days.Item_name.apply(_is_cosmetic)

In [26]:
# House and kitchen indicator
not_house = ['cake','bonnlo']
house_and_kitchen = ['fairy liquid','measure jug','fitted bed sheet','glass',
                     'turner (spatula)','rolling pin','fairy liquid','orange citrus',
                     'turkey baster','dish drainer','extension lead','grater','power spray',
                     'liquid','roaster and rack','kitchen roller','salad tongs',
                     'strainer 12cm','arial pods','metal scourer','bathmat','curtain hooks',
                     'ant killer spray','scouring pads','sponge','surf','foil','plaster',
                     'knife sharpener','electric hand mixer','athena cotton wool','mop',
                     'ofargo']

def _is_house_and_kitchen(item_name):
    """Return 1 for a house/kitchen item, else 0 (case-insensitive substring match)."""
    lowered = item_name.lower()
    wanted = any(term.lower() in lowered for term in house_and_kitchen)
    blocked = any(term.lower() in lowered for term in not_house)
    return 1 if wanted and not blocked else 0

last_7_days['House_and_kitchen'] = last_7_days.Item_name.apply(_is_house_and_kitchen)

# Derive the remaining weekly features below

In [27]:
# Columns handed to the transformers and the model, in this order
features = [
    # receipt-level and weekly totals
    'Total_Nbr_of_Items', 'Venue', 'Date_diff',
    'Nbr_items_per_wk', 'Expenditure_per_wk', 'Total_Exp_wk_perc',
    # per-receipt category columns
    'Drinks', 'Vegetables', 'Cosmetics_and_selfcare', 'House_and_kitchen',
    # weekly category aggregates
    'Bread_wk', 'Cooked_meats_wk', 'Raw_meats_wk',
    'Snacks_wk', 'Snacks_exp_receipt', 'Snacks_exp_wk',
    'Drinks_wk', 'Drinks_exp_wk',
    'Vegetables_exp_wk',
    'Fruit_wk', 'Cooking_base_wk', 'Dairy_produce_wk', 'Seasoning_wk',
    'Breakfast_wk', 'Education_wk',
    'Cosmetics_and_selfcare_wk', 'Cosmetics_and_selfcare_wk_exp_perc',
    'House_and_kitchen_wk',
]

In [28]:
indicator_list = ['Bread', 'Cooked_meats', 'Raw_meats', 'Snacks', 'Drinks', 'Vegetables','Fruit',
                   'Cooking_base', 'Dairy_produce', 'Seasoning', 'Breakfast', 'Education', 
                   'Cosmetics_and_selfcare', 'House_and_kitchen'
                  ]

# For every 0/1 indicator derive count and spend aggregates per receipt and per week
for x in indicator_list:
    # Item counts per shopping trip (receipt) and per week
    receipt_count = last_7_days.groupby(['Receipt_id'])[x].transform('sum')
    week_count = last_7_days.groupby(['week_of_year'])[x].transform('sum')
    last_7_days[f"{x}_receipt"] = receipt_count
    last_7_days[f"{x}_wk"] = week_count

    # Receipt count as a share of the week's count
    last_7_days[f"{x}_wk_perc"] = receipt_count / week_count

    # Spend aggregates use only the rows flagged for this indicator; after index
    # alignment on assignment every other row is left as NaN
    flagged = last_7_days.query(f"{x}==1")
    last_7_days[f"{x}_exp_receipt"] = flagged.groupby(['Receipt_id', x])['Item_Price'].transform('sum')
    last_7_days[f"{x}_exp_wk"] = flagged.groupby(['week_of_year', x])['Item_Price'].transform('sum')

    # Receipt spend as a share of the week's spend for this indicator
    last_7_days[f"{x}_wk_exp_perc"] = last_7_days[f"{x}_exp_receipt"] / last_7_days[f"{x}_exp_wk"]

In [29]:
# list of features derived from item_name
list3 = ['Bread', 'Cooked_meats', 'Raw_meats', 'Snacks', 'Drinks', 'Vegetables','Fruit',
           'Cooking_base', 'Dairy_produce', 'Seasoning', 'Breakfast', 'Education', 
           'Cosmetics_and_selfcare', 'House_and_kitchen'
        ]

# Replace each 0/1 item flag with the receipt's total count for that category.
# The "<name>_receipt" columns already hold exactly that sum (computed from the
# 0/1 flags in the aggregate loop), so they are copied rather than summed again:
# re-summing in place made this cell non-idempotent, because running it a second
# time summed the sums.
for x in list3:
    last_7_days[x] = last_7_days["{}_receipt".format(x)]

In [30]:
# Preview the model feature columns; NaNs are still present at this stage
last_7_days[features].head()

Unnamed: 0,Total_Nbr_of_Items,Venue,Date_diff,Nbr_items_per_wk,Expenditure_per_wk,Total_Exp_wk_perc,Drinks,Vegetables,Cosmetics_and_selfcare,House_and_kitchen,Bread_wk,Cooked_meats_wk,Raw_meats_wk,Snacks_wk,Snacks_exp_receipt,Snacks_exp_wk,Drinks_wk,Drinks_exp_wk,Vegetables_exp_wk,Fruit_wk,Cooking_base_wk,Dairy_produce_wk,Seasoning_wk,Breakfast_wk,Education_wk,Cosmetics_and_selfcare_wk,Cosmetics_and_selfcare_wk_exp_perc,House_and_kitchen_wk
0,1,Amazon,0.0,17,72.47,0.496619,0,0,0,0,0,0,1,1,,,3,,,2,0,0,0,0,0,1,,0
1,1,Amazon,0.0,17,72.47,0.199945,0,0,0,0,0,0,1,1,,,3,,,2,0,0,0,0,0,1,,0
2,1,Post office,1.0,17,72.47,0.019318,0,0,0,0,0,0,1,1,,,3,,,2,0,0,0,0,0,1,,0
3,1,Tesco,1.0,17,72.47,0.024838,1,0,0,0,0,0,1,1,,,3,6.8,,2,0,0,0,0,0,1,,0
4,2,Asda,1.0,17,72.47,0.046226,1,0,0,0,0,0,1,1,,,3,6.8,,2,0,0,0,0,0,1,,0


In [31]:
# Weekly / spend features whose gaps are filled from the receipt's own maximum
fill_na_list = [
    'Bread_wk', 'Cooked_meats_wk', 'Raw_meats_wk',
    'Snacks_wk', 'Snacks_exp_receipt', 'Snacks_exp_wk',
    'Drinks_wk', 'Drinks_exp_wk',
    'Vegetables_exp_wk',
    'Fruit_wk', 'Cooking_base_wk', 'Dairy_produce_wk',
    'Seasoning_wk', 'Breakfast_wk', 'Education_wk',
    'Cosmetics_and_selfcare_wk', 'Cosmetics_and_selfcare_wk_exp_perc',
    'House_and_kitchen_wk',
]

In [32]:
# Fill missing values with the maximum observed on the same receipt.
# The spend aggregates are only populated on the item rows that carry the
# indicator, so the other rows of that receipt borrow the receipt's value.
#
# The per-receipt maximum is already aligned to the frame's index, so a plain
# fillna is sufficient. The previous groupby(...).apply(lambda x: ...) shadowed
# the loop variable and, on pandas >= 2.0, returns a group-keyed index that can
# no longer be assigned back to the frame.
for col in fill_na_list:
    receipt_max = last_7_days.groupby(['Receipt_id'])[col].transform('max')
    last_7_days[col] = last_7_days[col].fillna(receipt_max)

In [33]:
# Whatever is still missing (e.g. categories absent from a receipt) becomes 0
last_7_days = last_7_days.fillna(0)

In [34]:
# Collapse to one row per receipt, keeping the first item row of each
_repeat_rows = last_7_days.duplicated(subset=['Receipt_id'])
last_7_days = last_7_days[~_repeat_rows]

In [35]:
# Keep only the receipt with the highest id
_latest_id = last_7_days['Receipt_id'].max()
last_7_days = last_7_days[last_7_days['Receipt_id'] == _latest_id]
last_7_days

Unnamed: 0,Receipt_id,Total_Nbr_of_Items,Total_Price,Receipt_Date,Venue,Item_name,Item_Price,Date_diff,week_of_year,Expenditure_per_wk,Total_Exp_wk_perc,Nbr_items_per_wk,Bread,Cooked_meats,Raw_meats,Snacks,Drinks,Vegetables,Fruit,Cooking_base,Dairy_produce,Seasoning,Breakfast,Education,Cosmetics_and_selfcare,House_and_kitchen,Bread_receipt,Bread_wk,Bread_wk_perc,Bread_exp_receipt,Bread_exp_wk,Bread_wk_exp_perc,Cooked_meats_receipt,Cooked_meats_wk,Cooked_meats_wk_perc,Cooked_meats_exp_receipt,Cooked_meats_exp_wk,Cooked_meats_wk_exp_perc,Raw_meats_receipt,Raw_meats_wk,Raw_meats_wk_perc,Raw_meats_exp_receipt,Raw_meats_exp_wk,Raw_meats_wk_exp_perc,Snacks_receipt,Snacks_wk,Snacks_wk_perc,Snacks_exp_receipt,Snacks_exp_wk,Snacks_wk_exp_perc,Drinks_receipt,Drinks_wk,Drinks_wk_perc,Drinks_exp_receipt,Drinks_exp_wk,Drinks_wk_exp_perc,Vegetables_receipt,Vegetables_wk,Vegetables_wk_perc,Vegetables_exp_receipt,Vegetables_exp_wk,Vegetables_wk_exp_perc,Fruit_receipt,Fruit_wk,Fruit_wk_perc,Fruit_exp_receipt,Fruit_exp_wk,Fruit_wk_exp_perc,Cooking_base_receipt,Cooking_base_wk,Cooking_base_wk_perc,Cooking_base_exp_receipt,Cooking_base_exp_wk,Cooking_base_wk_exp_perc,Dairy_produce_receipt,Dairy_produce_wk,Dairy_produce_wk_perc,Dairy_produce_exp_receipt,Dairy_produce_exp_wk,Dairy_produce_wk_exp_perc,Seasoning_receipt,Seasoning_wk,Seasoning_wk_perc,Seasoning_exp_receipt,Seasoning_exp_wk,Seasoning_wk_exp_perc,Breakfast_receipt,Breakfast_wk,Breakfast_wk_perc,Breakfast_exp_receipt,Breakfast_exp_wk,Breakfast_wk_exp_perc,Education_receipt,Education_wk,Education_wk_perc,Education_exp_receipt,Education_exp_wk,Education_wk_exp_perc,Cosmetics_and_selfcare_receipt,Cosmetics_and_selfcare_wk,Cosmetics_and_selfcare_wk_perc,Cosmetics_and_selfcare_exp_receipt,Cosmetics_and_selfcare_exp_wk,Cosmetics_and_selfcare_wk_exp_perc,House_and_kitchen_receipt,House_and_kitchen_wk,House_and_kitchen_wk_perc,House_and_kitchen_exp_receipt,House_and_kitchen_exp_wk,House_and_kitchen_wk_exp_perc
6,930,11,15.44,2023-06-11,Asda,Whole chicken,4.98,1.0,2023_23,72.47,0.213054,17,0,0,1,0,1,6,2,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,1,1,1.0,4.98,4.98,1.0,0,1,0.0,0.0,0.0,0.0,1,3,0.333333,0.0,6.8,0.0,6,6,1.0,0.0,5.39,0.0,2,2,1.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,1,1,1.0,0.0,0.0,1.0,0,0,0.0,0.0,0.0,0.0


In [36]:
# Keep only the model's input columns, in the order given by `features`
last_7_days = last_7_days.loc[:, features]
last_7_days

Unnamed: 0,Total_Nbr_of_Items,Venue,Date_diff,Nbr_items_per_wk,Expenditure_per_wk,Total_Exp_wk_perc,Drinks,Vegetables,Cosmetics_and_selfcare,House_and_kitchen,Bread_wk,Cooked_meats_wk,Raw_meats_wk,Snacks_wk,Snacks_exp_receipt,Snacks_exp_wk,Drinks_wk,Drinks_exp_wk,Vegetables_exp_wk,Fruit_wk,Cooking_base_wk,Dairy_produce_wk,Seasoning_wk,Breakfast_wk,Education_wk,Cosmetics_and_selfcare_wk,Cosmetics_and_selfcare_wk_exp_perc,House_and_kitchen_wk
6,11,Asda,1.0,17,72.47,0.213054,1,6,1,0,0,0,1,1,0.0,0.0,3,6.8,5.39,2,0,0,0,0,0,1,1.0,0


# Load the data transformers

<h3 style="color: green;">Capping Expenditure_per_wk in preparation for the model</h3>

In [37]:
# Load the previously fitted capping transformer from disk.
# NOTE(review): joblib.load unpickles arbitrary objects — only load artefacts
# from a trusted location. The path is relative to the notebook's working directory.
capper = joblib.load("../Data transformers/capper_expenditure12062023")

In [38]:
# Apply the stored capper (presumably to Expenditure_per_wk, per the section
# heading — the fitted object's configuration is not visible here)
last_7_days = capper.transform(last_7_days)
last_7_days

Unnamed: 0,Total_Nbr_of_Items,Venue,Date_diff,Nbr_items_per_wk,Expenditure_per_wk,Total_Exp_wk_perc,Drinks,Vegetables,Cosmetics_and_selfcare,House_and_kitchen,Bread_wk,Cooked_meats_wk,Raw_meats_wk,Snacks_wk,Snacks_exp_receipt,Snacks_exp_wk,Drinks_wk,Drinks_exp_wk,Vegetables_exp_wk,Fruit_wk,Cooking_base_wk,Dairy_produce_wk,Seasoning_wk,Breakfast_wk,Education_wk,Cosmetics_and_selfcare_wk,Cosmetics_and_selfcare_wk_exp_perc,House_and_kitchen_wk
6,11,Asda,1.0,17,72.47,0.213054,1,6,1,0,0,0,1,1,0.0,0.0,3,6.8,5.39,2,0,0,0,0,0,1,1.0,0


<h3 style="color: green;">Binning Expenditure_per_wk</h3>

In [39]:
# Load the stored binning transformer and apply it to the frame.
# NOTE(review): presumably an equal-width discretiser for Expenditure_per_wk
# (per the file name and section heading) — confirm against the training notebook.
Expenditure_bin = joblib.load("../Data transformers/Equal_width_bin_expenditure12062023")
last_7_days = Expenditure_bin.transform(last_7_days)

<h3 style="color: green;">Binning high cardinality features</h3>

In [40]:
# Numeric features treated as high cardinality; the same list is re-typed to
# object before the rare-label encoding further down
high_card_num = [
    'Total_Nbr_of_Items', 'Nbr_items_per_wk', 'Expenditure_per_wk', 'Total_Exp_wk_perc',
    'Drinks', 'Vegetables', 'Cosmetics_and_selfcare',
    'Bread_wk', 'Cooked_meats_wk', 'Raw_meats_wk',
    'Snacks_wk', 'Snacks_exp_receipt', 'Snacks_exp_wk',
    'Drinks_wk', 'Drinks_exp_wk', 'Vegetables_exp_wk',
    'Fruit_wk', 'Cooking_base_wk', 'Dairy_produce_wk',
    'Breakfast_wk', 'Education_wk',
    'Cosmetics_and_selfcare_wk', 'Cosmetics_and_selfcare_wk_exp_perc',
    'House_and_kitchen_wk',
]

# Stored binning transformer for the high-cardinality features
high_card_bin = joblib.load("../Data transformers/Hig_cardinality_12062023")

# The transformer is given the whole frame, not just the listed columns
last_7_days = high_card_bin.transform(last_7_days)

last_7_days

Unnamed: 0,Total_Nbr_of_Items,Venue,Date_diff,Nbr_items_per_wk,Expenditure_per_wk,Total_Exp_wk_perc,Drinks,Vegetables,Cosmetics_and_selfcare,House_and_kitchen,Bread_wk,Cooked_meats_wk,Raw_meats_wk,Snacks_wk,Snacks_exp_receipt,Snacks_exp_wk,Drinks_wk,Drinks_exp_wk,Vegetables_exp_wk,Fruit_wk,Cooking_base_wk,Dairy_produce_wk,Seasoning_wk,Breakfast_wk,Education_wk,Cosmetics_and_selfcare_wk,Cosmetics_and_selfcare_wk_exp_perc,House_and_kitchen_wk
6,1,Asda,1.0,1,2,1,0,2,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,5,0


<h3 style="color: green;">Transforming rare labels for high cardinality</h3>

In [41]:
# Cast the binned columns to object dtype before handing them to the rare-label encoder
for var in high_card_num:
    last_7_days[var] = last_7_days[var].astype(object)

# Stored rare-label encoder for the high-cardinality features
rare_encoder_high_card = joblib.load("../Data transformers/Rare_enc_High_cardinality_12062023")

last_7_days = rare_encoder_high_card.transform(last_7_days)

In [42]:
# Inspect the frame after rare-label encoding of the high-cardinality features
last_7_days

Unnamed: 0,Total_Nbr_of_Items,Venue,Date_diff,Nbr_items_per_wk,Expenditure_per_wk,Total_Exp_wk_perc,Drinks,Vegetables,Cosmetics_and_selfcare,House_and_kitchen,Bread_wk,Cooked_meats_wk,Raw_meats_wk,Snacks_wk,Snacks_exp_receipt,Snacks_exp_wk,Drinks_wk,Drinks_exp_wk,Vegetables_exp_wk,Fruit_wk,Cooking_base_wk,Dairy_produce_wk,Seasoning_wk,Breakfast_wk,Education_wk,Cosmetics_and_selfcare_wk,Cosmetics_and_selfcare_wk_exp_perc,House_and_kitchen_wk
6,1,Asda,1.0,1,2,1,0,2,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,5,0


<h3 style="color: green;">Binning Date_diff and transforming</h3>

In [43]:
# Load the stored Date_diff transformer and apply it to the frame.
# NOTE(review): joblib.load unpickles arbitrary objects — trusted artefacts only.
Date_diff_trans = joblib.load("../Data transformers/Date_diff_transformer_12062023")

last_7_days = Date_diff_trans.transform(last_7_days)

last_7_days

Unnamed: 0,Total_Nbr_of_Items,Venue,Date_diff,Nbr_items_per_wk,Expenditure_per_wk,Total_Exp_wk_perc,Drinks,Vegetables,Cosmetics_and_selfcare,House_and_kitchen,Bread_wk,Cooked_meats_wk,Raw_meats_wk,Snacks_wk,Snacks_exp_receipt,Snacks_exp_wk,Drinks_wk,Drinks_exp_wk,Vegetables_exp_wk,Fruit_wk,Cooking_base_wk,Dairy_produce_wk,Seasoning_wk,Breakfast_wk,Education_wk,Cosmetics_and_selfcare_wk,Cosmetics_and_selfcare_wk_exp_perc,House_and_kitchen_wk
6,1,Asda,2,1,2,1,0,2,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,5,0


<h3 style="color: green;">Transforming low cardinality rare labels</h3>

In [44]:
# Low-cardinality numeric features that also go through rare-label encoding
low_card_num = ['House_and_kitchen', 'Seasoning_wk']

# Cast to object dtype before handing them to the rare-label encoder
for var in low_card_num:
    last_7_days[var] = last_7_days[var].astype(object)

# Stored rare-label encoder for the low-cardinality features
low_card_enc =  joblib.load("../Data transformers/low_cardinality_rare_transformer_12062023")

last_7_days = low_card_enc.transform(last_7_days)
last_7_days

Unnamed: 0,Total_Nbr_of_Items,Venue,Date_diff,Nbr_items_per_wk,Expenditure_per_wk,Total_Exp_wk_perc,Drinks,Vegetables,Cosmetics_and_selfcare,House_and_kitchen,Bread_wk,Cooked_meats_wk,Raw_meats_wk,Snacks_wk,Snacks_exp_receipt,Snacks_exp_wk,Drinks_wk,Drinks_exp_wk,Vegetables_exp_wk,Fruit_wk,Cooking_base_wk,Dairy_produce_wk,Seasoning_wk,Breakfast_wk,Education_wk,Cosmetics_and_selfcare_wk,Cosmetics_and_selfcare_wk_exp_perc,House_and_kitchen_wk
6,1,Asda,2,1,2,1,0,2,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,5,0


<h3 style="color: green;">Transforming categorical rare labels</h3>

In [45]:
# Load the stored rare-label transformer for categorical features and apply it.
# NOTE(review): presumably targets the Venue column, the only string feature in
# `features` — confirm against the fitted object's variable list.
Categorical_enc = joblib.load("../Data transformers/Categorical_rare_transformer_12062023")

last_7_days = Categorical_enc.transform(last_7_days)

<h3 style="color: green;">WOE transformation</h3>

In [46]:
# Load the stored weight-of-evidence encoder and apply it; the next preview
# shows every feature as a float afterwards
woe_enc = joblib.load("../Data transformers/WOE_transformer_12062023")
last_7_days = woe_enc.transform(last_7_days)

In [47]:
# Preview the fully encoded feature row that is passed to the model
last_7_days.head()

Unnamed: 0,Total_Nbr_of_Items,Venue,Date_diff,Nbr_items_per_wk,Expenditure_per_wk,Total_Exp_wk_perc,Drinks,Vegetables,Cosmetics_and_selfcare,House_and_kitchen,Bread_wk,Cooked_meats_wk,Raw_meats_wk,Snacks_wk,Snacks_exp_receipt,Snacks_exp_wk,Drinks_wk,Drinks_exp_wk,Vegetables_exp_wk,Fruit_wk,Cooking_base_wk,Dairy_produce_wk,Seasoning_wk,Breakfast_wk,Education_wk,Cosmetics_and_selfcare_wk,Cosmetics_and_selfcare_wk_exp_perc,House_and_kitchen_wk
6,0.217884,0.807895,-0.619559,0.629976,-0.418084,0.293031,0.026222,0.444537,0.003698,0.008584,0.185634,0.116289,0.387766,0.448801,-0.023692,0.006782,-1.64143,0.049556,1.046427,-0.075072,0.100355,0.129093,0.081388,-0.199046,0.107108,0.344791,0.723588,0.092007


<h1 style="color: green;">Load stored model</h1>

In [48]:
# Load the stored classifier from disk.
# NOTE(review): joblib.load unpickles arbitrary objects — trusted artefacts only.
xgb_loaded = joblib.load('../Models/xgboost_classifier10062023')

In [51]:
# Score the receipt with the stored model; column 1 of predict_proba is kept
# (presumably the positive-class probability — confirm the model's class order)
loaded_prob = xgb_loaded.predict_proba(last_7_days)[:,1]

In [54]:
# Turn the probability into a 0/1 class: values above the cut-off become 1
_decision_threshold = 0.14
loaded_pred_class_binarize = binarize([loaded_prob], threshold=_decision_threshold)[0]
loaded_pred_class_binarize

array([0.], dtype=float32)

<h1 style="color: green;">Explaining the model</h1>