## Analysis set-up

### Import required packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
from numpy import median
import geopandas as gpd
#import contextily as cx
import plotly.express as px
import plotly.graph_objects as go
import string
import random
import numpy as np
import re
# Show at most 50 rows when pandas displays a frame or series.
pd.set_option('display.max_rows', 50)

### Set file path

In [None]:
# Directory that holds the input CSV files.
# NOTE(review): machine-specific absolute path; keep the trailing slash because
# file names are appended to it with plain string concatenation.
file_path = '/Users/franpontin/OneDrive - University of Leeds/CDRC/Projects/IGD/Analysis/product_data/'


### Define functions

#### Function to report classification progress 

In [None]:
def progress(df):
    """Print how many rows are still unclassified and the share classified.

    A row counts as unclassified while its ``Eatwell_Segment`` value is the
    placeholder ``'Not Food'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Eatwell_Segment`` column.

    Returns
    -------
    None
        The report is written to standard output.
    """
    # Build the filter once and reuse the count for both figures.
    n_left = int((df['Eatwell_Segment'] == 'Not Food').sum())
    n_total = len(df)
    # An empty frame has nothing left to classify: report 100 % instead of
    # raising ZeroDivisionError.
    pct_done = 100.0 if n_total == 0 else 100 - n_left / n_total * 100
    print('N left to classify:', n_left,
          '(Percentage classified =',
          np.round(pct_done, 2), '%)')


#### Function to match product description/name to Eatwell category

In [None]:
def match_to_EW(df, list_of_columns, categories=None, word_lists=None):
    """Tag each text column of ``df`` with the first category whose words match.

    The frame is modified in place; nothing is returned.

    For text column number ``k`` a column ``EW_k`` is created. Categories are
    tried in list order and a cell keeps the first category that matches it,
    so the order of ``categories`` / ``word_lists`` sets the priority.
    Every word is used as a case-insensitive regular expression.

    Columns written:

    * ``EW_0 .. EW_{n-1}``  - matched category per text column (NaN = no match)
    * ``nunique_EW``        - number of distinct categories matched in the row
    * ``EW_multi_cats``     - matched categories joined with ``', '``
    * ``total_seg_matched`` - number of text columns that matched anything
    * ``mode_1 .. mode_m``  - row-wise result of ``DataFrame.mode(axis=1)`` over
      the ``EW_*`` columns
    * ``Eatwell_Segment``   - reset to the placeholder ``'Not Food'``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text columns; they are converted to ``string`` dtype.
    list_of_columns : list of str
        Names of the text columns to search, in the order that defines ``EW_k``.
    categories : list of str, optional
        Category labels. Defaults to the module-level ``ew_cat``.
    word_lists : list of list of str, optional
        One word list per category. Defaults to the module-level
        ``ew_cat_wordLists``.

    Raises
    ------
    ValueError
        If there are fewer category labels than word lists.
    """
    if categories is None:
        categories = ew_cat
    if word_lists is None:
        word_lists = ew_cat_wordLists
    if len(categories) < len(word_lists):
        raise ValueError('Each word list needs a category label '
                         '(got %d labels for %d word lists)'
                         % (len(categories), len(word_lists)))

    # Search the columns that were passed in, not a same-named global.
    text_cols = list(list_of_columns)
    df[text_cols] = df[text_cols].astype('string')

    ew_cols = ['EW_' + str(k) for k in range(len(text_cols))]
    for ew_col in ew_cols:
        # Object dtype up front: the columns will hold category strings.
        df[ew_col] = pd.Series(np.nan, index=df.index, dtype=object)

    for i, (category, words) in enumerate(zip(categories, word_lists)):
        print(i + 1, 'categories matched of', len(word_lists))
        # Skip entries that cannot be used as a pattern (e.g. NaN read from a
        # CSV, or an empty string that would match every cell).
        usable = [w for w in words if isinstance(w, str) and w]
        if not usable:
            continue
        # Within one category every word assigns the same label, so a single
        # alternation gives the same result as testing the words one by one.
        pattern = '|'.join('(?:' + w + ')' for w in usable)
        for text_col, ew_col in zip(text_cols, ew_cols):
            hit = df[text_col].str.contains(pattern, case=False, na=False).astype(bool)
            # Only fill cells that no earlier category has claimed.
            df.loc[hit & df[ew_col].isnull(), ew_col] = category

    matched = df[ew_cols]
    df['nunique_EW'] = matched.nunique(axis=1)
    df['EW_multi_cats'] = [', '.join(v for v in row if isinstance(v, str))
                           for row in matched.itertuples(index=False, name=None)]
    df['total_seg_matched'] = matched.count(axis=1)

    # Remove mode columns left over from an earlier run on the same frame so
    # that a narrower result cannot leave stale values behind.
    stale = [c for c in df.columns
             if isinstance(c, str) and c.startswith('mode_') and c[5:].isdigit()]
    if stale:
        df.drop(columns=stale, inplace=True)

    # Size the mode columns from the columns actually matched.
    modes = matched.mode(axis=1)
    mode_cols = ['mode_' + str(k) for k in range(1, modes.shape[1] + 1)]
    modes.columns = mode_cols
    for mode_col in mode_cols:
        df[mode_col] = modes[mode_col]

    df['Eatwell_Segment'] = 'Not Food'

#### Function to assign Eatwell classification based on matching, including detail of logic behind the match

In [None]:
def assign_EW(df, conditions, confidence_val, confidence_scale, match_detail, ew_cat):
    """Classify the still-unclassified rows selected by ``conditions``.

    Only rows whose ``Eatwell_Segment`` is still the placeholder ``'Not Food'``
    are touched. For those rows the segment, the numeric confidence, the
    confidence label and the rationale are written together, so a later rule
    cannot overwrite the rationale of a row it did not classify. The frame is
    modified in place and a progress report is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Eatwell_Segment`` column.
    conditions : pandas.Series or array-like of bool
        Row selector. Missing values are treated as ``False``.
    confidence_val : float or pandas.Series
        Written to ``class_confidence_calc`` (a Series is aligned on the index).
    confidence_scale : str
        Written to ``class_confidence``.
    match_detail : str
        Description of the rule, written to ``match_detail``.
    ew_cat : str or pandas.Series
        Segment to assign (a Series is aligned on the index).
    """
    if not isinstance(conditions, pd.Series):
        conditions = pd.Series(conditions, index=df.index)
    # A missing value in the mask (e.g. from a text test on a missing name)
    # means "rule does not apply".
    conditions = conditions.astype('boolean').fillna(False).astype(bool)
    if not conditions.index.equals(df.index):
        conditions = conditions.reindex(df.index, fill_value=False)

    # Rows this call will actually classify.
    target = conditions & (df['Eatwell_Segment'] == 'Not Food')
    print('N=' + str(int(target.sum())))
    df.loc[target, 'class_confidence_calc'] = confidence_val
    df.loc[target, 'class_confidence'] = confidence_scale
    df.loc[target, 'match_detail'] = match_detail
    df.loc[target, 'Eatwell_Segment'] = ew_cat
    print('Match conditions:', match_detail)
    progress(df)
    

### Read in test product data and additional list of fruit and veg

In [None]:
# Product-level data to classify.
df = pd.read_csv(file_path + 'product_data.csv')

# Extra fruit and vegetable names: keep the `name` column as a plain list.
veg_list = list(pd.read_csv(file_path + 'veg_list.csv', index_col=0)['name'])

## Define products in each Eatwell category

In [None]:


# Fruit & veg terms: the names read from the veg list plus hand-added extras.
ew_fnv = [
    *veg_list,
    'courgette', 'dudhi', 'mooli', 'gourd', 'melon', 'currants', 'sultanas',
    'swede', 'veg', 'fruit', 'gherkin', 'pickle', 'olives',
]
# Word lists per Eatwell category. Each entry is used as a case-insensitive
# regular expression; r'\b...\b' restricts a term to whole words.
# Known misspellings are kept on purpose (they occur in product text); the
# correct spelling is listed as well where it was missing.

# Starchy carbohydrates.
ew_car = ['bread','cereal','flour','pasta','pastry','potato','rolls',r'\brice\b',
          'waffles','zwieback', 'muffin', 'teff', 'bagel', r'\bcorn\b', 'spaghetti',
          'whole wheat', 'crackers', 'noodles', 'wheat', 'couscous','cous cous', 'tortilla',
          'nann','taco','pitta','wrap','chapati','chappatti', 'chapatti','sourdough',r'\bpave\b','boule','bloomer','paratha',
          'bao','cassava','naan','porridge','museli', 'muesli','baguette','quinoa','ciabatta',r'\broti\b',
          'sourdough', 'baps', 'brioche','pennoni', 'loaf','fusilli','macaroni','focaccia'] # zwieback could be carb or discretionary; not important here

# Protein.
ew_ptn = ['beef','chicken','domestic eggs', 'lamb','fish','frankfurter',r'\bham\b',
          'liver loaf','meat','nut','pork','sausage','turkey', 'crustaceans',
          'bean', 'chickpea', 'veal','poultry', 'steak', 'neck', 'mince',
         'fillet', 'loin', 'ostrich', 'egg', 'mollusk', 'emu', 'goose', 'duck',
          'Goat', 'lentils', 'marrow', 'flesh', 'tofu','beancurd', 'prawn', 'hummus', 'houmous',
         'haddock','salmon',r'\bcod\b','capers','tuna','breast','lobster','thighs','drumstick',
          'prosciutto','wagyu','joint', 'quorn', 'bacon', 'brisket','burger','almonds',
          r'\bnuts\b','pistachios','pecans','butter beans','rotisserie','mortadella','kala chana',
         'liver','pig in blanket','seafood'] # marrow is very fatty

# Oils and spreads.
ew_oil = ['butter','margarine',r'\boil\b', 'lard','spreads','vegetable oil', 'sunfower oil']

# Dairy and alternatives.
ew_dai = ['milk','whole milk','butter milk','cream','cheese','curd','yogurt', 'yoghurt','shortening',
          r'\bmilk alternatives\b', r'\blactose free drinks\b', r'\boat & nut drinks\b','kefir','cheesy',
          'brie','mozzerella','mozzarella','cheesy single', 'cheddar', 'gloucester','leicester','soya','halloumi',
          'wensleydale','coconut milk','stilton', 'paneer','feta','Lactofree','fraiche']

# Discretionary foods.
ew_dis = ['candy','cake','sweetener','chocolate','choc','dessert','ice cream',
          'mustard','mayonnaise','salty snack','pudding powder','popcorn',
          'sweet spreads','syrup','sugar','jam','ketchup', 'fudge', 'cocoa',
          'snack products','sauce', 'cookie','pastries', 'pudding', 'pancake',
         'candies', 'wafer', 'fries', 'crumpets', 'muffins', 'crisps','jam','condiments',
          'gravy', 'yeast', 'marmalade', 'conserve','caramel','brownie','bakewell','profiterole',
          'Soreen','chips','biscuit', 'scone', 'snack', 'marzipan','toffee','custard',r'\bicing\b',
          'jelly','stollen','poppadom', 'poppadum','tiramisu','croissant','macaroon','mringue','cheesecake','salad cream',
         r'\bpain au rasin\b','shortbread','honey','salami','cheesecake','panna cotta','sweets', 'mints', 'donner kerbab',
          'hash brown',r'\bhot cross bun\b','pick and mix','sharing bag','crouton', 'hot cross bun','bakery',
          'dressing','flapjack','swiss roll','pain au raisin','mallow','preserve','trifle',
          # correct spellings of terms that were only present misspelt
          'meringue','doner kebab']

# Alcoholic beverages.
ew_alc = ['brandy','beer','wine','prosecco','liqueur','liquor',r'\brum\b','whisky', 'cocktail',
          'Liqueur', r'\bgin\b','sauvignon','ABV','%','daiquiry','martini','cava','rioja','pilsner','sherry','cider',r'\bport\b',
          # correct spelling of a term that was only present misspelt
          'daiquiri']

# Composite foods / meals.
# r'\bnoodle pots\b' was previously written r'\noodle pots\b', where '\n' is a
# newline escape rather than a word boundary, so the term could never match.
ew_com = ['nut/prunes','soup','Instant food products',r'\bfrozen meal\b', 'pizza', 'lasagna',
          'lasagne','pie', 'burito','pakora','sandwich', 'ready meal', r'\bnoodle pots\b',
          'haggis','takeaway', 'meal', 'curry','balti','masala','quiche',r'\bpate\b','bolognese',
          'bolognaise','carbonara','platter','gyoza', 'coleslaw','pasta bake','granola','sping roll',
          'scampi','croquette','sandwich','party food','chilli con carne','hot dog','kmoussaka','enchilada',
          'fajita',r'\bstir fry\b', 'katsu','salsa',
          # correct spellings of terms that were only present misspelt
          'burrito','spring roll','moussaka']

# Category for infant formula etc. (not included in adult recommendations).
ew_inf = ['formula','babyfood', 'toddler','infant','formula']

ew_other =['seasoning', 'baking', 'spices','medical','coriander','HERBS & SPICES',  r'\bice cube\b', 'salt']

# Non-alcoholic beverages.
ew_bev = ['juice', 'beverage', 'fizzy drink', 'cola', r'\bcoke\b',r'\bminneral water\b', 'mineral','drink',
          'coffee', 'sparkling water','tea', 'lucozade','decaf','tonic','soda','Kombucha', 'bottle', r'\bcan\b',
          'Sarsaparilla','smoothie','latte', r'\bginger ale\b', 'water', r'\balcohol free\b', 'ginger ale']

# add exclude - food items without significant nutritional contribution in terms of eat well
# e.g. spices, herbs, make sure to check salt and sugar)

#ew_exc = []

# Category labels and their word lists, declared side by side so the two
# position-matched lists cannot drift apart. List order is the match priority.
ew_cat, ew_cat_wordLists = (
    list(column) for column in zip(
        ('Fruit & Veg', ew_fnv),
        ('Starchy Carbs', ew_car),
        ('Protein', ew_ptn),
        ('Oils & Spreads', ew_oil),
        ('Dairy & Alt.', ew_dai),
        ('Discretionary', ew_dis),
        ('Alcoholic Bev.', ew_alc),
        ('Composite', ew_com),
        ('Infant', ew_inf),
        ('Non Alcoholic Bev.', ew_bev),
        ('Other', ew_other),
    )
)



### Match each product name/description to Eatwell category/categories

In [None]:
# Text fields searched on the first pass: five shelf levels, then the
# merchandise subcategory and category names.
w_segList = ['Shelf_' + str(level) for level in range(1, 6)]
w_segList += ['MDSE Subcategory Name', 'MDSE Category Name']
match_to_EW(df, w_segList)

In [None]:
# Share of products for which none of the searched fields matched a category.
n_unmatched = len(df.loc[df['total_seg_matched'] < 1])
pct_unmatched = round(n_unmatched / len(df) * 100, 2)
print('Percentage with no match on inital join: ' + str(pct_unmatched) + "%")

In [None]:
# Rule: all matched fields agree on one category and more than one field matched.
single_cat_repeated = (df['nunique_EW'] == 1) & (df['total_seg_matched'] > 1)
assign_EW(
    df=df,
    conditions=single_cat_repeated,
    confidence_val=1.00,
    confidence_scale='High',
    match_detail='Product name matched to single EW category more than once',
    ew_cat=df['mode_1'],
)

In [None]:
# Rule: exactly one field matched, so there is exactly one category.
single_cat_once = (df['nunique_EW'] == 1) & (df['total_seg_matched'] == 1)
assign_EW(
    df=df,
    conditions=single_cat_once,
    confidence_val=1.00,
    confidence_scale='Fairly High',
    match_detail='Product name matched to single EW category once',
    ew_cat=df['mode_1'],
)

In [None]:
# Rule: several categories matched but one of them is the unique mode
# (no second or third mode), for rows that are still unclassified.
has_single_mode = df['mode_2'].isna() & df['mode_3'].isna()
clear_winner = (
    (df['nunique_EW'] >= 2)
    & has_single_mode
    & (df['Eatwell_Segment'] == 'Not Food')
)
assign_EW(
    df=df,
    conditions=clear_winner,
    confidence_val=df['nunique_EW'] / df['total_seg_matched'],
    confidence_scale='Good',
    match_detail='Product name matched to multiple EW categories, most common and first cateogry matched are the same',
    ew_cat=df['mode_1'],
)

In [None]:
# Count how many rows carry each recorded match rationale so far.
df['match_detail'].value_counts()

In [None]:
# Rule: two fields matched, the modes are tied, and one of the matched
# categories is Composite -> class as Composite.
tied_pair = (
    (df['total_seg_matched'] == 2)
    & df['mode_2'].notna()
    & (df['Eatwell_Segment'] == 'Not Food')
)
mentions_composite = df['EW_multi_cats'].str.contains('Composite', case=False)
assign_EW(
    df=df,
    conditions=tied_pair & mentions_composite,
    confidence_val=1 / df['nunique_EW'],
    confidence_scale='Good',
    match_detail="Product name matched to multiple EW categories, more than one most common EW segment (equal likelihood), however one of the most common EW segments is a `Composite` food so classed as such ",
    ew_cat='Composite',
)

In [None]:
# Rule: two fields matched, the modes are tied, and one of the matched
# categories is Discretionary -> class as Discretionary.
tied_pair = (
    (df['total_seg_matched'] == 2)
    & df['mode_2'].notna()
    & (df['Eatwell_Segment'] == 'Not Food')
)
mentions_discretionary = df['EW_multi_cats'].str.contains('Discretionary', case=False)
assign_EW(
    df=df,
    conditions=tied_pair & mentions_discretionary,
    confidence_val=1 / df['nunique_EW'],
    confidence_scale='Good',
    match_detail="Product name matched to multiple EW categories, more than one most common EW segment (equal likelihood), however one of the most common EW segments is a `Disceretionary` food so classed as such ",
    ew_cat='Discretionary',
)

### For unmatched products split the product description and classify each component of the product description

In [None]:
# Number of products the field-level rules left unclassified.
n_remaining = len(df[df['Eatwell_Segment'] == 'Not Food'])
print('For the remaining', n_remaining,
      'products split the product descirption and match based on indvidual components of the description')

In [None]:
# Split the name of every still-unclassified product into one column per
# space-separated token and run the word matching on the individual tokens.
unclassified = df['Eatwell_Segment'] == 'Not Food'
split_name = df.loc[unclassified, 'name'].str.split(' ', expand=True)

# Name the token columns from the actual width of the split; a fixed list of
# 16 names fails whenever the longest name has a different number of tokens.
w_segList = ['w' + str(k) for k in range(split_name.shape[1])]
split_name.columns = w_segList

# Bring back the full name and the barcode for the same rows (index join).
split_name = split_name.merge(df[['name', 'Retailer Barcode']],
                              left_index=True, right_index=True, how='left')

match_to_EW(split_name, w_segList)

In [None]:
# Rule: all matched tokens agree on one category and more than one token matched.
# The category must come from split_name's own token-level mode; using
# df['mode_1'] would assign the field-level result of the earlier pass.
assign_EW(df=split_name,
          conditions=((split_name['nunique_EW']==1 )& (split_name['total_seg_matched']>1)),
          confidence_val=1.00,
          confidence_scale='High',
          match_detail='Product name matched to single EW category more than once',
          ew_cat=split_name['mode_1'])

In [None]:
# Rule: exactly one token matched, so there is exactly one category.
# The category must come from split_name's own token-level mode; using
# df['mode_1'] would assign the field-level result of the earlier pass.
assign_EW(df=split_name,
          conditions=((split_name['nunique_EW']==1 )& (split_name['total_seg_matched']==1)),
          confidence_val=1.00,
          confidence_scale='Fairly High',
          match_detail='Product name matched to single EW category once',
          ew_cat=split_name['mode_1'])

In [None]:
# Rule: several categories matched among the tokens but one is the unique mode
# (no second or third mode), for rows that are still unclassified.
has_single_mode = split_name['mode_2'].isna() & split_name['mode_3'].isna()
clear_winner = (
    (split_name['nunique_EW'] >= 2)
    & has_single_mode
    & (split_name['Eatwell_Segment'] == 'Not Food')
)
assign_EW(
    df=split_name,
    conditions=clear_winner,
    confidence_val=split_name['nunique_EW'] / split_name['total_seg_matched'],
    confidence_scale='Good',
    match_detail='Product name matched to multiple EW categories, most common and first cateogry matched are the same',
    ew_cat=split_name['mode_1'],
)

In [None]:
# Rule: two tokens matched, the modes are tied, and one of the matched
# categories is Composite -> class as Composite.
# The "still unclassified" test reads split_name (the frame being updated),
# not df, whose segment column does not reflect the token-level rules.
assign_EW(df=split_name,
          conditions=((split_name['total_seg_matched']==2)&(~split_name['mode_2'].isna())&(split_name['Eatwell_Segment']=='Not Food')&
                          (split_name['EW_multi_cats'].str.contains('Composite', case=False))),
          confidence_val=1/split_name['nunique_EW'],
          confidence_scale='Good',
          match_detail="Product name matched to multiple EW categories, more than one most common EW segment (equal likelihood), however one of the most common EW segments is a `Composite` food so classed as such ",
          ew_cat='Composite')

In [None]:
# Rule: two tokens matched, the modes are tied, and one of the matched
# categories is Discretionary -> class as Discretionary.
tied_pair = (
    (split_name['total_seg_matched'] == 2)
    & split_name['mode_2'].notna()
    & (split_name['Eatwell_Segment'] == 'Not Food')
)
mentions_discretionary = split_name['EW_multi_cats'].str.contains('Discretionary', case=False)
assign_EW(
    df=split_name,
    conditions=tied_pair & mentions_discretionary,
    confidence_val=1 / split_name['nunique_EW'],
    confidence_scale='Good',
    match_detail="Product name matched to multiple EW categories, more than one most common EW segment (equal likelihood), however one of the most common EW segments is a `Disceretionary` food so classed as such ",
    ew_cat='Discretionary',
)


### Combine product description matched data with disaggregated matches 

In [None]:
# Replace the rows that went through the token-level pass with their
# token-level results.
# split_name only holds the name, barcode, token and matching columns, so the
# remaining product columns are carried over from df for those rows; otherwise
# they would be empty after the concat.
df_only_cols = [col for col in df.columns if col not in split_name.columns]
split_full = df.loc[split_name.index, df_only_cols].join(split_name)

new_df = pd.concat([df.drop(split_name.index), split_full], axis=0)
new_df



In [None]:
# Name-based fallback rules, applied in order to rows that are still
# unclassified. `na=False` makes a missing product name count as "no match"
# (negating a text test that contains missing values would otherwise fail).

# Flour in the name -> starchy carbs.
assign_EW(df=new_df,
          conditions=((new_df['name'].str.contains('flour', case=False, na=False))&(new_df['Eatwell_Segment']=='Not Food')),
          confidence_val= 1.00,
          confidence_scale='Fairly High',
          match_detail="Product name contains flour so classed as starchy carb.",
          ew_cat =  'Starchy Carbs')

# Fruit & veg together with a non-alcoholic beverage: names containing 'in '
# are classed as fruit & veg, all others as a beverage.
# NOTE(review): 'in ' is a plain substring test, so it also matches words that
# merely end in "in" - confirm whether a whole-word test is wanted.
assign_EW(df=new_df,
          conditions=((new_df['Eatwell_Segment']=='Not Food')&(new_df['nunique_EW']==2)
               &(new_df['EW_multi_cats']=='Fruit & Veg, Non Alcoholic Bev.')
               &(new_df['name'].str.contains('in ', na=False))),
          confidence_val=0.5,
          confidence_scale='ok',
          match_detail= 'Product name matched to 2 EW categorys, fruit & veg and non alcoholic bev, if contains `in` class as fruit and veg else class as non-alocholic bev.',
          ew_cat='Fruit & Veg')
assign_EW(df=new_df,
          conditions=((new_df['Eatwell_Segment']=='Not Food')&(new_df['nunique_EW']==2)
               &(new_df['EW_multi_cats']=='Fruit & Veg, Non Alcoholic Bev.')
               &(~new_df['name'].str.contains('in ', na=False))),
          confidence_val=0.5,
          confidence_scale='ok',
          match_detail= 'Product name matched to 2 EW categorys, fruit & veg and non alcoholic bev, if contains `in` class as fruit and veg else class as non-alocholic bev.',
          ew_cat='Non Alcoholic Bev.')

# Fruit & veg together with starchy carbs -> starchy carbs.
# The recorded rationale now describes this rule; it previously repeated the
# text of the protein rule below.
assign_EW(df=new_df,
          conditions=((new_df['Eatwell_Segment']=='Not Food')&
                                  (new_df['nunique_EW']==2)&
                                  (new_df['EW_multi_cats']=='Fruit & Veg, Starchy Carbs')),
          confidence_val=0.5,
          confidence_scale='ok',
          match_detail= 'Product name matched to 2 EW categorys, fruit & veg and starchy carb., classify as starchy carb.',
          ew_cat='Starchy Carbs')

# Protein together with starchy carbs.
# NOTE(review): the recorded rationale says "classify as composite" but the
# assigned segment is 'Starchy Carbs' - confirm which one is intended.
assign_EW(df=new_df,
          conditions=((new_df['Eatwell_Segment']=='Not Food')&
                                  (new_df['nunique_EW']==2)&
                                  (new_df['EW_multi_cats']=='Protein, Starchy Carbs')),
          confidence_val=0.5,
          confidence_scale='ok',
          match_detail= 'Product name matched to 2 EW categorys, protein and carb. or protein and fruit/veg. clasify as composite',
          ew_cat='Starchy Carbs')

# A '%' in the name is taken as an alcohol percentage.
assign_EW(df=new_df,
          conditions=((new_df['Eatwell_Segment']=='Not Food')&
               (new_df['name'].str.contains('%', na=False))),
          confidence_val=0.5,
          confidence_scale='Good',
          match_detail= 'Product name matched to 2 EW categorys, if product name contains % (alcohol) class as alcoholic bev.',
          ew_cat='Alcoholic Bev.')

# Yoghurt in the name -> dairy.
assign_EW(df=new_df,
          conditions=((new_df['Eatwell_Segment']=='Not Food')&
                                  (new_df['nunique_EW']==2)&
                                  (new_df['name'].str.contains('yogurt|yoghurt',case=False, na=False))),
          confidence_val=0.5,
          confidence_scale='Good',
          match_detail= 'Product name matched to 2 EW categorys, if product name contains yoghurt class as dairy',
          ew_cat='Dairy & Alt.')

# 'with' in the name -> composite, as the recorded rationale states.
# The segment was previously 'Dairy & Alt.', identical to the yoghurt rule
# above and contradicting the rationale stored with the row.
assign_EW(df=new_df,
          conditions=((new_df['Eatwell_Segment']=='Not Food')&(new_df['name'].str.contains('with', na=False))),
          confidence_val=0.5,
          confidence_scale='Good',
          match_detail= 'Product name matched to 2 EW categorys, if the name contians `with` classify as composite',
          ew_cat='Composite')

In [None]:
# Catch-all: anything still unclassified that matched two or more categories
# is treated as a composite food.
multi_cat_leftover = (
    (new_df['Eatwell_Segment'] == 'Not Food')
    & (new_df['nunique_EW'] >= 2)
)
assign_EW(
    df=new_df,
    conditions=multi_cat_leftover,
    confidence_val=0.5,
    confidence_scale='Good',
    match_detail='Product name matched to 2 or more EW categorys, if not already matched classify as composite',
    ew_cat='Composite',
)



In [None]:
# Unclassified products whose name contains the whole word "ice" go to Other.
name_has_ice = new_df['name'].str.contains(r'\bice\b', case=False)
assign_EW(
    df=new_df,
    conditions=(new_df['Eatwell_Segment'] == 'Not Food') & name_has_ice,
    confidence_val=0.9,
    confidence_scale='Good',
    match_detail='Product name contians `ice` classify as other',
    ew_cat='Other',
)

In [None]:
# Bar chart of the number of products in each Eatwell segment.
new_df['Eatwell_Segment'].value_counts().plot(kind='bar')

In [None]:
# List the column names of the combined frame.
list(new_df)

In [None]:
# use interactive histograms to identify outliers
# Interactive histogram over kcal/100g with Protein/100g as the y value,
# coloured by Eatwell segment, with a rug plot in the margin.
# NOTE(review): fillna('') also replaces missing values in the numeric columns
# with empty strings - confirm this does not affect how the axes are drawn.
fig = px.histogram(new_df.fillna(''), x='kcal/100g', y='Protein/100g', color="Eatwell_Segment", marginal="rug",
                   hover_data=['Product Name','Retailer Product Name']) #hover_data selects the variable which appears when hovering over the data points
fig.show()

In [None]:
# use interactive histograms to identify outliers
# Interactive histogram over kcal/100g with Sugar/100g as the y value,
# coloured by Eatwell segment, with a rug plot in the margin.
# NOTE(review): fillna('') also replaces missing values in the numeric columns
# with empty strings - confirm this does not affect how the axes are drawn.
fig = px.histogram(new_df.fillna(''), x='kcal/100g', y='Sugar/100g', color="Eatwell_Segment", marginal="rug",
                   hover_data=['Product Name','Retailer Product Name']) #hover_data selects the variable which appears when hovering over the data points
fig.show()