### Title: ingredient_prediction_Inverse_Cooking

__Date:__ 5/8/23

__Author:__ Jules Larke
 
__Purpose__  
Generate F1 scores for per meal image based on text descriptions predicted with the Inverse Cooking algorithm

__Required Input Files__

  - **master_SNAPME_linkfile.csv** - master file linking images with participant metadata (downloaded from Ag Data Commons)
  - **snapme_allimages.out** - Output from FB Inverse Cooking algorithm
  - **fndds_2018.csv** - Food and Nutrition Database for Dietary Studies 2017-2018 - used for ingredientization
  - **snapme_NA_replaced_inplace_100722.csv** - replaced discontinued foodcodes from previous version of FNDDS used to record diets via ASA24
  - **snapme_single_ingred_parsed_r1m.csv** - foods parsed into their ingredient from the R1M database  
  - **snapme_ingredientized_manual_fixes__tokens_cleaned_012623.csv** - Foods from ASA24 that could not be ingredientized, primarily baked goods. These items were matched to Branded Foods to get ingredient lists.

__Output__
- **snapme_inv_cook_f1_score_050823.csv** Will be the input for data visualization and statistics in f1_score_testing_visualization.R

In [1]:
# Load modules
import pandas as pd
import string
import nltk
#nltk.data.path.append('/Users/jules.larke/opt/anaconda3/)
import string
import re
pd.set_option('display.max_colwidth', 100)
wn = nltk.WordNetLemmatizer()

In [2]:
# Load input data
documents = pd.read_csv('../input/master_SNAPME_linkfile.csv')
labels = pd.read_csv('../input/snapme_allimages.out', sep=":", header=None)
fndds = pd.read_csv('../input/fndds2018.csv')

In [3]:
# Processed data to replace original data with discontinued foodcodes or ingredientized data R1M
snapme_to_merge = pd.read_csv('../input/snapme_NA_replaced_inplace_100722.csv') # walked the foodcode snapme nas forward to FNDDS17-18 to replace those discontinued foodcodes
r1m = pd.read_csv('../input/snapme_single_ingred_parsed_r1m.csv')

In [4]:
# Foods from ASA24 that could not be ingredientized, primarily baked goods. These items were matched to Branded Foods to get ingredient lists.
baked = pd.read_csv('../input/snapme_ingredientized_manual_fixes__tokens_cleaned_012623.csv')
baked = baked[baked['manual_fix'] == 'Y']

In [5]:
labels.rename(columns={0:'filename',1:'inverse_cook'}, inplace=True)

In [6]:
labels.filename = labels.filename.str.rstrip() # remove single whitespace at end of each string

In [7]:
labels.inverse_cook = labels.inverse_cook.str.lstrip()

In [8]:
fndds_multi = fndds.loc[fndds['Seq num'] >= 2]

In [9]:
fndds_multi = fndds[fndds['Food code'].isin(fndds_multi['Food code'])]

In [10]:
fndds_multi["single_multi"] = 'multi'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fndds_multi["single_multi"] = 'multi'


In [11]:
fndds_single = fndds[~fndds['Food code'].isin(fndds_multi['Food code'])]

In [12]:
fndds_single['single_multi'] = 'single'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fndds_single['single_multi'] = 'single'


In [13]:
fndds_sm = pd.concat([fndds_single, fndds_multi], axis=0)

In [14]:
fndds_sm.rename(columns={'Main food description':'Food_Description', 'Food code': 'FoodCode'}, inplace=True)

In [15]:
snap_one_row_filename = documents.groupby("filename").filter(lambda x: len(x) == 1) #rename to something differnt

In [17]:
snap_single_no_na = snap_one_row_filename[~snap_one_row_filename['filename'].isin(snapme_to_merge['filename'])]

In [18]:
snap_single_no_na = snap_single_no_na.loc[:,'subject_id':'Food_Description']

In [19]:
snap_single = pd.concat([snap_single_no_na, snapme_to_merge])

In [20]:
tmp = fndds_sm[['FoodCode', 'single_multi']]

In [21]:
tmp1 = tmp.drop_duplicates(subset='FoodCode')

In [22]:
snap_tmp = pd.merge(snap_single, tmp1, on='FoodCode', how='left')

In [23]:
r1m.drop(columns=['Food_Description', 'clean_punct', 'clean_text', 'title', 'ingredients'], inplace=True)

In [24]:
r1m.rename(columns={'parsed':'Food_Description'}, inplace=True)

In [25]:
snap_merge = pd.concat([snap_tmp, r1m], axis=0)

In [26]:
doc2 = documents[~documents['filename'].isin(snap_merge.filename)]

In [27]:
doc2 = doc2.loc[:, :'Food_Description'].dropna()

In [28]:
snap_all = pd.concat([snap_merge, doc2], axis=0)

In [29]:
# join all food_descriptions for a given image for ASA24 data:
docs = snap_all.groupby('filename')['Food_Description'].apply(', '.join).reset_index()

In [30]:
df_all = pd.merge(docs, labels, on='filename', how='right')

In [31]:
punct = string.punctuation[0:11] + string.punctuation[13:] # remove '-' from the list of punctuation. This is needed for the tokenizer in the following cell

In [32]:
stopwords = ['ns', 'nfs', '', 'dry', 'water', 'black', 'brown', 'bayo', 'cut', 'cooked', 'cooking', 'as', 'to', 'of', 'flavor', 'fat', 'eaten', 'made', 'with', 'raw', 'brewed', 'fried', 'eaten', 'toasted', 'boiled', 'from', 'fresh', 'type', 'coated', 'baked', 'or', 'broiled', 'part', 'and', 'method', 'skin', 'not', 'stewed', 'canned', 'table', 'reduced', 'fat', 'added', 'in', 'hot', 'granulated', 'ground', 'Mexican', 'blend', 'flavored',  'home', 'recipe', 'purchased', 'c', 'at', 'a', 'bakery', 'replacement', 'free', 'powder', '0', '1', '2', '20', '99', '100']
stopwords = stopwords # + drop # added words from list of the intersection of unique words 
def clean_text(text):
    text = "".join([word for word in text if word not in punct])
    tokens = re.split('[-\W+]', text)
    text = [word for word in tokens if word not in stopwords]
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return set(text)

df_all['clean_description'] = df_all['Food_Description'].apply(lambda x: clean_text(x.lower()))
df_all['clean_inv_cook'] = df_all['inverse_cook'].apply(lambda x: clean_text(x.lower()))


#### determine stopwords to remove based on tokens that do not specify a food/ingredient

In [33]:
snapme_list = [item for sublist in df_all['clean_description'] for item in sublist]

In [34]:
snapme_unique = list(set(snapme_list))

In [35]:
inv_list = [item for sublist in df_all['clean_inv_cook'] for item in sublist]

In [36]:
inv_unique = list(set(inv_list))

In [37]:
intersect = list(set(snapme_list) & set(inv_list))

In [38]:
baked_keep = ['garbanzo', 'turmeric', 'tapioca', 'mango', 'caraway', 'mulberry', 'poppy', 'goji', 'currant',
             'barley', 'cottonseed', 'carob', 'chicory', 'cacao', 'semolina', 'jalapeno', 'cherry', 'zucchini']
# list of tokens to keep from the branded foods list of ingredients
# these tokens consisty only of words that specifically describe a food item

In [39]:
snapme_only = set(snapme_unique) - set(intersect)

In [40]:
keep = ['applesauce', 'agave', 'alfalfa', 'artichoke', 'bagel', 'beer', 'beet',
        'batter', 'berry', 'biryani', 'biscuit', 'boysenberry', 'brownie', 'bratwurst', 'buckwheat',
        'bulgur', 'burger', 'burrito', 'catfish', 'cake', 'catsup', 'cayenne', 'chex', 'consomme',
        'cheddar', 'cheerio', 'cheeseburger', 'cheetos', 'chia', 'chickpea', 'chive', 'crab', 'cottage',
        'cider', 'clam', 'crepe', 'croissant', 'cod', 'coleslaw', 'collard', 'chowder', 'cashew', 'coriander',
        'cornbread', 'cornmeal', 'cornstarch', 'croquette', 'crouton', 'cupcake', 'crunchberries', 'cumin',
        'custard', 'dough', 'date', 'doughnut', 'dough', 'dill', 'doritos', 'drumstick', 'enchilada',
        'espresso', 'falafel', 'flatbread', 'flauta', 'frankfurter', 'frosting', 'fudge', 'fritter',
        'flaxseed', 'flax', 'focaccia', 'gingersnap', 'gnocchi', 'groat', 'guacamole', 'guava',
        'goldfish', 'gouda', 'graham', 'granola', 'grapefruit', 'gravy', 'grit', 'huckleberry',
        'hamburger', 'icing','hazelnut', 'horchata', 'hummus', 'jello', 'jelly', 'jicama',
        'jambalaya', 'kimchi', 'lard', 'lamb', 'latte', 'leek', 'licorice', 'lobster', 'margarine',
        'macadamia', 'marmalade', 'minestrone', 'meatball', 'meringue', 'naan', 'nacho', 'nectarine',
        'mussel', 'meat', 'muffin', 'mozzarella', 'muesli', 'muenster', 'omelet', 'oatmeal', 'okra',
        'omelet', 'oyster', 'oolong', 'oregano', 'pancake', 'parmesan', 'parsnip', 'pastry', 'persimmon',
        'pesto', 'paprika', 'pie', 'pine', 'pinto', 'pistachio', 'pita', 'pistashio', 'pizza', 'plantain',
        'plum', 'pretzel', 'provolone', 'pudding',   'pomegranate',  'pickle', 'pastry', 'quesadilla',
        'quiche', 'rapeseed', 'ranch', 'ravioli', 'ravioli', 'ricotta', 'roquefort', 'ritz', 'romaine',
        'roquefort', 'rose', 'rye', 'safflower', 'sage', 'salad', 'salami', 'sandwich', 'sauerkraut',
        'scone', 'seaweed', 'sesame', 'shallot', 'seafood', 'serrano', 'shortening', 'smoothie', 'snowpea',
        'snowpeas', 'soda', 'soy', 'sparerib', 'stevia', 'soybean', 'soup', 'stroganoff', 'sushi', 'stuffing',
        'sparerib', 'swiss', 'stew', 'swordfish', 'taco', 'tamale', 'tamari', 'taquito', 'tostada', 'thyme',
        'tuna', 'turkey', 'tzatziki', 'vegetable', 'triscuit', 'tangerine', 'teriyaki', 'turnip', 'tilapia',
        'tiramisu', 'tortellini', 'turnover', 'whopper', 'waffle', 'walnut', 'wheat', 'whiskey', 'wonton',
        'worcestershire', 'yolk'] + baked_keep
# words that are distinct food items to keep 
## using this list as stopwords lowers average F1 score

In [41]:
drop = list(set(snapme_only) - set(keep))

#### apply text cleaning

In [42]:
stopwords = ['ns', 'nfs', '', 'water', 'dry', 'black', 'brown', 'bayo', 'cut', 'cooked', 'cooking', 'as', 'to', 'of', 'flavor', 'fat', 'eaten', 'made', 'with', 'raw', 'brewed', 'fried', 'eaten', 'toasted', 'boiled', 'from', 'fresh', 'type', 'coated', 'baked', 'or', 'broiled', 'part', 'and', 'method', 'skin', 'not', 'stewed', 'canned', 'table', 'reduced', 'fat', 'added', 'in', 'hot', 'granulated', 'ground', 'Mexican', 'blend', 'flavored',  'home', 'recipe', 'purchased', 'c', 'at', 'a', 'bakery', 'replacement', 'free', 'powder', '0', '1', '2', '20', '99', '100']
stopwords = stopwords + drop # added words from list of the intersection of unique words 
def clean_text(text):
    text = "".join([word for word in text if word not in punct])
    tokens = re.split('[-\W+]', text)
    text = [word for word in tokens if word not in stopwords]
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return set(text)

df_all['clean_description'] = df_all['Food_Description'].apply(lambda x: clean_text(x.lower()))
df_all['clean_inv_cook'] = df_all['inverse_cook'].apply(lambda x: clean_text(x.lower()))

#### F1 scoring function

In [43]:
def string_detection(a,b):
    tp = sum(any(m == L for m in a) for L in b)
    fn = len(a) - tp
    fp = len(b) - sum(any(m == L for m in b) for L in a)
    try:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = (2 * precision * recall)/(precision + recall)
        return(f1)
    except ZeroDivisionError:
        return 0

In [45]:
df_all.set_index('filename', inplace=True)

In [46]:
baked.set_index('filename', inplace=True)

In [47]:
baked['Full_Ingredient_List_clean'] = baked['Full_Ingredient_List'].apply(lambda x: clean_text(x))

In [48]:
df_all['clean_description'].update(baked.Full_Ingredient_List_clean)

In [49]:
df_all.reset_index(inplace=True)

In [50]:
result = [string_detection(x, y) for x, y in zip(df_all['clean_description'], df_all['clean_inv_cook'])]

In [51]:
df_all['F1_score'] = result

In [52]:
df_all.F1_score.mean() 

0.22989004263257837

In [53]:
baked.reset_index(inplace=True)

#### create metadata for which database ingredients were parsed from

In [54]:
df_all['ingred_parsed_from'] = 'FNDDS1718'

In [55]:
df_all.loc[df_all.filename.isin(r1m.filename), ['ingred_parsed_from']] = 'R1M'

In [56]:
df_all.loc[df_all.filename.isin(baked.filename), ['ingred_parsed_from']] = 'BFDB'

In [57]:
doc_meta = documents[['filename', 'FoodCode', 'Occ_Name']]

In [58]:
tmp_1 = pd.merge(df_all, doc_meta, how='inner', on='filename')

In [59]:
tmp_1['FoodCode'] = tmp_1['FoodCode'].astype('str')

#### metadata for counting number of foodcodes, images...

In [60]:
tmp_1['food_count'] = tmp_1[['filename','FoodCode']].groupby(['filename'])['FoodCode'].transform(lambda x: x[x.str.contains('0|1|2|3|4|5|6|7|8|9')].count())

In [61]:
tmp_1['image_count'] = tmp_1[['filename','FoodCode']].groupby(['filename'])['FoodCode'].transform(lambda x: x[x.str.contains('0|1|2|3|4|5|6|7|8|9')].count())

In [62]:
tmp_1['multi_codes'] = tmp_1[['filename','FoodCode']].groupby(['filename'])['FoodCode'].transform(lambda x: ', '.join(x))


In [63]:
tmp_1 = tmp_1[['filename','multi_codes', 'food_count', 'Occ_Name']].drop_duplicates()

In [64]:
snap_pred_f1 = pd.merge(df_all, tmp_1, on='filename', how='left').drop_duplicates(subset='filename')
snap_pred_f1.shape

(1463, 10)

#### recode eating occasion and bin foodcodes to 1, 2-3 or 4+

In [65]:
def occ_groups(series):
    if series == 1:
        return "Breakfast"
    elif series == 2:
        return "Brunch"
    elif series == 3:
        return "Lunch"
    elif series == 4:
        return "Dinner"
    elif series == 6:
        return "Snack"
    elif series == 7:
        return "Drink"
    elif series == 8:
        return "Supplement"

snap_pred_f1['Occ_Name'] = snap_pred_f1['Occ_Name'].apply(occ_groups)

In [66]:
def food_count_groups(series):
    if series in range(1,2):
        return "1"
    elif series in range(2,4):
        return "2-3"
    elif series in range(4,18):
        return "4+"

snap_pred_f1['food_count_groups'] = snap_pred_f1['food_count'].apply(food_count_groups)

#### get counts for metadata

In [67]:
snap_pred_f1[['Occ_Name']].value_counts()

Occ_Name 
Breakfast    496
Dinner       342
Lunch        300
Snack        231
Drink         81
Brunch         5
dtype: int64

In [68]:
snap_pred_f1[['food_count_groups']].value_counts()

food_count_groups
1                    551
2-3                  467
4+                   445
dtype: int64

In [69]:
snap_pred_f1['coffee_tea_count'] = snap_pred_f1.groupby(['filename', 'Occ_Name'])['Food_Description'].transform(lambda x: x[x.str.contains('Coffee|Tea', case=False)].count())

In [70]:
snap_pred_f1[['Occ_Name', 'coffee_tea_count']].value_counts(ascending=True)

Occ_Name   coffee_tea_count
Brunch     0.0                   5
Lunch      1.0                  12
Snack      1.0                  12
Drink      0.0                  14
Dinner     1.0                  18
Drink      1.0                  67
Breakfast  1.0                 114
Snack      0.0                 219
Lunch      0.0                 288
Dinner     0.0                 324
Breakfast  0.0                 382
dtype: int64

In [71]:
snap_pred_f1.to_csv('../otuput/snapme_inv_cook_f1_score_050823.csv', index=None)