### Title: ingredient_prediction_Im2Recipe

__Date:__ 5/8/23

__Author:__ Jules Larke
 
__Purpose__  
Generate an F1 score per meal image based on text descriptions predicted with the Im2Recipe algorithm

__Required Input Files__

  - **snapme_result_clean.json** - Ingredient prediction output using im2recipe model with SNAPMe dataset (before photos) from Donghee Lee
  - **snapme_inv_cook_f1_score_050823.csv** - Output from ingredient_prediction_Inverse_Cooking.ipynb. Used to select (.loc) the image filenames so that the same data is being used. Also links the metadata: food_count, Occ_Name and single_multi

__Output__
- **snapme_im2r_f1_score_050823.csv** - Will be the input for data visualization and statistics in f1_score_testing_visualization.R

In [1]:
# Load modules
import pandas as pd
import string
import nltk
import string
import re
# Show up to 100 characters per column when DataFrames are displayed
pd.set_option('display.max_colwidth', 100)
# WordNet lemmatizer instance used by the clean_text functions defined later in this notebook
wn = nltk.WordNetLemmatizer()

In [3]:
# Load data
# Im2Recipe prediction output (JSON), read into a DataFrame
im2r = pd.read_json("im2recipe/snapme_clean/snapme_result_clean.json")
# Inverse Cooking F1 results (CSV); supplies the filenames, reference descriptions and metadata used below
fb_inv_res = pd.read_csv('snapme_inv_cook_f1_score_050823.csv')

In [4]:
# Keep only the columns used downstream: reference description, join key (filename) and metadata
fb_inv_res = fb_inv_res[['Food_Description', 'filename', 'food_count', 'Occ_Name']]

In [5]:
# Transpose so the original column labels become rows, then move them out of the
# index into a regular column named 'filename'
# NOTE(review): assumes the JSON is keyed by image name (one column per image) — confirm
im2r = im2r.transpose().reset_index().rename(columns={'index': 'filename'})

In [6]:
# Append the '.jpeg' extension to every filename
# (presumably so the values match fb_inv_res['filename'], which is the join key — confirm)
im2r['filename'] = im2r['filename'] + '.jpeg'

In [7]:
# Keep only the predictions whose filename also appears in the Inverse Cooking results
im2r = im2r.loc[im2r.filename.isin(fb_inv_res.filename)]

In [8]:
# Attach the reference description and metadata to each prediction row,
# matching on 'filename' (inner join, which is the pandas default)
im2r_2 = im2r.merge(fb_inv_res, how='inner', on='filename')

In [9]:
# Reduce to the columns needed for scoring and reporting; 'top1' holds the Im2Recipe
# output that is scored against 'Food_Description' below
im2r_2 = im2r_2[['filename', 'Food_Description', 'top1', 'food_count', 'Occ_Name']]

In [10]:
# Stopword list: tokens from both FNDDS and R1M that carry no information useful for identifying a specific food
stopwords = ['ns',
 'nfs',
 '',
 'water',
 'dry',
 'black',
 'brown',
 'bayo',
 'cut',
 'cooked',
 'cooking',
 'as',
 'to',
 'of',
 'flavor',
 'fat',
 'eaten',
 'made',
 'with',
 'raw',
 'brewed',
 'fried',
 'eaten',
 'toasted',
 'boiled',
 'from',
 'fresh',
 'type',
 'coated',
 'baked',
 'or',
 'broiled',
 'part',
 'and',
 'method',
 'skin',
 'not',
 'stewed',
 'canned',
 'table',
 'reduced',
 'fat',
 'added',
 'in',
 'hot',
 'granulated',
 'ground',
 'Mexican',
 'blend',
 'flavored',
 'home',
 'recipe',
 'purchased',
 'c',
 'at',
 'a',
 'bakery',
 'replacement',
 'free',
 'powder',
 '0',
 '1',
 '2',
 '20',
 '99',
 '100',
 'stuffed',
 'low',
 'puerto',
 'nutritional',
 'trail',
 'valley',
 'shelf',
 'brussels',
 'hash',
 'd',
 'purpose',
 'weed',
 'sauce',
 'lactose',
 'spread',
 'tap',
 'mini',
 'morsel',
 'based',
 'citrus',
 'boiling',
 'capn',
 'dip',
 'white',
 'ripe',
 'pot',
 'indian',
 'chinese',
 'wild',
 'carbonated',
 'sun',
 'cane',
 'higher',
 'french',
 'loaf',
 'mix',
 'calorie',
 'congee',
 'le',
 'american',
 'wing',
 'grilled',
 'tartar',
 'maple',
 'wedding',
 'kellogg',
 'quick',
 'mixed',
 'pickled',
 'devil',
 'refried',
 'breast',
 'decaffeinated',
 'mashed',
 'puffed',
 'substitute',
 'solid',
 'place',
 'leavening',
 'peel',
 'coating',
 'sunflower',
 'filled',
 'mein',
 'caesar',
 'topping',
 'on',
 'pilaf',
 'mostly',
 'cured',
 'thigh',
 'lightly',
 'round',
 'hydrogenated',
 'dessert',
 'melted',
 'jack',
 'andor',
 'carton',
 'monterey',
 'cracked',
 'active',
 'tender',
 'classic',
 'plain',
 'filling',
 'k',
 'new',
 'colby',
 'fry',
 'containing',
 'used',
 'barbecue',
 'light',
 'rican',
 'calcium',
 'floured',
 'fast',
 'crispbread',
 'drink',
 'lump',
 'scrambled',
 'source',
 'dark',
 'grecian',
 'uncooked',
 'sauteed',
 'air',
 'snack',
 'loop',
 'english',
 'string',
 'organic',
 'path',
 'winter',
 'semi',
 'greek',
 'crude',
 'woven',
 'dipped',
 'vienna',
 'edamame',
 'leg',
 'homemade',
 'cone',
 'unsweetened',
 'summer',
 'sugared',
 'nut',
 'palm',
 'feta',
 'eat',
 'spanish',
 'heavy',
 'generic',
 'herbal',
 'nonfat',
 'liquid',
 'chop',
 'squeezed',
 'ruffled',
 'braised',
 'griddle',
 'fryer',
 'hard',
 'unprepared',
 'loin',
 'great',
 'frosted',
 'enriched',
 'sprouted',
 'evaporated',
 'sweetener',
 'plus',
 'bouillon',
 'pho',
 '80',
 'kidney',
 'instant',
 'concentrate',
 'average',
 'grated',
 'flake',
 'includes',
 'distilled',
 'cafe',
 'sulfate',
 'prepackaged',
 'ascorbic',
 'armenian',
 'boston',
 'alcohol',
 'medium',
 'paste',
 'dairy',
 'slice',
 'packaged',
 'jam',
 'iced',
 'acid',
 'pre',
 'other',
 'whipped',
 'creamer',
 'ea',
 'edam',
 'serve',
 'pattie',
 'gallo',
 'confectionery',
 'it',
 'general',
 'real',
 'tenderloin',
 'tub',
 'bit',
 'restaurant',
 'smoked',
 'reconstituted',
 'kettle',
 'creamed',
 'unsalted',
 'mocha',
 'two',
 'scalloped',
 'canola',
 'household',
 'whey',
 'bunch',
 'chai',
 'unroasted',
 'kernel',
 'one',
 'dried',
 'japanese',
 'joe',
 'gluten',
 'meatless',
 'breaded',
 'non',
 'pressurized',
 'queso',
 'submarine',
 'rolo',
 'thin',
 'froot',
 'sour',
 'square',
 'for',
 'agent',
 'luncheon',
 'fresco',
 'mackerel',
 'grain',
 'microwave',
 'deglet',
 'leaf',
 'thins',
 'bottled',
 'color',
 'restructured',
 'roasted',
 'tart',
 'patty',
 'dumpling',
 'including',
 'drained',
 'stable',
 'pico',
 'sweet',
 'fluid',
 'blend',
 'flavor',
 'regular',
 'protein',
 'mature',
 'sodium',
 'preserve',
 'hawaiian',
 'condiment',
 'cultured',
 'spray',
 'crunch',
 'baker',
 'sloppy',
 'toast',
 'usda',
 'mexican',
 'dressing',
 'frozen',
 'thai',
 'without',
 'lean',
 'unbuttered',
 'calico',
 'crumb',
 'style',
 'only',
 'food',
 'year',
 'rotisserie',
 'animal',
 'whole',
 'strip',
 'brick',
 'popped',
 'puff',
 'complete',
 'leafy',
 'form',
 'con',
 'deep',
 'excluding',
 'sulfured',
 'casserole',
 'spartan',
 'steamed',
 'shell',
 'beverage',
 'fruit',
 'vegetarian',
 'boneless',
 'lite',
 'breakfast',
 'chard',
 'heat',
 'kashi',
 'composite',
 'germ',
 'double',
 'quaker',
 'jr',
 'young',
 'goat',
 'reeses',
 'alfredo',
 'than',
 'semisweet',
 'about',
 'unenriched',
 'marinade',
 'baking',
 'chow',
 'ready',
 'poultry',
 'cheez',
 'soft',
 'mung',
 'high',
 'multigrain',
 'farmer',
 'diet',
 'lowfat',
 'italian',
 'major',
 'no',
 'verde',
 'bay',
 'yellow',
 'commercially',
 'brie',
 'mill',
 'degermed',
 'balsamic',
 'root',
 'red',
 'nature',
 'covered',
 'poached',
 'raised',
 'california',
 'prepared',
 'freshly',
 'blue',
 'post',
 'cotija',
 'golean',
 'glazed',
 'processed',
 'microwaving',
 'salted',
 'dripping',
 'aluminum',
 'powdered',
 'shoulder',
 'broiler',
 'all',
 'creamy',
 'lower',
 '011213162966',
 'mush',
 'like',
 'carne',
 'cappuccino',
 'half',
 'spice',
 'noor',
 'crust',
 'de',
 'canadian',
 'base',
 'thick',
 'pad',
 'cutlet',
 'bar',
 'condensed',
 'snap',
 'caffeine',
 'hoisin',
 'sweetened',
 'upc',
 'nugget',
 'cup',
 'king',
 'sponge',
 'sprinkle',
 'commodity',
 'small',
 'curd',
 'bran',
 'link',
 'combination',
 'england',
 'deli',
 'acting',
 'dog',
 'skinless',
 'special',
 'smooth',
 'hoagie',
 'skim',
 'vitamin']

In [11]:
# Characters stripped from the text before tokenizing: every ASCII punctuation
# mark except ',' and '-'. Those two are kept so that clean_text can still split
# on them (e.g. the comma-joined `top1` strings) instead of fusing neighbouring words.
punct = "".join(ch for ch in string.punctuation if ch not in ",-")
def clean_text(text):
    """Reduce a text string to a set of lemmatized, non-stopword tokens.

    Steps, in order:
      1. Drop every character contained in the module-level ``punct`` string.
      2. Split on each single non-word character. Inside the character class
         ``+`` is a literal, not a quantifier, so a run of separators yields
         empty tokens; these are discarded by the ``''`` entry in ``stopwords``.
      3. Discard tokens found in the module-level ``stopwords`` list and
         lemmatize the remaining ones with the WordNet lemmatizer ``wn``.

    Args:
        text: The string to clean. No case folding happens here; the callers
            in this notebook pass already lower-cased text.

    Returns:
        set: The unique lemmatized tokens.
    """
    stripped = "".join(ch for ch in text if ch not in punct)
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    tokens = re.split(r'[-\W+]', stripped)
    # NOTE(review): stopwords are matched against the raw token, i.e. before
    # lemmatization, so an inflected form whose lemma is a stopword is kept.
    # Left unchanged so existing scores stay reproducible — confirm intent.
    return {wn.lemmatize(token) for token in tokens if token not in stopwords}

# Lower-case each reference Food_Description and reduce it to its set of cleaned tokens
im2r_2['clean_description'] = im2r_2['Food_Description'].apply(lambda x: clean_text(x.lower()))

In [12]:
# Join the items of each row's `top1` entry into one comma-separated string
# (presumably the predicted ingredient list — confirm against the JSON)
im2r_2['top1'] = [
    ','.join(str(item) for item in items)
    for items in im2r_2['top1']
]

## Now clean text descriptions for Im2R 

In [13]:
def clean_text(text):
    """Reduce a text string to a set of lemmatized, non-stopword tokens.

    This cell re-declares ``clean_text``; an equivalent definition appears in
    an earlier cell of the notebook.

    Steps, in order:
      1. Drop every character contained in the module-level ``punct`` string.
      2. Split on each single non-word character. Inside the character class
         ``+`` is a literal, not a quantifier, so a run of separators yields
         empty tokens; these are discarded by the ``''`` entry in ``stopwords``.
      3. Discard tokens found in the module-level ``stopwords`` list and
         lemmatize the remaining ones with the WordNet lemmatizer ``wn``.

    Args:
        text: The string to clean. No case folding happens here; the callers
            in this notebook pass already lower-cased text.

    Returns:
        set: The unique lemmatized tokens.
    """
    stripped = "".join(ch for ch in text if ch not in punct)
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    tokens = re.split(r'[-\W+]', stripped)
    # NOTE(review): stopwords are matched against the raw token, i.e. before
    # lemmatization, so an inflected form whose lemma is a stopword is kept.
    # Left unchanged so existing scores stay reproducible — confirm intent.
    return {wn.lemmatize(token) for token in tokens if token not in stopwords}

# Lower-case each comma-joined `top1` string and reduce it to its set of cleaned tokens
im2r_2['clean_im2r'] = im2r_2['top1'].apply(lambda x: clean_text(x.lower()))

In [14]:
def string_detection(a, b):
    """Return the token-level F1 score between two token collections.

    Both inputs are treated as sets (duplicates are ignored), so the counts of
    true positives, false positives and false negatives are always consistent:

        tp = |a & b|,  precision = tp / |b|,  recall = tp / |a|

    Args:
        a: Iterable of hashable tokens; in this notebook, the cleaned
            reference description tokens.
        b: Iterable of hashable tokens; in this notebook, the cleaned
            Im2Recipe tokens.

    Returns:
        float: ``2 * precision * recall / (precision + recall)``, or ``0.0``
        when the two collections share no token (including when either one
        is empty). The score is symmetric in ``a`` and ``b``.
    """
    reference, predicted = set(a), set(b)
    tp = len(reference & predicted)
    if tp == 0:
        # No overlap: precision and recall are both zero (or undefined for
        # empty input), so the F1 score is defined as 0 here.
        return 0.0
    precision = tp / len(predicted)
    recall = tp / len(reference)
    return (2 * precision * recall) / (precision + recall)

In [15]:
# Score every row: reference tokens (clean_description) against Im2Recipe tokens (clean_im2r)
result = list(map(string_detection, im2r_2['clean_description'], im2r_2['clean_im2r']))

In [16]:
# Store the per-row F1 scores (same order as the rows of im2r_2) as a new column
im2r_2['F1_score'] = result

In [17]:
# Write the scored table to CSV. `index` is documented as a boolean, so use
# index=False explicitly (rather than relying on None being falsy) to keep the
# row index out of the file.
im2r_2.to_csv('../output/snapme_im2r_f1_score_050823.csv', index=False)