In [1]:
# Mount Google Drive so files under the mount point can be read by later cells.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Download and install the en_core_web_lg spaCy model package (shell escape).
!python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.3.0/en_core_web_lg-3.3.0-py3-none-any.whl (400.7 MB)
[K     |████████████████████████████████| 400.7 MB 6.4 kB/s 
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [3]:
import en_core_web_lg
import pandas as pd
import re
import random
import spacy
from spacy.util import minibatch, compounding
import warnings
import matplotlib.pyplot as plt

In [24]:
# Location of the food CSV on the mounted Drive.
FOOD_CSV_PATH = "/content/drive/MyDrive/job work/food.csv"

food_df = pd.read_csv(FOOD_CSV_PATH)
food_df.head()  # last expression of the cell: preview the first rows

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,1105904,branded_food,WESSON Vegetable Oil 1 GAL,,2020-11-13
1,1105905,branded_food,SWANSON BROTH BEEF,,2020-11-13
2,1105906,branded_food,CAMPBELL'S SLOW KETTLE SOUP CLAM CHOWDER,,2020-11-13
3,1105907,branded_food,CAMPBELL'S SLOW KETTLE SOUP CHEESE BROCCOLI,,2020-11-13
4,1105898,experimental_food,Discrepancy between the Atwater factor predict...,,2020-10-30


In [25]:
# Number of rows in the description column before any filtering.
len(food_df["description"])

1687318

In [26]:
# Keep only descriptions made purely of ASCII letters and spaces, lower-cased.
descriptions = food_df["description"]
# na=True flags missing descriptions as "has other characters", so they are dropped too.
has_other_characters = descriptions.str.contains("[^a-zA-Z ]", na=True)
foods = descriptions[~has_other_characters].str.lower()

In [27]:
# Restrict to names of at most three whitespace-separated words, then drop repeats.
words_per_food = foods.str.split().str.len()
foods = foods[words_per_food <= 3].drop_duplicates()

In [28]:
# How many distinct short food names survived the filters.
len(foods)

41634

In [29]:
# Preview the filtered food names (lower-cased, at most three words, de-duplicated).
foods

1               swanson broth beef
9            swanson broth chicken
31         pepperidge farm cookies
41           pepperidge farm bread
48         swanson broth vegetable
                    ...           
1678853               honey liquor
1678854            blackberry beer
1678976                  pignolias
1679000          spooky rope candy
1679012        edible zombie slime
Name: description, Length: 41634, dtype: object

In [30]:
# Bucket the food names by word count; the counts are computed once and reused.
food_word_counts = foods.str.split().str.len()
one_worded_foods = foods[food_word_counts == 1]
two_worded_foods = foods[food_word_counts == 2]
three_worded_foods = foods[food_word_counts == 3]

In [37]:
# Number of one-word food names.
len(one_worded_foods)

1453

In [35]:
# Number of two-word food names.
len(two_worded_foods)

14022

In [36]:
# Number of three-word food names.
len(three_worded_foods)

26159

In [38]:
# All one-word foods followed by the first 1000 two-word foods.
# pd.concat is used because Series.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
foods = pd.concat([one_worded_foods, two_worded_foods[:1000]])

In [40]:
# Final pool: every one-word food, then the first 1000 two-word foods, then the
# first 1000 three-word foods. It is rebuilt from the three source series in a
# single pd.concat so that re-running this cell cannot add the three-word foods
# a second time, and so it does not rely on Series.append (deprecated in
# pandas 1.4, removed in pandas 2.0).
foods = pd.concat([one_worded_foods, two_worded_foods[:1000], three_worded_foods[:1000]])

In [43]:
# Sentence templates used to synthesise FOOD training sentences.
# Every "{}" is a placeholder for one food name; a template holds one, two or
# three placeholders. The templates are not grouped by placeholder count.
food_templates = [
    "I ate my {}",
    "I'm eating a {}",
    "I just ate a {}",
    "I only ate the {}",
    "I'm done eating a {}",
    "I've already eaten a {}",
    "I just finished my {}",
    "When I was having lunch I ate a {}",
    "I had a {} and a {} today",
    "I ate a {} and a {} for lunch",
    "I made a {} and {} for lunch",
    "I ate {} and {}",
    "today I ate a {} and a {} for lunch",
    "I had {} with my husband last night",
    "I brought you some {} on my birthday",
    "I made {} for yesterday's dinner",
    "last night, a {} was sent to me with {}",
    "I had {} yesterday and I'd like to eat it anyway",
    "I ate a couple of {} last night",
    "I had some {} at dinner last night",
    "Last night, I ordered some {}",
    "I made a {} last night",
    "I had a bowl of {} with {} and I wanted to go to the mall today",
    "I brought a basket of {} for breakfast this morning",
    "I had a bowl of {}",
    "I ate a {} with {} in the morning",
    "I made a bowl of {} for my breakfast",
    "There's {} for breakfast in the bowl this morning",
    "This morning, I made a bowl of {}",
    "I decided to have some {} as a little bonus",
    "I decided to enjoy some {}",
    "I've decided to have some {} for dessert",
    "I had a {}, a {} and {} at home",
    "I took a {}, {} and {} on the weekend",
    "I ate a {} with {} and {} just now",
    "Last night, I ate an {} with {} and {}",
    "I tasted some {}, {} and {} at the office",
    "There's a basket of {}, {} and {} that I consumed",
    "I devoured a {}, {} and {}",
    "I've already had a bag of {}, {} and {} from the fridge"
]

In [44]:
# Bucket names, one per number of food placeholders in a sentence.
_FOOD_BUCKET_KEYS = ("one_food", "two_foods", "three_foods")

# Every bucket starts as its own empty list of (sentence, annotations) records.
TRAIN_FOOD_DATA = {bucket_key: [] for bucket_key in _FOOD_BUCKET_KEYS}
TEST_FOOD_DATA = {bucket_key: [] for bucket_key in _FOOD_BUCKET_KEYS}

# How many sentences a TRAIN bucket accepts before get_food_data hands out the
# matching TEST bucket instead.
FOOD_SENTENCE_LIMIT = 167

def get_food_data(count):
    """Return the list that a new sentence containing ``count`` foods goes into.

    The TRAIN bucket for that count is returned while it holds fewer than
    FOOD_SENTENCE_LIMIT sentences; after that the matching TEST bucket is
    returned.

    Args:
        count: Number of foods in the sentence (1, 2 or 3).

    Returns:
        The list object from TRAIN_FOOD_DATA or TEST_FOOD_DATA (not a copy).

    Raises:
        KeyError: If ``count`` is not 1, 2 or 3.
    """
    bucket_key = {1: "one_food", 2: "two_foods", 3: "three_foods"}[count]
    train_bucket = TRAIN_FOOD_DATA[bucket_key]
    if len(train_bucket) < FOOD_SENTENCE_LIMIT:
        return train_bucket
    return TEST_FOOD_DATA[bucket_key]

pattern_to_replace = "{}"

# Fixed seed so the shuffle and the template choices are the same on every
# fresh run of the notebook.
FOOD_DATA_SEED = 42
# Dedicated generator: seeding it leaves the global `random` state untouched.
food_rng = random.Random(FOOD_DATA_SEED)


def fill_food_template(template, template_foods, placeholder=pattern_to_replace):
    """Insert foods into a template and return the sentence with entity spans.

    Offsets are taken from the position of the placeholder being replaced,
    not by searching the finished sentence for the food text. The same text
    can occur earlier in the sentence (in the template wording, or inside a
    previously inserted food such as "rice cakes" before "rice"), and a text
    search would then report the wrong, possibly overlapping, span.

    Args:
        template: Sentence containing one ``placeholder`` per food.
        template_foods: Food names, in the order they are inserted.
        placeholder: Marker text that each food replaces.

    Returns:
        ``(sentence, entities)`` where ``entities`` is a list of
        ``(start, end, "FOOD")`` character offsets into ``sentence``.

    Raises:
        ValueError: If the template has fewer placeholders than foods.
    """
    sentence = template
    entities = []
    search_from = 0
    for food in template_foods:
        # Look only past the previous insertion so inserted text is never
        # mistaken for a placeholder.
        start = sentence.index(placeholder, search_from)
        sentence = sentence[:start] + food + sentence[start + len(placeholder):]
        end = start + len(food)
        entities.append((start, end, "FOOD"))
        search_from = end
    return sentence, entities


foods = foods.sample(frac=1, random_state=FOOD_DATA_SEED)

# Index of the next unused food; foods are consumed from the end backwards.
food_entity_count = foods.size - 1

# Stop while at least three foods remain addressable (indices 2, 1, 0), so a
# three-placeholder template can never index below zero.
while food_entity_count >= 2:
    template = food_rng.choice(food_templates)
    placeholder_count = template.count(pattern_to_replace)

    template_foods = [foods.iloc[food_entity_count - taken] for taken in range(placeholder_count)]
    food_entity_count -= placeholder_count

    sentence, entities = fill_food_template(template, template_foods)

    get_food_data(placeholder_count).append((sentence, {"entities": entities}))

In [47]:
# Inspect the generated training sentences and their FOOD entity spans.
TRAIN_FOOD_DATA

{'one_food': [('I had french baguette with my husband last night',
   {'entities': [(6, 21, 'FOOD')]}),
  ('This morning, I made a bowl of waffle fries',
   {'entities': [(31, 43, 'FOOD')]}),
  ('I brought you some chunky crushed tomatoes on my birthday',
   {'entities': [(19, 42, 'FOOD')]}),
  ("I'm done eating a uncured ham", {'entities': [(18, 29, 'FOOD')]}),
  ('I had broccoli cuts with my husband last night',
   {'entities': [(6, 19, 'FOOD')]}),
  ('I just ate a premium mangos', {'entities': [(13, 27, 'FOOD')]}),
  ('I ate my caponata', {'entities': [(9, 17, 'FOOD')]}),
  ("I made kosher dill pickles for yesterday's dinner",
   {'entities': [(7, 26, 'FOOD')]}),
  ('I only ate the passion flower', {'entities': [(15, 29, 'FOOD')]}),
  ("There's reddiegg for breakfast in the bowl this morning",
   {'entities': [(8, 16, 'FOOD')]}),
  ("I made smoky bbq sauce for yesterday's dinner",
   {'entities': [(7, 22, 'FOOD')]}),
  ("There's puffins for breakfast in the bowl this morning",
   {'

In [49]:
import json

# Write the training sentences to disk; JSON stores the entity tuples as lists.
train_json_path = "TRAIN_FOOD_DATA(1).json"
with open(train_json_path, "w") as train_file:
    json.dump(TRAIN_FOOD_DATA, train_file)

In [50]:
import json

# Write the held-out sentences to disk; JSON stores the entity tuples as lists.
test_json_path = "TEST_FOOD_DATA(1).json"
with open(test_json_path, "w") as test_file:
    json.dump(TEST_FOOD_DATA, test_file)