# Natural Language Process on Dataset

In [165]:
import os
import numpy as np
import pandas as pd
import nltk
nltk.download('reuters')
from nltk.corpus import reuters
from nltk.tokenize import MWETokenizer
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import spacy
nlp = spacy.load("en_core_web_sm")
from collections import Counter

# nltk.download('stopwords')
# nltk.download('wordnet')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\gghan\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


## Import Files

In [166]:
food_protein =  pd.read_csv (r'C:\Users\gghan\OneDrive\Desktop\CapstoneProject\ProcessedData\food_protein.csv')
food_nutrient =  pd.read_csv (r'C:\Users\gghan\OneDrive\Desktop\CapstoneProject\ProcessedData\food_nutrient_processed.csv')

In [167]:
food_protein.head()

Unnamed: 0.1,Unnamed: 0,fdc_id,data_type,description,food_category_id,protein_value,fat_value,carbohydrate_value
0,0,167518,sr_legacy_food,"Waffle, buttermilk, frozen, ready-to-heat, mic...",,4.0,9.0,4.0
1,1,167532,sr_legacy_food,"Bread, white wheat",,4.0,9.0,4.0
2,2,167537,sr_legacy_food,"Snacks, corn-based, extruded, chips, plain",,2.7,8.8,4.0
3,3,167538,sr_legacy_food,"Snacks, corn-based, extruded, chips, barbecue-...",,2.7,8.7,3.9
4,4,167539,sr_legacy_food,"Snacks, corn-based, extruded, cones, plain",,3.5,8.6,4.1


In [168]:
food_nutrient.head()

Unnamed: 0.1,Unnamed: 0,id,fdc_id,nutrient_id,description,amount,derivation_id,data_type,food_category_id,name,unit_name,nutrient_nbr,rank
0,0,1283674,167512,1003,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",5.88,46.0,sr_legacy_food,,Protein,G,203.0,600.0
1,1,1283688,167513,1003,"Pillsbury, Cinnamon Rolls with Icing, refriger...",4.34,47.0,sr_legacy_food,,Protein,G,203.0,600.0
2,2,1283712,167514,1003,"Kraft Foods, Shake N Bake Original Recipe, Coa...",6.1,1.0,sr_legacy_food,,Protein,G,203.0,600.0
3,3,1283725,167515,1003,"George Weston Bakeries, Thomas English Muffins",8.0,47.0,sr_legacy_food,,Protein,G,203.0,600.0
4,4,1283760,167516,1003,"Waffles, buttermilk, frozen, ready-to-heat",6.58,1.0,sr_legacy_food,,Protein,G,203.0,600.0


In [169]:
for food in food_protein['description']:

    print(food)

Waffle, buttermilk, frozen, ready-to-heat, microwaved
Bread, white wheat
Snacks, corn-based, extruded, chips, plain
Snacks, corn-based, extruded, chips, barbecue-flavor
Snacks, corn-based, extruded, cones, plain
Snacks, fruit leather, pieces
Snacks, fruit leather, rolls
Snacks, granola bars, hard, plain
Snacks, granola bars, hard, almond
Snacks, granola bars, soft, uncoated, raisin
Snacks, granola bars, soft, coated, milk chocolate coating, chocolate chip
Candies, honey-combed, with peanut butter
Snacks, granola bars, soft, coated, milk chocolate coating, peanut butter
Snacks, granola bars, soft, uncoated, peanut butter and chocolate chip
Snacks, popcorn, oil-popped, microwave, regular flavor, no trans fat
Snacks, popcorn, cakes
Snacks, popcorn, caramel-coated, with peanuts
Snacks, popcorn, caramel-coated, without peanuts
Snacks, potato chips, made from dried potatoes, reduced fat
Snacks, potato chips, made from dried potatoes, sour-cream and onion-flavor
Snacks, pretzels, hard, plain,

Cereals, QUAKER, corn grits, instant, plain, prepared (microwaved or boiling water added), without salt
Cereals, CREAM OF RICE, dry
Cereals, CREAM OF WHEAT, regular, 10 minute cooking, dry
Cereals, CREAM OF WHEAT, instant, prepared with water, without salt
Cereals, MALT-O-MEAL, original, plain, dry
Cereals, oats, regular and quick, not fortified, dry
Cereals, oats, regular and quick, unenriched, cooked with water (includes boiling and microwaving), without salt
Cereals ready-to-eat, rice, puffed, fortified
Cereals ready-to-eat, wheat, puffed, fortified
Cereals, CREAM OF RICE, cooked with water, with salt
Cereals, CREAM OF WHEAT, regular (10 minute), cooked with water, with salt
Cereals, farina, unenriched, dry
Cereals, farina, enriched, cooked with water, with salt
Apples, raw, without skin, cooked, boiled
Apples, raw, without skin, cooked, microwave
Apples, canned, sweetened, sliced, drained, heated
Apples, dehydrated (low moisture), sulfured, uncooked
Apples, frozen, unsweetened, hea

## Tokenization Functions

In [170]:
# Function for tokenization

def food_tokenize(document):
    word_list = []
    analyzed = nlp(document)
    # Loop through the word list
    for token in analyzed:
        if token.is_alpha and not token.is_stop:
            possible_add = token.lemma_.lower()       #Lemmatize and lowercase
            word_list.append(possible_add)
    return word_list

In [171]:
def multiword_tokenize(doc, num_words):
    multiword_list = []
    
    # Set stop words and lemmatizer
    stops = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    
    # food_name = doc.partition(',')[0]
    # num_words = food_name.split()
    
    # Set up multiple word expressions
    mwe = MWETokenizer([num_words])
    doc_nocomma = doc.replace(',','')
    multi_analyzed = mwe.tokenize(doc_nocomma.split())
    
    #Loop through the word list
    for multi_token in multi_analyzed:
        if multi_token.isalpha and multi_token not in stops:
            # Lowercase
            possible_add = multi_token.lower()
            # Lemmatize
            possible_add = lemmatizer.lemmatize(possible_add)
            multiword_list.append(possible_add)
            
    return multiword_list
            
        

In [172]:
# Testing Function

# doc = 'Parmesan cheese topping, fat free'
# food_name = doc.partition(',')[0]
# num_words = food_name.split()
# multiword_tokenize(doc, num_words)

## Food Protein Data NLP

### Tokenize food names

In [173]:
# Convert into a dataframe filled with lower cased form lists
food_dataframe = pd.DataFrame(columns = ['food_tokens'])
delimiter = ','
for food in food_protein['description']:
    food_name = food.partition(',')[0]       #Split food descriptions by commas
    num_words = food_name.split()        # Create a list of words using food descriptions
    
    # First word phrase is the name of the food, if the word phrase only have one word, tokenize it. Otherwise, use multword_tokenize function.
    if num_words == 1:                           
        food = delimiter.join(food)
        food_token = food_tokenize(food)
        food_dataframe = food_dataframe.append({'food_tokens': food_token}, ignore_index = True)
    else:
        multifood_token = multiword_tokenize(food, num_words)
        food_dataframe = food_dataframe.append({'food_tokens': multifood_token}, ignore_index = True)


In [174]:
# Display the result of food tokens
for food in food_dataframe['food_tokens']:

    print(food)

['waffle', 'buttermilk', 'frozen', 'ready-to-heat', 'microwaved']
['bread', 'white', 'wheat']
['snack', 'corn-based', 'extruded', 'chip', 'plain']
['snack', 'corn-based', 'extruded', 'chip', 'barbecue-flavor']
['snack', 'corn-based', 'extruded', 'cone', 'plain']
['snack', 'fruit', 'leather', 'piece']
['snack', 'fruit', 'leather', 'roll']
['snack', 'granola', 'bar', 'hard', 'plain']
['snack', 'granola', 'bar', 'hard', 'almond']
['snack', 'granola', 'bar', 'soft', 'uncoated', 'raisin']
['snack', 'granola', 'bar', 'soft', 'coated', 'milk', 'chocolate', 'coating', 'chocolate', 'chip']
['candy', 'honey-combed', 'peanut', 'butter']
['snack', 'granola', 'bar', 'soft', 'coated', 'milk', 'chocolate', 'coating', 'peanut', 'butter']
['snack', 'granola', 'bar', 'soft', 'uncoated', 'peanut', 'butter', 'chocolate', 'chip']
['snack', 'popcorn', 'oil-popped', 'microwave', 'regular', 'flavor', 'trans', 'fat']
['snack', 'popcorn', 'cake']
['snack', 'popcorn', 'caramel-coated', 'peanut']
['snack', 'popco

['veal', 'shoulder', 'blade', 'chop', 'separable', 'lean', 'raw']
['veal', 'shoulder', 'blade', 'separable', 'lean', 'cooked', 'braised']
['veal', 'sirloin', 'separable', 'lean', 'raw']
['veal', 'sirloin', 'separable', 'lean', 'cooked', 'braised']
['veal', 'sirloin', 'separable', 'lean', 'cooked', 'roasted']
['veal', 'cubed', 'stew', '(leg', 'shoulder)', 'separable', 'lean', 'raw']
['game_meat', 'antelope', 'cooked', 'roasted']
['game_meat', 'bear', 'raw']
['game_meat', 'bear', 'cooked', 'simmered']
['game_meat', 'beefalo', 'composite', 'cut', 'cooked', 'roasted']
['game_meat', 'bison', 'separable', 'lean', 'raw']
['game_meat', 'bison', 'separable', 'lean', 'cooked', 'roasted']
['game_meat', 'caribou', 'raw']
['game_meat', 'caribou', 'cooked', 'roasted']
['game_meat', 'deer', 'raw']
['bologna', 'pork']
['bologna', 'turkey']
['chicken_spread']
['corned_beef_loaf', 'jellied']
['dutch_brand_loaf', 'chicken', 'pork', 'beef']
['frankfurter', 'beef', 'unheated']
['ham', 'sliced', 'regular', 

In [175]:
print(f'Total number of Observations: {len(food_dataframe.index)}') 

Total number of Observations: 4771


### Update NLP results to the data

#### Select first tokens as food names

In [176]:
processed_des = pd.DataFrame(columns = ['food_name'])
for food in food_dataframe['food_tokens']:
    processed_food = food[0]
    processed_des = processed_des.append({'food_name': processed_food}, ignore_index = True)

In [177]:
for food in processed_des['food_name']:

    print(food)

waffle
bread
snack
snack
snack
snack
snack
snack
snack
snack
snack
candy
snack
snack
snack
snack
snack
snack
snack
snack
snack
snack
snack
snack
snack
candy
candy
baking_chocolate
candy
candy
candy
ice_cream
ice_cream
ice_cream
ice_cream
sherbet
candy
sweet_potatoes
deer_(venison)
restaurant
restaurant
restaurant
beverage
frozen_novelties
babyfood
sausage
pork_sausage_rice_links
salad_dressing
oil
beverage
beverage
pork
babyfood
babyfood
frozen_novelties
mayonnaise_dressing
pie_fillings
beverage
pudding
pudding
syrup
lemon
lemon_juice
lemon_juice_from_concentrate
prickly_pear
plum
plum
prune_juice
pummelo
raspberry
raspberry
raspberry
rhubarb
sapodilla
sapote
soursop
strawberry
tamarind
fruit_salad
watermelon
maraschino_cherry
pineapple
apricot
cherry
cherry
apple_juice
applesauce
applesauce
grapefruit_juice
apple_juice
abiyuch
rowal
guava_nectar
mango_nectar
tamarind_nectar
pomegranate_juice
nance
nance
naranjilla_(lulo)_pulp
horned_melon_(kiwano)
orange_juice
fruit_juice_smoothie
fru

fish
fish
fish
mollusk
mollusk
fish
lima_bean
lima_bean
lima_bean
lima_bean
mung_bean
mung_bean
noodle
mungo_beans
peanut
peanut
peanut
peanut
peanut
peanut_butter
peanut_butter
peanut_flour
meat_extender
sausage
soybean
tempeh
soy_flour
soy_flour
soy_flour
soy_protein_isolate
soy_sauce_made_from_soy_and_wheat_(shoyu)
soy_sauce_made_from_soy_(tamari)
soy_sauce_made_from_hydrolyzed_vegetable_protein
tofu
yardlong_beans
yardlong_beans
winged_bean
lentil
bean
bean
chickpea_flour_(besan)
hummus
tofu
tofu
peanut_butter
refried_beans
soybean
soybean
soy_protein_concentrate
soy_protein_isolate
tofu
tofu
tofu
yardlong_beans
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
game_meat
game_meat
game_meat
game_meat
game_meat
game_meat
lamb
veal
veal
veal
lamb
lamb
veal
veal
veal
lamb
lamb
veal
veal
veal
lamb
lamb
lamb
lamb
veal
veal
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
lamb
l

#### Update original dataset

In [178]:
food_name = processed_des['food_name']
food_protein = food_protein.join(food_name)

In [179]:
food_protein.head()

Unnamed: 0.1,Unnamed: 0,fdc_id,data_type,description,food_category_id,protein_value,fat_value,carbohydrate_value,food_name
0,0,167518,sr_legacy_food,"Waffle, buttermilk, frozen, ready-to-heat, mic...",,4.0,9.0,4.0,waffle
1,1,167532,sr_legacy_food,"Bread, white wheat",,4.0,9.0,4.0,bread
2,2,167537,sr_legacy_food,"Snacks, corn-based, extruded, chips, plain",,2.7,8.8,4.0,snack
3,3,167538,sr_legacy_food,"Snacks, corn-based, extruded, chips, barbecue-...",,2.7,8.7,3.9,snack
4,4,167539,sr_legacy_food,"Snacks, corn-based, extruded, cones, plain",,3.5,8.6,4.1,snack


#### Save to csv file

In [None]:
food_protein.to_csv('NLP_food_protein.csv')

### Raw food number test

In [180]:
df_raw = pd.DataFrame(columns = ['raw_food_tokens'])
for food in food_dataframe['food_tokens']:
    if 'raw' in food:
        df_raw = df_raw.append({'raw_food_tokens': food}, ignore_index = True)

In [181]:
for food in df_raw['raw_food_tokens']:

    print(food)

['deer_(venison)', 'sitka', 'raw', '(alaska', 'native)']
['lemon', 'raw', 'without', 'peel']
['lemon_juice', 'raw']
['prickly_pear', 'raw']
['pummelo', 'raw']
['raspberry', 'raw']
['rhubarb', 'raw']
['sapodilla', 'raw']
['sapote', 'mamey', 'raw']
['soursop', 'raw']
['strawberry', 'raw']
['tamarind', 'raw']
['watermelon', 'raw']
['grapefruit_juice', 'pink', 'raw']
['abiyuch', 'raw']
['rowal', 'raw']
['pork', 'fresh', 'composite', 'trimmed', 'leg', 'loin', 'shoulder', 'sparerib', '(includes', 'cut', 'cured)', 'separable', 'lean', 'fat', 'raw']
['pork', 'fresh', 'backfat', 'raw']
['pork', 'fresh', 'belly', 'raw']
['pork', 'fresh', 'separable', 'fat', 'raw']
['pork', 'fresh', 'leg', '(ham)', 'rump', 'half', 'separable', 'lean', 'fat', 'raw']
['pork', 'fresh', 'leg', '(ham)', 'rump', 'half', 'separable', 'lean', 'raw', '(includes', 'food', "usda's", 'food', 'distribution', 'program)']
['pork', 'fresh', 'loin', 'whole', 'separable', 'lean', 'fat', 'raw']
['pork', 'fresh', 'loin', 'blade', '(

In [182]:
print(f'raw food number: {len(df_raw.index)}') 

raw food number: 971


## Food Nutrient Data NLP

In [199]:
partial_food_nutrient =  pd.DataFrame(columns = ['fdc_id', 'data_type', 'description', 'food_category_id', 'protein_value', 'fat_value', 'carbohydrate_value'])
fdc_list = food_nutrient['fdc_id'].tolist()
len(fdc_list)
for fdc_id in fdc_list:
        if fdc_id not in food_protein['fdc_id']:
            
            food_nutrient = food_nutrient.drop(food_nutrient[food_nutrient['fdc_id'] == fdc_id].index)

KeyboardInterrupt: 

### Tokenize food names

In [197]:
# Convert into a dataframe filled with lower cased form lists
food_dataframe = pd.DataFrame(columns = ['food_tokens'])
delimiter = ','
for food in food_nutrient['description']:
    food_name = food.partition(',')[0]       #Split food descriptions by commas
    num_words = food_name.split()        # Create a list of words using food descriptions
    
    # First word phrase is the name of the food, if the word phrase only have one word, tokenize it. Otherwise, use multword_tokenize function.
    if num_words == 1:                           
        food = delimiter.join(food)
        food_token = food_tokenize(food)
        food_dataframe = food_dataframe.append({'food_tokens': food_token}, ignore_index = True)
    else:
        multifood_token = multiword_tokenize(food, num_words)
        food_dataframe = food_dataframe.append({'food_tokens': multifood_token}, ignore_index = True)

In [None]:
# Display the result of food tokens
for food in food2dataframe['food_tokens']:

    print(food)

In [None]:
print(f'Total number of Observations: {len(food_dataframe.index)}') 

### Update NLP results to the data

#### Select first tokens as food names

In [None]:
processed_des = pd.DataFrame(columns = ['food_name'])
for food in food_dataframe['food_tokens']:
    processed_food = food[0]
    processed_des = processed_des.append({'food_name': processed_food}, ignore_index = True)

In [None]:
for food in processed_des['food_name']:

    print(food)

#### Update original dataset

In [None]:
food_name = processed_des['food_name']
food_nutrient = food_protein.join(food_name)

In [None]:
food_nutrient.head()

#### Save to csv file

In [None]:
food_protein.to_csv('NLP_food_protein.csv')