# Natural Language Processing on the Food Dataset

In [1]:
import os
import numpy as np
import pandas as pd
import nltk
# Download the NLTK 'reuters' package (the recorded output shows it was already up-to-date).
nltk.download('reuters')
# NOTE(review): `reuters` is not referenced in the cells shown here — confirm it is still needed.
from nltk.corpus import reuters
import spacy
# Module-level spaCy pipeline; food_tokenize() reads this `nlp` object when it is called.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): `Counter` is not referenced in the cells shown here — confirm it is still needed.
from collections import Counter

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\gghan\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


## Import Files

In [2]:
# Location of the processed food/protein CSV.
# The original machine-specific path stays as the default so existing runs are unchanged;
# set the FOOD_PROTEIN_CSV environment variable to point the notebook at another copy.
DEFAULT_FOOD_CSV = r'C:\Users\gghan\OneDrive\Desktop\CapstoneProject\ProcessedData\food_protein.csv'
food_csv_path = os.environ.get('FOOD_PROTEIN_CSV', DEFAULT_FOOD_CSV)

nlpdata = pd.read_csv(food_csv_path)
# Preview the first rows; the 'description' column holds the food names tokenized later on.
nlpdata.head()

Unnamed: 0.1,Unnamed: 0,fdc_id,data_type,description,food_category_id,protein_value,fat_value,carbohydrate_value
0,0,167518,sr_legacy_food,"Waffle, buttermilk, frozen, ready-to-heat, mic...",,4.0,9.0,4.0
1,1,167532,sr_legacy_food,"Bread, white wheat",,4.0,9.0,4.0
2,2,167537,sr_legacy_food,"Snacks, corn-based, extruded, chips, plain",,2.7,8.8,4.0
3,3,167538,sr_legacy_food,"Snacks, corn-based, extruded, chips, barbecue-...",,2.7,8.7,3.9
4,4,167539,sr_legacy_food,"Snacks, corn-based, extruded, cones, plain",,3.5,8.6,4.1


In [7]:
# Print every food description on its own line.
for description in nlpdata['description'].tolist():
    print(description)

Waffle, buttermilk, frozen, ready-to-heat, microwaved
Bread, white wheat
Snacks, corn-based, extruded, chips, plain
Snacks, corn-based, extruded, chips, barbecue-flavor
Snacks, corn-based, extruded, cones, plain
Snacks, fruit leather, pieces
Snacks, fruit leather, rolls
Snacks, granola bars, hard, plain
Snacks, granola bars, hard, almond
Snacks, granola bars, soft, uncoated, raisin
Snacks, granola bars, soft, coated, milk chocolate coating, chocolate chip
Candies, honey-combed, with peanut butter
Snacks, granola bars, soft, coated, milk chocolate coating, peanut butter
Snacks, granola bars, soft, uncoated, peanut butter and chocolate chip
Snacks, popcorn, oil-popped, microwave, regular flavor, no trans fat
Snacks, popcorn, cakes
Snacks, popcorn, caramel-coated, with peanuts
Snacks, popcorn, caramel-coated, without peanuts
Snacks, potato chips, made from dried potatoes, reduced fat
Snacks, potato chips, made from dried potatoes, sour-cream and onion-flavor
Snacks, pretzels, hard, plain,

Chicken, broilers or fryers, thigh, meat and skin, cooked, fried, flour
Chicken, broilers or fryers, thigh, meat and skin, cooked, roasted
Chicken, broilers or fryers, thigh, meat and skin, cooked, stewed
Chicken, broilers or fryers, dark meat, thigh, meat only, raw
Chicken, broilers or fryers, wing, meat and skin, cooked, fried, batter
Chicken, broilers or fryers, wing, meat and skin, cooked, fried, flour
Chicken, broilers or fryers, wing, meat and skin, cooked, roasted
Chicken, broilers or fryers, wing, meat and skin, cooked, stewed
Chicken, broilers or fryers, wing, meat only, raw
Chicken, roasting, meat and skin and giblets and neck, cooked, roasted
Chicken, roasting, meat and skin, cooked, roasted
Chicken, roasting, meat only, raw
Chicken, roasting, light meat, meat only, cooked, roasted
Chicken, roasting, dark meat, meat only, raw
Chicken, roasting, dark meat, meat only, cooked, roasted
Chicken, stewing, meat and skin, and giblets and neck, raw
Chicken, stewing, giblets, raw
Chic

## Tokenize Food Names

In [3]:
# Function for tokenization

def food_tokenize(document):
    """Return the lower-cased lemmas of the alphabetic, non-stop-word tokens in a text.

    Parameters
    ----------
    document : str
        Text handed to the module-level spaCy pipeline ``nlp``.

    Returns
    -------
    list of str
        One lower-cased lemma per kept token, in document order.
    """
    return [
        token.lemma_.lower()
        for token in nlp(document)
        if token.is_alpha and not token.is_stop
    ]

In [4]:
# Tokenize every food description and collect the resulting token lists in a
# one-column DataFrame ('food_tokens'), one row per description.
#
# Each description is already a plain string, so it is passed to food_tokenize()
# unchanged. Calling ','.join() on a string puts a comma between every character,
# which would leave the tokenizer with single letters instead of words.
delimiter = ','  # no longer used here; kept in case another cell still refers to it
token_lists = [food_tokenize(description) for description in nlpdata['description']]

# Build the frame in a single step: DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row copies it on every iteration.
food_dataframe = pd.DataFrame({'food_tokens': token_lists}, columns=['food_tokens'])


In [6]:
# Show the token list produced for each food, one list per line
for token_list in food_dataframe['food_tokens'].tolist():
    print(token_list)

['waffle', 'buttermilk', 'frozen', 'ready', 'heat', 'microwave']
['bread', 'white', 'wheat']
['snack', 'corn', 'base', 'extrude', 'chip', 'plain']
['snack', 'corn', 'base', 'extrude', 'chip', 'barbecue', 'flavor']
['snack', 'corn', 'base', 'extrude', 'cone', 'plain']
['snack', 'fruit', 'leather', 'piece']
['snack', 'fruit', 'leather', 'roll']
['snack', 'granola', 'bar', 'hard', 'plain']
['snack', 'granola', 'bar', 'hard', 'almond']
['snack', 'granola', 'bar', 'soft', 'uncoated', 'raisin']
['snack', 'granola', 'bar', 'soft', 'coated', 'milk', 'chocolate', 'coating', 'chocolate', 'chip']
['candy', 'honey', 'comb', 'peanut', 'butter']
['snack', 'granola', 'bar', 'soft', 'coated', 'milk', 'chocolate', 'coating', 'peanut', 'butter']
['snack', 'granola', 'bar', 'soft', 'uncoated', 'peanut', 'butter', 'chocolate', 'chip']
['snack', 'popcorn', 'oil', 'pop', 'microwave', 'regular', 'flavor', 'trans', 'fat']
['snack', 'popcorn', 'cake']
['snack', 'popcorn', 'caramel', 'coat', 'peanut']
['snack',

['grape', 'muscadine', 'raw']
['grape', 'juice', 'can', 'bottled', 'unsweetened', 'add', 'ascorbic', 'acid']
['grape', 'juice', 'can', 'bottled', 'unsweetened', 'add', 'ascorbic', 'acid']
['groundcherrie', 'cape', 'gooseberry', 'poha', 'raw']
['guavas', 'common', 'raw']
['guavas', 'strawberry', 'raw']
['beef', 'australian', 'import', 'grass', 'feed', 'ground', 'lean', 'fat', 'raw']
['beef', 'ground', 'lean', 'meat', 'fat', 'raw']
['beef', 'ground', 'lean', 'meat', 'fat', 'raw']
['beef', 'ground', 'lean', 'meat', 'fat', 'patty', 'cook', 'broil']
['beef', 'ground', 'lean', 'meat', 'fat', 'patty', 'cook', 'pan', 'broil']
['beef', 'ground', 'lean', 'meat', 'fat', 'loaf', 'cook', 'bake']
['beef', 'round', 'round', 'steak', 'separable', 'lean', 'trim', 'fat', 'choice', 'cook', 'braise']
['beef', 'short', 'loin', 'loin', 'steak', 'separable', 'lean', 'trim', 'fat', 'choice', 'raw']
['beef', 'tenderloin', 'steak', 'separable', 'lean', 'trim', 'fat', 'choice', 'cook', 'broil']
['beef', 'sirloin

In [15]:
# Report how many rows (tokenized foods) the frame holds.
print(f'Total number of Observations: {food_dataframe.shape[0]}')

Total number of Observations: 4771


## Raw Food Count Check

In [11]:
# Keep only the token lists that contain the token 'raw' and store them in a
# one-column DataFrame ('raw_food_tokens').
# The frame is built once from a filtered list: DataFrame.append was removed in
# pandas 2.0, and appending inside a loop copies the frame on every iteration.
raw_token_lists = [tokens for tokens in food_dataframe['food_tokens'] if 'raw' in tokens]
df_raw = pd.DataFrame({'raw_food_tokens': raw_token_lists}, columns=['raw_food_tokens'])

In [12]:
# Print the token list of every food that was kept as 'raw', one per line.
for raw_tokens in df_raw['raw_food_tokens'].tolist():
    print(raw_tokens)

['deer', 'venison', 'sitka', 'raw', 'alaska', 'native']
['lemon', 'raw', 'peel']
['lemon', 'juice', 'raw']
['prickly', 'pear', 'raw']
['pummelo', 'raw']
['raspberry', 'raw']
['rhubarb', 'raw']
['sapodilla', 'raw']
['sapote', 'mamey', 'raw']
['soursop', 'raw']
['strawberry', 'raw']
['tamarind', 'raw']
['watermelon', 'raw']
['grapefruit', 'juice', 'pink', 'raw']
['abiyuch', 'raw']
['rowal', 'raw']
['pork', 'fresh', 'composite', 'trim', 'leg', 'loin', 'shoulder', 'sparerib', 'include', 'cut', 'cure', 'separable', 'lean', 'fat', 'raw']
['pork', 'fresh', 'backfat', 'raw']
['pork', 'fresh', 'belly', 'raw']
['pork', 'fresh', 'separable', 'fat', 'raw']
['pork', 'fresh', 'leg', 'ham', 'rump', 'half', 'separable', 'lean', 'fat', 'raw']
['pork', 'fresh', 'leg', 'ham', 'rump', 'half', 'separable', 'lean', 'raw', 'include', 'food', 'usda', 'food', 'distribution', 'program']
['pork', 'fresh', 'loin', 'separable', 'lean', 'fat', 'raw']
['pork', 'fresh', 'loin', 'blade', 'chop', 'roast', 'bone', 'sepa

In [13]:
# Report how many foods contain the token 'raw'.
print(f'raw food number: {df_raw.shape[0]}')

raw food number: 972
