### Title: 01_text_matching
### Purpose: Match FNDDS ingredient descriptions to Food Tree taxa
### Date: March 14, 2024
### Author: Jules Larke

In [2]:
import pandas as pd
import string
import re
import nltk
nltk.data.path.append('/Users/jules.larke/opt/anaconda3/nltk_data')
wn = nltk.WordNetLemmatizer()

In [3]:
from polyfuzz.models import TFIDF
from polyfuzz import PolyFuzz

In [4]:
taxon = pd.read_csv('../../data/01/NodeLabelsMCT_edit.txt', sep='\t')

In [5]:
taxon['Main.food.description'] = taxon['Main.food.description'].str.replace('_', ' ')

In [6]:
ingredients = pd.read_csv('../../data/01/fndds_all_ingredient_nutrient_values.csv', usecols=['Ingredient code', 'Ingredient description'])

In [7]:
punct = string.punctuation[0:11] + string.punctuation[13:] # remove '-' from the list of punctuation.
stopwords = ['','and', 'to', 'not', 'no', 'in', 'with', 'or', 'only', 'cooking', 'as', 'food', 'distribution', 'form', 'a', 'd', 'ns', 'nfs']

def clean_text(text):
    text = "".join([word for word in text if word not in punct])
    tokens = re.split('[-\W+]', text)
    text = [word for word in tokens if word not in stopwords]
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return "default" if text is [] else ' '.join(set(text))

taxon['taxon_clean'] = taxon['Main.food.description'].apply(lambda x: clean_text(x.lower()))
ingredients['ingredient_clean'] = ingredients['Ingredient description'].apply(lambda x: clean_text(x.lower()))

In [8]:
taxon_tokens = taxon['taxon_clean'].to_list()
ingredient_tokens = ingredients['ingredient_clean'].to_list()

In [9]:
tfidf = TFIDF(n_gram_range=(1, 3))
model = PolyFuzz(tfidf).match(ingredient_tokens, taxon_tokens)

In [10]:
match = model.get_matches()
match.rename(columns={'From':'ingredient_clean', 'To':'taxon_clean'},inplace=True)

In [11]:
matched_1 = match.merge(taxon, on='taxon_clean', how='left')
matched_1.drop_duplicates(subset='ingredient_clean')
matched_1

Unnamed: 0,ingredient_clean,taxon_clean,Similarity,Level.code,Main.food.description
0,butter salted,butter,0.755,81101,Butter
1,butter whipped salt,butter whipped,0.931,81107,Butter whipped
2,butter oil anhydrous,butter,0.472,81101,Butter
3,cheese blue,cheese,0.694,14,Cheeses
4,cheese brick,brick,0.731,14102,Brick
...,...,...,...,...,...
3220,pie apple canned filling,pie apple,0.651,53301,Pie apple
3221,lunch whole chicken breaded school patty grain,stick whole bread wheat,0.446,51306,Bread stick whole wheat
3222,lean separable beef all cooked fat grade stew ...,cereal cooked grain,0.417,576,Cereal grains not cooked
3223,added salt bean baked canned,bean baked,0.678,41201,Baked beans


In [12]:
matched_2 = ingredients.merge(matched_1, on='ingredient_clean', how='left').drop_duplicates(subset='Ingredient description')

In [13]:
matched_2.rename(columns={'Main.food.description':'taxon.food.description'},inplace=True)

In [14]:
matched_2.to_csv('../../data/01/text_matching_052223.csv', index=None)

## text_matching_052223.csv will undergo manual curation to determine appropriateness of matches and edits for non-appropriate matches 
## Output file following manual matching: updated_taxonomy_080323.csv