In [10]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt  
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import pickle
from nltk.corpus import stopwords
stopwordEn = stopwords.words('english')

def drawTree(tagged):
    grammar = r"""
      NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
      PP: {<IN><NP>}               # Chunk prepositions followed by NP
      VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
      CLAUSE: {<NP><VP>}           # Chunk NP, VP
      """
    cp = nltk.RegexpParser(grammar)
    chunked = cp.parse(tagged)
    chunked.draw() 

with open('df_hc_res.pickle', 'rb') as handle:
    df_hc_res = pickle.load(handle)
with open('df_hc_manu.pickle', 'rb') as handle:
    df_hc_manu = pickle.load(handle)
with open('cat_hc.pickle', 'rb') as handle:
    cat_hc = pickle.load(handle)

# Find HCs having identical nutrients 

In [2]:
df_hc_manu.head()

Unnamed: 0,hc,nutrient,nutrient_num
0,bones: vitamin d supports normal bones.,[vitamin d],1
1,bones: vitamin d supports the maintenance of n...,[vitamin d],1
2,brain: dha supports normal brain function.,[],0
3,calcium & vitamin d support normal bones.,"[calcium, vitamin d]",2
4,dha supports normal brain function.,[],0


In [3]:
hc = df_hc_manu.iloc[0,0]
hc

'bones: vitamin d supports normal bones.'

In [4]:
hc_tag = [i[1] for i in nltk.pos_tag(word_tokenize(hc))]
hc_tag

['NNS', ':', 'NN', 'NN', 'VBZ', 'JJ', 'NNS', '.']

In [5]:
neu = cat_hc.loc[df_hc_manu.iloc[0,1]]
neu

Unnamed: 0_level_0,approved_hc_ID,approved_count,manu_hc_ID,manu_count
nutrient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
vitamin d,"[197, 198, 199, 200, 201, 202, 203, 238, 259, ...",10,"[0, 1, 3, 12, 13, 14, 19, 20, 21, 22, 24, 25, ...",483


In [6]:
neu.iloc[0,0]

[197, 198, 199, 200, 201, 202, 203, 238, 259, 260]

In [7]:
df_hc_res.iloc[neu.iloc[0,0]]

Unnamed: 0,hc_ID,hc,nutrient,condition,keywords,pos,hc_tagged
197,197,vitamin d contributes to normal absorption/uti...,vitamin d,The claim may be used only for food which is a...,"vitamin D, absoption, utilisation, utilization...","[NN, NN, VBZ, TO, JJ, NN, IN, NN, CC, NN, .]","[(vitamin, NN), (d, NN), (contributes, VBZ), (..."
198,198,vitamin d contributes to normal blood calcium ...,vitamin d,The claim may be used only for food which is a...,"vitamin D, blood, calcium","[NN, NN, VBZ, TO, JJ, NN, NN, NNS, .]","[(vitamin, NN), (d, NN), (contributes, VBZ), (..."
199,199,vitamin d contributes to the maintenance of no...,vitamin d,The claim may be used only for food which is a...,"vitamin D, bones","[NN, NN, VBZ, TO, DT, NN, IN, JJ, NNS, .]","[(vitamin, NN), (d, NN), (contributes, VBZ), (..."
200,200,vitamin d contributes to the maintenance of no...,vitamin d,The claim may be used only for food which is a...,"vitamin D, muscle","[NN, NN, VBZ, TO, DT, NN, IN, JJ, NN, NN, .]","[(vitamin, NN), (d, NN), (contributes, VBZ), (..."
201,201,vitamin d contributes to the maintenance of no...,vitamin d,The claim may be used only for food which is a...,"vitamin D, teeth","[NN, NN, VBZ, TO, DT, NN, IN, JJ, NNS, .]","[(vitamin, NN), (d, NN), (contributes, VBZ), (..."
202,202,vitamin d contributes to the normal function o...,vitamin d,The claim may be used only for food which is a...,"vitamin D, immune system","[NN, NN, VBZ, TO, DT, JJ, NN, IN, DT, NN, NN, .]","[(vitamin, NN), (d, NN), (contributes, VBZ), (..."
203,203,vitamin d has a role in the process of cell di...,vitamin d,The claim may be used only for food which is a...,"vitamin D, cell division","[NN, NN, VBZ, DT, NN, IN, DT, NN, IN, NN, NN, .]","[(vitamin, NN), (d, NN), (has, VBZ), (a, DT), ..."
238,238,vitamin d helps to reduce the risk of falling ...,vitamin d,The claim may be used only for food supplement...,,"[NN, NN, VBZ, TO, VB, DT, NN, IN, VBG, VBN, IN...","[(vitamin, NN), (d, NN), (helps, VBZ), (to, TO..."
259,259,vitamin d contributes to the normal function o...,vitamin d,The claim may be used only for food which is a...,,"[NN, NN, VBZ, TO, DT, JJ, NN, IN, DT, NN, NN, ...","[(vitamin, NN), (d, NN), (contributes, VBZ), (..."
260,260,vitamin d is needed for normal growth and deve...,vitamin d,The claim can be used only for food which is a...,,"[NN, NN, VBZ, VBN, IN, JJ, NN, CC, NN, IN, NN,...","[(vitamin, NN), (d, NN), (is, VBZ), (needed, V..."


In [8]:
print(hc)
print(df_hc_res['hc'][199])
print(df_hc_res['hc'][201])
print(df_hc_res['hc'][197])
print(df_hc_res['hc'][252])

bones: vitamin d supports normal bones.
vitamin d contributes to the maintenance of normal bones.
vitamin d contributes to the maintenance of normal teeth.
vitamin d contributes to normal absorption/utilisation of calcium and phosphorus.
docosahexaenoic acid (dha) maternal intake contributes to the normal brain development of the foetus and breastfed infants..


# Calculate Words Similarity

In [20]:
# the fraction of the same words out of the whole words of two compared hc
from nltk.corpus import wordnet
def get_lemma(word):
    lemma = wordnet.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

def getTokens(hc):
    tokens = word_tokenize(hc)
    tokens = [get_lemma(w).lower() for w in tokens if w not in stopwordEn and w.isalpha()]
    return tokens

def wordSim(hc, ahc): # input raw health claims text
    hc, ahc =  getTokens(hc), getTokens(ahc)
    hit = 0
    for w in hc: # 不忽略hc中的重复
        for aw in ahc:
            if w==aw:
                hit += 1
                break #不用break，max sim != 1，会加强重复的word, 用了break，忽略ahc中重复的word
#     return hit,len(hc),len(ahc)
    return hit,hit/((len(hc)+len(ahc))/2)

wordSim(hc,hc),wordSim(hc, df_hc_res['hc'][199]),wordSim(hc, df_hc_res['hc'][201]),wordSim(hc, df_hc_res['hc'][197]),wordSim(hc, df_hc_res['hc'][252]),

((5, 1.0), (4, 0.8), (2, 0.4), (2, 0.4), (1, 0.125))

# Calculate Pos-tags Similarity 

In [21]:
print(hc_tag)
print(df_hc_res['pos'][199])
print(df_hc_res['pos'][201])
print(df_hc_res['pos'][197])
print(df_hc_res['pos'][252])

['NNS', ':', 'NN', 'NN', 'VBZ', 'JJ', 'NNS', '.']
['NN', 'NN', 'VBZ', 'TO', 'DT', 'NN', 'IN', 'JJ', 'NNS', '.']
['NN', 'NN', 'VBZ', 'TO', 'DT', 'NN', 'IN', 'JJ', 'NNS', '.']
['NN', 'NN', 'VBZ', 'TO', 'JJ', 'NN', 'IN', 'NN', 'CC', 'NN', '.']
['NN', 'NN', '(', 'NN', ')', 'JJ', 'NN', 'NNS', 'TO', 'DT', 'JJ', 'NN', 'NN', 'IN', 'DT', 'NN', 'CC', 'NN', 'NN']


In [22]:
#transform raw hc into postagging(for tree) or pos-tags-sequence (for similarity)
def getPos(hc, seq=True):
    postagging = nltk.pos_tag(word_tokenize(hc))
    hc_tag = [i[1] for i in postagging] if seq else [i for i in postagging]
    return hc_tag

In [23]:
# the number of the same two-gram pos tagging 
def posSim(hc_tag, ahc_tag): #input pos tag sequence
    # 1-gram
    hit = 0
    for t in hc_tag:
        for at in ahc_tag:
            if t==at:
                hit += 1
                break
    return hit, hit/((len(hc_tag)+len(ahc_tag))/2)

posSim(hc_tag,hc_tag),posSim(hc_tag, df_hc_res['pos'][199]),posSim(hc_tag, df_hc_res['pos'][201]),posSim(hc_tag, df_hc_res['pos'][197]),posSim(hc_tag, df_hc_res['pos'][252])

((8, 1.0),
 (7, 0.7777777777777778),
 (7, 0.7777777777777778),
 (5, 0.5263157894736842),
 (5, 0.37037037037037035))

In [24]:
# the number of the same two-gram pos tagging 
def pos2Sim(hc_tag, ahc_tag): #input pos tag sequence
    # 2-gram
    hit = 0
    dic_at2={}
    for i in range(len(hc_tag)-1):
        t2 = (hc_tag[i], hc_tag[i+1])
        for p in range(len(ahc_tag)-1):
            at2 = (ahc_tag[p], ahc_tag[p+1])
            # del duplicate t2 in ahc
            if t2 == at2 and at2 not in dic_at2:
                dic_at2[at2] = True
                hit += 1
                break
    return hit, hit/((len(hc_tag)+len(ahc_tag)-2)/2)
pos2Sim(hc_tag,hc_tag),pos2Sim(hc_tag, df_hc_res['pos'][199]),pos2Sim(hc_tag, df_hc_res['pos'][201]),pos2Sim(hc_tag, df_hc_res['pos'][197]),pos2Sim(hc_tag, df_hc_res['pos'][252])

((7, 1.0), (4, 0.5), (4, 0.5), (2, 0.23529411764705882), (1, 0.08))

In [25]:
def pos2Sim_hc(hc, ahc): #input hc raw text
    hc_tag, ahc_tag = getPos(hc), getPos(ahc)
    # 2-gram
    hit = 0
    dic_at2={}
    for i in range(len(hc_tag)-1):
        t2 = (hc_tag[i], hc_tag[i+1])
        for p in range(len(ahc_tag)-1):
            at2 = (ahc_tag[p], ahc_tag[p+1])
            # del duplicate t2 in ahc
            if t2 == at2 and at2 not in dic_at2:
                dic_at2[at2] = True
                hit += 1
                break
    return hit, hit/((len(hc_tag)+len(ahc_tag)-2)/2)
pos2Sim(hc,hc),pos2Sim(hc, df_hc_res['hc'][199]),pos2Sim(hc, df_hc_res['hc'][201]),pos2Sim(hc, df_hc_res['hc'][197]),pos2Sim(hc, df_hc_res['hc'][252]),

((33, 0.868421052631579),
 (23, 0.48936170212765956),
 (19, 0.40425531914893614),
 (20, 0.3389830508474576),
 (18, 0.2236024844720497))

In [26]:
def avgSim_hc(hc, ahc): #不适用df_hc_res中的pos列，直接从hc文本计算
    return np.mean([wordSim(hc, ahc)[1],pos2Sim_hc(hc, ahc)[1]])

avgSim_hc('calcium & vitamin d support normal bones.','vitamin d is needed for normal growth and development of bone in children.')

0.3666666666666667

# Calculate Average Similarity

In [27]:
df_hc_res.head()

Unnamed: 0,hc_ID,hc,nutrient,condition,keywords,pos,hc_tagged
0,0,ala contributes to the maintenance of normal b...,alpha-linolenic acid (ala),The claim may be used only for food which is a...,"ALA, alpha-linolenic acid, omega 3, blood, cho...","[NN, NNS, TO, DT, NN, IN, JJ, NN, NN, NNS, .]","[(ala, NN), (contributes, NNS), (to, TO), (the..."
1,1,activated charcoal contributes to reducing exc...,activated charcoal,,"charcoal, flatulence","[VBN, NN, NNS, TO, VBG, JJ, NN, IN, VBG, .]","[(activated, VBN), (charcoal, NN), (contribute..."
2,2,barley grain fibre contributes to an increase ...,barley grain fibre,The claim may be used only for food which is h...,"barley, grain, fibre, fiber, faecal bulk, faeces","[NN, NN, NN, VBZ, TO, DT, NN, IN, JJ, NN, .]","[(barley, NN), (grain, NN), (fibre, NN), (cont..."
3,3,beta-glucans contribute to the maintenance of ...,beta-glucans,The claim may be used only for food which cont...,"beta-glucans, blood, cholesterol","[NNS, NN, TO, DT, NN, IN, JJ, NN, NN, NNS, .]","[(beta-glucans, NNS), (contribute, NN), (to, T..."
4,4,betaine contributes to normal homocysteine met...,betaine,The claim may be used only for food which cont...,"betaine, homocysteine, metabolism","[NN, VBZ, TO, JJ, JJ, NN, .]","[(betaine, NN), (contributes, VBZ), (to, TO), ..."


In [28]:
def avgSim(hc_ID):  #find hc having same nutrient, calculate avg sim, using pos tags in the df
    res_wordSim, res_pos2Sim = [],[]
    hc = df_hc_manu.iloc[hc_ID,0]
    nutrients = cat_hc.loc[df_hc_manu.iloc[hc_ID,1]]
        
    if len(nutrients>0):
        hc_tag = [i[1] for i in nltk.pos_tag(word_tokenize(hc))]
        candidates_ahc = {}
        for i in nutrients['approved_hc_ID']:
            for e in i:
                candidates_ahc[e] = True #通过dict快速去重ID
        for ahc in candidates_ahc.keys():
            ahc_hc = df_hc_res['hc'][ahc]
            ahc_tag = df_hc_res['pos'][ahc] 
            res_wordSim.append(wordSim(hc, ahc_hc)[1])
            res_pos2Sim.append(pos2Sim(hc_tag, ahc_tag)[1])
        res = pd.DataFrame({'wordSim':res_wordSim,'pos2Sim':res_pos2Sim,'avgSim':None}, index=candidates_ahc)
        res['avgSim'] = res.iloc[:,:2].mean(axis=1)
        return res.sort_values(by=['avgSim'], ascending=False)
    else:
        print('Nutrient Not Found!')
        return False

In [29]:
df_hc_manu[df_hc_manu['nutrient'].str.len()>0].head()

Unnamed: 0,hc,nutrient,nutrient_num
0,bones: vitamin d supports normal bones.,[vitamin d],1
1,bones: vitamin d supports the maintenance of n...,[vitamin d],1
3,calcium & vitamin d support normal bones.,"[calcium, vitamin d]",2
12,immunity: vitamin d contributes to the normal ...,[vitamin d],1
13,muscles: vitamin d contributes to the maintena...,[vitamin d],1


In [30]:
avgSim(4193).head(5)

Unnamed: 0,wordSim,pos2Sim,avgSim
97,0.857143,0.64,0.748571
228,0.857143,0.64,0.748571
147,0.857143,0.64,0.748571
222,0.307692,0.521739,0.414716
143,0.307692,0.521739,0.414716


In [31]:
QUERY_ID = 4193
res = avgSim(QUERY_ID)
print('Query:',df_hc_manu['hc'][QUERY_ID])
for i in res.index[:5]:
    print(i, df_hc_res['hc'][i])

Query: zinc, manganese and selenium which contribute to the protection of cells from oxidative stress.
97 manganese contributes to the protection of cells from oxidative stress.
228 zinc contributes to the protection of cells from oxidative stress.
147 selenium contributes to the protection of cells from oxidative stress.
222 zinc contributes to the maintenance of normal hair.
143 selenium contributes to the maintenance of normal hair.


In [32]:
print('Query:',df_hc_manu['hc'][4192])

Query: zinc, iron, vitamins a & c   key minerals and vitamins that participate in the good functioning of your baby's immune system.


In [151]:
cat_hc[cat_hc.index.str.contains('vitamin ')]

Unnamed: 0_level_0,approved_hc_ID,approved_count,manu_hc_ID,manu_count
nutrient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
calcium and vitamin d,"[236, 249]",2,"[331, 332, 333, 334, 335, 336, 337, 338, 339, ...",52
riboflavin (vitamin b2),"[132, 133, 134, 135, 136, 137, 138, 139, 140]",9,[],0
vitamin a,"[158, 159, 160, 161, 162, 163]",6,"[23, 102, 103, 118, 601, 602, 690, 848, 1159, ...",141
vitamin b12,"[164, 165, 166, 167, 168, 169, 170, 171]",8,"[40, 50, 51, 109, 118, 529, 587, 588, 589, 769...",129
vitamin b6,"[172, 173, 174, 175, 176, 177, 178, 179, 180, ...",10,"[18, 39, 41, 44, 49, 54, 118, 195, 254, 255, 2...",174
vitamin c,"[182, 183, 184, 185, 186, 187, 188, 189, 190, ...",15,"[54, 92, 101, 104, 105, 106, 111, 118, 136, 14...",513
vitamin d,"[197, 198, 199, 200, 201, 202, 203, 238, 259, ...",10,"[0, 1, 3, 12, 13, 14, 19, 20, 21, 22, 24, 25, ...",483
vitamin e,[204],1,"[72, 78, 107, 134, 138, 200, 599, 644, 645, 66...",112
vitamin k,"[205, 206]",2,"[79, 113, 367, 556, 628, 646, 647, 993, 1224, ...",19


* 多个nutrient同时存在时如何对应，nutrient DB不足无法很好mapping customer hc

In [115]:
df_hc_manu.iloc[4172,:]

hc              zinc to contribute to protein synthesis.
nutrient                                 [protein, zinc]
nutrient_num                                           2
Name: 4172, dtype: object

In [466]:
drawTree(df_hc_res['hc_tagged'][259])

In [394]:
drawTree(df_hc_res['hc_tagged'][202])

In [467]:
tagged=[i for i in nltk.pos_tag(word_tokenize(df_hc_manu.iloc[12,0]))]
drawTree(tagged)

In [110]:
res = avgSim(12)
print('Query:',df_hc_manu['hc'][12])
for i in res.index:
    print(i, df_hc_res['hc'][i])

Query: immunity: vitamin d contributes to the normal function of the immune system.
202 vitamin d contributes to the normal function of the immune system.
259 vitamin d contributes to the normal function of the immune system in children..
200 vitamin d contributes to the maintenance of normal muscle function.
199 vitamin d contributes to the maintenance of normal bones.
201 vitamin d contributes to the maintenance of normal teeth.
197 vitamin d contributes to normal absorption/utilisation of calcium and phosphorus.
203 vitamin d has a role in the process of cell division.
198 vitamin d contributes to normal blood calcium levels.
260 vitamin d is needed for normal growth and development of bone in children..
238 vitamin d helps to reduce the risk of falling associated with postural instability and muscle weakness. falling is a risk factor for bone fractures among men and women 60 years of age and older..


In [420]:
print([i[1] for i in nltk.pos_tag(word_tokenize(df_hc_manu.iloc[12,0]))])

['NN', ':', 'NN', 'NN', 'VBZ', 'TO', 'DT', 'JJ', 'NN', 'IN', 'DT', 'NN', 'NN', '.']


In [469]:
df_hc_res['pos'][[203,198]]

203    [NN, NN, VBZ, DT, NN, IN, DT, NN, IN, NN, NN, .]
198               [NN, NN, VBZ, TO, JJ, NN, NN, NNS, .]
Name: pos, dtype: object

In [479]:
pos2Sim([i[1] for i in nltk.pos_tag(word_tokenize(df_hc_manu.iloc[12,0]))], df_hc_res['pos'][198])

(4, 0.38095238095238093)

In [480]:
pos2Sim([i[1] for i in nltk.pos_tag(word_tokenize(df_hc_manu.iloc[12,0]))], df_hc_res['pos'][203])

(6, 0.5)

In [481]:
posSim([i[1] for i in nltk.pos_tag(word_tokenize(df_hc_manu.iloc[12,0]))], df_hc_res['pos'][198])

(10, 0.8695652173913043)

In [482]:
posSim([i[1] for i in nltk.pos_tag(word_tokenize(df_hc_manu.iloc[12,0]))], df_hc_res['pos'][203])

(11, 0.8461538461538461)