This notebook combines different attributes of approved health claims (HC) from multiple data sources.

In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
# Directory holding the source spreadsheets. Set the HC_DATASET_DIR environment
# variable to point somewhere else without editing the notebook; the original
# local path is kept as the default.
PATH = os.environ.get("HC_DATASET_DIR", "F:/colla/Documents/work/实习/RA/dataset")

# Combine Approved HC Data

In [None]:
# Load the approved health claims; the sheet has no header row, so the single
# column is named 'hc' explicitly.
FILE = "EFSA_Approved_Health_Claims.xlsx"
df_hc = pd.read_excel(f'{PATH}/{FILE}', header=None)
df_hc.columns = ['hc']
# Remove the stray closing paragraph tag and lower-case every claim.
df_hc['hc'] = df_hc['hc'].map(lambda claim: claim.replace('</p>', '').lower())
df_hc.head()

Unnamed: 0,hc
0,ala contributes to the maintenance of normal b...
1,activated charcoal contributes to reducing exc...
2,barley grain fibre contributes to an increase ...
3,beta-glucans contribute to the maintenance of ...
4,betaine contributes to normal homocysteine met...


In [None]:
# Sanity check: (rows, columns) of the approved-claims frame.
df_hc.shape

(261, 1)

In [None]:
FILE = "EU Health Claims - EN, DE, PL, FR.xlsx"
# Data-quality caveat: this file has more NaN values than the EU Register file.
df_hc_cat = pd.read_excel('{}/{}'.format(PATH, FILE))
# Skip the first data row and keep only the first two columns. The .copy()
# turns the slice into an independent frame so the column assignment below
# does not raise SettingWithCopyWarning.
df_hc_cat = df_hc_cat.iloc[1:, :2].copy()
df_hc_cat.columns = ['nutrient', 'hc']
# Lower-case each claim and append a full stop.
df_hc_cat['hc'] = [e.lower() + '.' for e in df_hc_cat['hc']]
df_hc_cat.head()

Unnamed: 0,nutrient,hc
1,Activated charcoal,activated charcoal contributes to reducing exc...
2,Alpha-cyclodextrin,consumption of alpha-cyclodextrin as part of a...
3,Alpha-linolenic acid & linoleic acid,essential fatty acids are needed for normal gr...
4,Alpha-linolenic acid (ALA),ala contributes to the maintenance of normal b...
5,Arabinoxylan produced from wheat endosperm,consumption of arabinoxylan as part of a meal ...


In [None]:
# Sanity check: (rows, columns) of the nutrient/claim frame.
df_hc_cat.shape

(260, 2)

In [None]:
# Load the EU Register sheet: nutrient, claim text and conditions of use.
FILE = "EU Register on Nutrition and Health Claims.xlsx"
df_hc_cond = pd.read_excel(f'{PATH}/{FILE}')
df_hc_cond.columns = ['nutrient', 'hc', 'condition']
# Normalise the claim text (trim, lower-case, trailing full stop) so it can be
# used as a join key, and lower-case the nutrient names.
df_hc_cond['hc'] = df_hc_cond['hc'].map(lambda claim: claim.strip().lower() + '.')
df_hc_cond['nutrient'] = df_hc_cond['nutrient'].str.lower()
df_hc_cond.head()

Unnamed: 0,nutrient,hc,condition
0,"a combination of l-threonine, l-valine, l-leuc...",contributes to the reduction of the blood gluc...,
1,activated charcoal,activated charcoal contributes to reducing exc...,The claim may be used only for food which cont...
2,alpha-cyclodextrin,consumption of alpha-cyclodextrin as part of a...,The claim may be used for food which contains ...
3,alpha-linolenic acid (ala),ala contributes to the maintenance of normal b...,The claim may be used only for food which is a...
4,alpha-linolenic acid (ala) & linoleic acid (la...,essential fatty acids are needed for normal gr...,Information to the consumer that the beneficia...


In [None]:
# Left join on the claim text: every approved claim is kept, and nutrient /
# condition are filled in where the register has an identical claim string.
df_hc_res = df_hc.merge(df_hc_cond, on='hc', how='left')
df_hc_res.head()

Unnamed: 0,hc,nutrient,condition
0,ala contributes to the maintenance of normal b...,alpha-linolenic acid (ala),The claim may be used only for food which is a...
1,activated charcoal contributes to reducing exc...,,
2,barley grain fibre contributes to an increase ...,barley grain fibre,The claim may be used only for food which is h...
3,beta-glucans contribute to the maintenance of ...,beta-glucans,The claim may be used only for food which cont...
4,betaine contributes to normal homocysteine met...,betaine,The claim may be used only for food which cont...


In [None]:
# Claims that found no match in the register (nutrient is NaN after the join).
df_hc_res[df_hc_res['nutrient'].isna()]

Unnamed: 0,hc,nutrient,condition
1,activated charcoal contributes to reducing exc...,,
35,consumption of foods/drinks containing &lt;nam...,,
36,in the case of d-tagatose and isomaltulose thi...,,
37,consumption of foods/drinks containing xx of s...,,
231,consumption of foods/drinks containing xx of a...,,
232,consumption of foods/drinks containing xx of a...,,
235,sugar beet fibre contributes to an&nbsp;increa...,,


In [None]:
FILE = "EFSA Claims with Keywords.xlsx"
df_hc_key = pd.read_excel('{}/{}'.format(PATH, FILE))
# Skip the first data row; keep the claim text (column 0) and the column that
# is renamed to 'keywords' below (column 11). The .copy() makes the slice an
# independent frame so the column assignment below does not raise
# SettingWithCopyWarning.
df_hc_key = df_hc_key.iloc[1:, [0, 11]].copy()
df_hc_key.columns = ['hc', 'keywords']
# Lower-case the claim text so it can be used as a join key.
df_hc_key['hc'] = [e.lower() for e in df_hc_key['hc']]
df_hc_key.head()

Unnamed: 0,hc,keywords
1,activated charcoal contributes to reducing exc...,"charcoal, flatulence"
2,ala contributes to the maintenance of normal b...,"ALA, alpha-linolenic acid, omega 3, blood, cho..."
3,barley beta-glucans has been shown to lower/re...,"barley, beta-glucans, blood, cholesterol, hear..."
4,barley grain fibre contributes to an increase ...,"barley, grain, fibre, fiber, faecal bulk, faeces"
5,beta-glucans contribute to the maintenance of ...,"beta-glucans, blood, cholesterol"


In [None]:
# Sanity check: (rows, columns) of the claim/keywords frame.
df_hc_key.shape

(260, 2)

In [None]:
# Attach the keyword list to every approved claim (left join keeps all claims).
# A 'keywords' column left over from a previous run is dropped first so that
# re-running this cell does not produce keywords_x / keywords_y.
df_hc_res1 = pd.merge(
    df_hc_res.drop(columns='keywords', errors='ignore'),
    df_hc_key,
    how='left',
    on='hc',
)
# Later cells read df_hc_res['keywords'], so the merged result must also be
# bound to df_hc_res; previously only df_hc_res1 carried the column and a
# fresh Restart & Run All failed with a KeyError. A copy keeps df_hc_res1
# unaffected by later in-place edits of df_hc_res.
df_hc_res = df_hc_res1.copy()

In [None]:
# Claims still without a nutrient after the keyword merge.
df_hc_res1[df_hc_res1['nutrient'].isna()]

Unnamed: 0,hc,nutrient,condition,keywords
1,activated charcoal contributes to reducing exc...,,,"charcoal, flatulence"
35,consumption of foods/drinks containing &lt;nam...,,,
36,in the case of d-tagatose and isomaltulose thi...,,,
37,consumption of foods/drinks containing xx of s...,,,
231,consumption of foods/drinks containing xx of a...,,,
232,consumption of foods/drinks containing xx of a...,,,
235,sugar beet fibre contributes to an&nbsp;increa...,,,


In [None]:
# Print the full text of every claim whose nutrient is still NaN,
# each followed by a blank line.
nutrient_missing = df_hc_res['nutrient'].isna()
for claim in df_hc_res.loc[nutrient_missing, 'hc']:
    print(claim, end='\n\n')

consumption of foods/drinks containing xx of all used non-digestible carbohydrates; instead of sugars induces a lower blood glucose rise after their consumption compared to sugar-containing foods/drinks.

consumption of foods/drinks containing xx of all used non-fermentable carbohydrates; instead of fermentable carbohydrates contributes to the maintenance of tooth mineralisation.

sugar beet fibre contributes to an&nbsp;increase in&nbsp;faecal bulk.



In [None]:
# Manually populate the nutrient for the few claims that found no match.
# Each entry is both the text searched for in the claim and the nutrient label
# written to matching rows; entries are applied in order, so a later entry
# overrides an earlier one if a claim contains both.
MANUAL_NUTRIENTS = [
    'activated charcoal',
    'd-tagatose and isomaltulose',
    'sugar replacer',
    'non-digestible carbohydrates',
    'non-fermentable carbohydrates',
    'sugar beet fibre',
]
for nutrient_label in MANUAL_NUTRIENTS:
    claim_matches = df_hc_res['hc'].str.contains(nutrient_label)
    df_hc_res.loc[claim_matches, 'nutrient'] = nutrient_label

In [None]:
# Claims for which the keywords file had no matching claim text.
df_hc_res[df_hc_res['keywords'].isna()]

Unnamed: 0,hc_ID,hc,nutrient,condition,keywords,pos,hc_tagged
35,35,consumption of foods/drinks containing &lt;nam...,sugar replacer,,,"[NN, IN, NNS, VBG, CC, NN, :, NN, IN, NN, NN, ...","[(consumption, NN), (of, IN), (foods/drinks, N..."
36,36,in the case of d-tagatose and isomaltulose thi...,d-tagatose and isomaltulose,,,"[IN, DT, NN, IN, JJ, CC, VB, DT, MD, VB, ``, J...","[(in, IN), (the, DT), (case, NN), (of, IN), (d..."
37,37,consumption of foods/drinks containing xx of s...,sugar replacer,,,"[NN, IN, NNS, VBG, NN, IN, NN, NN, :, RB, IN, ...","[(consumption, NN), (of, IN), (foods/drinks, N..."
129,129,replacing digestible starches with resistant s...,resistant starch,The claim may be used only for food in which d...,,"[VBG, JJ, NNS, IN, JJ, NN, IN, DT, NN, VBZ, TO...","[(replacing, VBG), (digestible, JJ), (starches..."
130,130,replacing saturated fats in the diet with unsa...,oleic acid,The claim may be used only for food which is h...,,"[VBG, VBN, NNS, IN, DT, NN, IN, JJ, NNS, VBZ, ...","[(replacing, VBG), (saturated, VBN), (fats, NN..."
231,231,consumption of foods/drinks containing xx of a...,non-digestible carbohydrates,,,"[NN, IN, NNS, VBG, NN, IN, DT, VBN, JJ, NNS, :...","[(consumption, NN), (of, IN), (foods/drinks, N..."
232,232,consumption of foods/drinks containing xx of a...,non-fermentable carbohydrates,,,"[NN, IN, NNS, VBG, NN, IN, DT, VBN, JJ, NNS, :...","[(consumption, NN), (of, IN), (foods/drinks, N..."
233,233,daily creatine consumption can enhance the eff...,creatine,Information shall be provided to the consumer ...,,"[JJ, NN, NN, MD, VB, DT, NN, IN, NN, NN, IN, N...","[(daily, JJ), (creatine, NN), (consumption, NN..."
235,235,sugar beet fibre contributes to an&nbsp;increa...,sugar beet fibre,,,"[NN, NN, NN, VBZ, TO, DT, CC, NN, :, NN, IN, C...","[(sugar, NN), (beet, NN), (fibre, NN), (contri..."
238,238,vitamin d helps to reduce the risk of falling ...,vitamin d,The claim may be used only for food supplement...,,"[NN, NN, VBZ, TO, VB, DT, NN, IN, VBG, VBN, IN...","[(vitamin, NN), (d, NN), (helps, VBZ), (to, TO..."


In [None]:
# Add the row index as an explicit 'hc_ID' column in first position.
# Dropping any existing 'hc_ID' first keeps the cell idempotent: the previous
# version duplicated 'hc_ID' and lost the last column when run a second time.
# drop() returns a new frame, so the insert below does not touch the old one.
df_hc_res = df_hc_res.drop(columns='hc_ID', errors='ignore')
df_hc_res.insert(0, 'hc_ID', df_hc_res.index)

# Adding POS tagging

In [None]:
# Tokenise and POS-tag every claim once, then derive both columns from that
# single pass (the tagger was previously run twice per claim).
tagged_claims = [nltk.pos_tag(word_tokenize(claim)) for claim in df_hc_res['hc']]
# assign() returns a new frame and replaces the columns if they already exist,
# so the cell can be re-run safely.
df_hc_res = df_hc_res.assign(
    pos=[[tag for _, tag in tagged] for tagged in tagged_claims],  # tags only
    hc_tagged=tagged_claims,                                       # (token, tag) pairs
)
df_hc_res.head()
df_hc_res.head()

Unnamed: 0,hc_ID,hc,nutrient,condition,keywords,pos,hc_tagged
0,0,ala contributes to the maintenance of normal b...,alpha-linolenic acid (ala),The claim may be used only for food which is a...,"ALA, alpha-linolenic acid, omega 3, blood, cho...","[NN, NNS, TO, DT, NN, IN, JJ, NN, NN, NNS, .]","[(ala, NN), (contributes, NNS), (to, TO), (the..."
1,1,activated charcoal contributes to reducing exc...,,,"charcoal, flatulence","[VBN, NN, NNS, TO, VBG, JJ, NN, IN, VBG, .]","[(activated, VBN), (charcoal, NN), (contribute..."
2,2,barley grain fibre contributes to an increase ...,barley grain fibre,The claim may be used only for food which is h...,"barley, grain, fibre, fiber, faecal bulk, faeces","[NN, NN, NN, VBZ, TO, DT, NN, IN, JJ, NN, .]","[(barley, NN), (grain, NN), (fibre, NN), (cont..."
3,3,beta-glucans contribute to the maintenance of ...,beta-glucans,The claim may be used only for food which cont...,"beta-glucans, blood, cholesterol","[NNS, NN, TO, DT, NN, IN, JJ, NN, NN, NNS, .]","[(beta-glucans, NNS), (contribute, NN), (to, T..."
4,4,betaine contributes to normal homocysteine met...,betaine,The claim may be used only for food which cont...,"betaine, homocysteine, metabolism","[NN, VBZ, TO, JJ, JJ, NN, .]","[(betaine, NN), (contributes, VBZ), (to, TO), ..."


In [None]:
def drawTree(tagged):
    """Chunk a tagged sentence with a regexp grammar and draw the result.

    Args:
        tagged: the tagged sentence handed to ``RegexpParser.parse``;
            presumably a list of (token, POS tag) pairs such as one entry
            of the ``hc_tagged`` column -- confirm against callers.

    Returns:
        None. The parsed result is only displayed via its ``draw()`` method.
    """
    grammar = r"""
      NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
      PP: {<IN><NP>}               # Chunk prepositions followed by NP
      VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
      CLAUSE: {<NP><VP>}           # Chunk NP, VP
      """
    # Build the chunker, parse the sentence and render the tree in one chain.
    nltk.RegexpParser(grammar).parse(tagged).draw()

# Export Combined Approved HC Data

In [None]:
# On-disk cache of the combined frame.
# NOTE: unpickling can execute arbitrary code -- only load a pickle file that
# was produced by this notebook.
PICKLE_FILE = 'df_hc_res.pickle'

if os.path.exists(PICKLE_FILE):
    # A cached frame exists: load it, replacing the in-memory df_hc_res.
    with open(PICKLE_FILE, 'rb') as handle:
        df_hc_res = pickle.load(handle)
else:
    # Nothing cached yet (e.g. first Restart & Run All): persist the frame
    # built by the cells above instead of failing with FileNotFoundError.
    with open(PICKLE_FILE, 'wb') as handle:
        pickle.dump(df_hc_res, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Full claim text at index label 83 among the 'live yoghurt cultures' rows.
is_live_yoghurt = df_hc_res['nutrient'] == 'live yoghurt cultures'
df_hc_res.loc[is_live_yoghurt, 'hc'][83]

'live cultures in yoghurt or fermented milk improve lactose digestion of the product in individuals who have difficulty digesting lactose.'

# Extract benefit and phrase

In [None]:
def _split_on_to(claim):
    """Split a claim at its first ' to ' into (phrase, benefit).

    phrase  -- the word immediately before the first ' to ', plus ' to'
               (e.g. 'contributes to').
    benefit -- everything after the first ' to ', with full stops removed
               and surrounding whitespace stripped.
    Returns ('', '') when the claim contains no ' to '.
    """
    # partition() splits on the FIRST occurrence only. The previous
    # split(' to ')[1] cut the benefit short whenever the claim contained a
    # second ' to ' further on.
    head, sep, tail = claim.partition(' to ')
    if not sep:
        return '', ''
    phrase = head.split(' ')[-1].strip() + ' to'
    benefit = tail.replace('.', '').strip()
    return phrase, benefit


_phrase_benefit = [_split_on_to(s) for s in df_hc_res.hc]
df_hc_res['benefit'] = [benefit for _, benefit in _phrase_benefit]
df_hc_res['phrase'] = [phrase for phrase, _ in _phrase_benefit]

In [None]:
# Claims without a ' to ' construction still have an empty benefit; try a fixed
# list of alternative trigger phrases for them. The list is defined once,
# outside the loop, instead of being rebuilt for every row.
phrases = ['needed for', 'has a role in','enhance','increases','improve','improves','helps']

for i in df_hc_res[df_hc_res.benefit==''].index:
    hc = df_hc_res.loc[i, 'hc']
    # No break on purpose: if several phrases occur in the claim, the one
    # listed last wins (unchanged from the previous behaviour).
    for p in phrases:
        marker = f' {p} '
        if marker in hc:
            df_hc_res.loc[i, 'phrase'] = p
            # maxsplit=1 keeps everything after the first occurrence of the
            # phrase rather than stopping at a second occurrence.
            df_hc_res.loc[i, 'benefit'] = hc.split(marker, 1)[1].replace('.','').strip()
        

In [None]:
# Claims for which neither ' to ' nor any fallback phrase produced a benefit.
df_hc_res.loc[df_hc_res['benefit'] == '', 'hc'].values

array(['in the case of d-tagatose and isomaltulose this should read "other sugars".',
       'sugar-free chewing gum with carbamide neutralises plaque acids more effectively than sugar-free chewing gums without carbamide.'],
      dtype=object)

In [None]:
# Preview only the first rows; the frame holds long list-valued columns
# (pos, hc_tagged) that make a full dump hard to read.
df_hc_res.head(10)

Unnamed: 0,hc_ID,hc,nutrient,condition,keywords,pos,hc_tagged,benefit,phrase
0,0,ala contributes to the maintenance of normal b...,alpha-linolenic acid (ala),The claim may be used only for food which is a...,"ALA, alpha-linolenic acid, omega 3, blood, cho...","[NN, NNS, TO, DT, NN, IN, JJ, NN, NN, NNS, .]","[(ala, NN), (contributes, NNS), (to, TO), (the...",the maintenance of normal blood cholesterol le...,contributes to
1,1,activated charcoal contributes to reducing exc...,activated charcoal,,"charcoal, flatulence","[VBN, NN, NNS, TO, VBG, JJ, NN, IN, VBG, .]","[(activated, VBN), (charcoal, NN), (contribute...",reducing excessive flatulence after eating,contributes to
2,2,barley grain fibre contributes to an increase ...,barley grain fibre,The claim may be used only for food which is h...,"barley, grain, fibre, fiber, faecal bulk, faeces","[NN, NN, NN, VBZ, TO, DT, NN, IN, JJ, NN, .]","[(barley, NN), (grain, NN), (fibre, NN), (cont...",an increase in faecal bulk,contributes to
3,3,beta-glucans contribute to the maintenance of ...,beta-glucans,The claim may be used only for food which cont...,"beta-glucans, blood, cholesterol","[NNS, NN, TO, DT, NN, IN, JJ, NN, NN, NNS, .]","[(beta-glucans, NNS), (contribute, NN), (to, T...",the maintenance of normal blood cholesterol le...,contribute to
4,4,betaine contributes to normal homocysteine met...,betaine,The claim may be used only for food which cont...,"betaine, homocysteine, metabolism","[NN, VBZ, TO, JJ, JJ, NN, .]","[(betaine, NN), (contributes, VBZ), (to, TO), ...",normal homocysteine metabolism,contributes to
5,5,biotin contributes to normal energy-yielding m...,biotin,The claim may be used only for food which is a...,"biotin, vitamin B7, energy, metabolism","[NN, VBZ, TO, JJ, JJ, NN, .]","[(biotin, NN), (contributes, VBZ), (to, TO), (...",normal energy-yielding metabolism,contributes to
6,6,biotin contributes to normal functioning of th...,biotin,The claim may be used only for food which is a...,"biotin, vitamin B7, nervous system","[NN, VBZ, TO, JJ, NN, IN, DT, JJ, NN, .]","[(biotin, NN), (contributes, VBZ), (to, TO), (...",normal functioning of the nervous system,contributes to
7,7,biotin contributes to normal macronutrient met...,biotin,The claim may be used only for food which is a...,"biotin, vitamin B7, metabolism","[NN, VBZ, TO, JJ, JJ, NN, .]","[(biotin, NN), (contributes, VBZ), (to, TO), (...",normal macronutrient metabolism,contributes to
8,8,biotin contributes to normal psychological fun...,biotin,The claim may be used only for food which is a...,"biotin, vitamin B7, psychological, psychology","[NN, VBZ, TO, JJ, JJ, NN, .]","[(biotin, NN), (contributes, VBZ), (to, TO), (...",normal psychological function,contributes to
9,9,biotin contributes to the maintenance of norma...,biotin,The claim may be used only for food which is a...,"biotin, vitamin B7, hair","[NN, NNS, TO, DT, NN, IN, JJ, NN, .]","[(biotin, NN), (contributes, NNS), (to, TO), (...",the maintenance of normal hair,contributes to


In [None]:
# Export the distinct benefit strings, one per row, without the index column.
benefits = df_hc_res['benefit'].unique()
benefits_series = pd.Series(benefits)
benefits_series.to_csv('benefits.csv', index=False)

In [None]:
# Export the combined frame without the index column.
df_hc_res.to_csv('hc_approval.csv', index=False)

In [None]:
# Export the distinct trigger phrases, one per row, without the index column.
phrases = df_hc_res['phrase'].unique()
phrases_series = pd.Series(phrases)
phrases_series.to_csv('phrases.csv', index=False)

In [None]:
# Stem every token of each unique trigger phrase.
# `ps` was never defined anywhere in this notebook, so this cell raised a
# NameError on a fresh kernel. A Porter stemmer is assumed here --
# TODO confirm this is the stemmer that was originally intended.
ps = nltk.stem.PorterStemmer()

# Read the phrases from the frame rather than from `phrases` itself, so that
# re-running the cell does not try to tokenise already-stemmed token lists.
phrases = [[ps.stem(word) for word in word_tokenize(p)] for p in df_hc_res.phrase.unique()]