In [1]:
import re

import xml.etree.ElementTree

# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

import numpy as np
import pandas as pd

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Reading and cleaning data

In [2]:
e = xml.etree.ElementTree.parse('recipeBaseCompulsory_clean.xml').getroot()

recipe_ingredients = []
recipe_title = []

for atype in e.findall('RECIPE'):
    ingredients = []
    
    recipe_title.append(atype.find('TI').text)
    
    for i in atype.findall('IN'):
        try:
            ingredients.append(i.text)
        except Exception:
            pass
    
    recipe_ingredients.append(ingredients)

In [3]:
tokenizer = RegexpTokenizer(r'\w+')
porter = PorterStemmer()
stop_words = stopwords.words("english") 
stop_words.extend(['ounce', 'skinless', 'boneless', 'halves', 'cold', 'sized',
                   'cooked', 'unseasoned', 'colored', 'light', 'medium', 'thinly',
                   'coarsely', 'crushed', 'whole', 'recipe', 'pitted', 'bing'])

recipe_ingredients_clean = []
recipe_title_clean = []

for rec_tit in recipe_title:
    t = rec_tit.lower()
    words = tokenizer.tokenize(t)

    recipe_title_clean.append(' '.join([word for word in words 
                                       if word not in stop_words
                                       and not word.isdigit() 
                                       and len(word) > 3]))

recipe_title_clean = list(filter(None, recipe_title_clean))

    
for rec_ing in recipe_ingredients:
    ingredients_clean = []
    
    for ing in rec_ing:
        t = ing.lower()
        t = re.sub("[\(\[].*?[\)\]]", "", t) # Remove brackets
        t = t.split(',')[0]
        t = t.split(';')[0]
        t = t.split(' or ')[0]

        words = tokenizer.tokenize(t)

        ingredients_clean.append(' '.join([word for word in words 
                                           if word not in stop_words
                                           and not word.isdigit() 
                                           and len(word) > 3]))
        
    ingredients_clean = list(filter(None, ingredients_clean))
    recipe_ingredients_clean.append(ingredients_clean)

<br>
# Association Rules

Una regla de asociación busca regularidades en los datos. Intenta asociar datos, objetos, eventos... que aparecen juntos para buscar patrones, asociaciones o correlaciones.

El recuento ($\sigma$) es el número de veces que aparece un itemset en el dataset.

El soporte de la regla ($s$) es la probabilidad de que una transacción contenga a $X$ y a $Y$:

$$ s(X \rightarrow Y) = \frac{\sigma(X, Y)}{T} $$

La confianza de la regla ($c$) es la probabilidad de que una transacción que contenga a $X$ también contenga a $Y$:

$$ c(X \rightarrow Y) = \frac{\sigma(X, Y)}{\sigma(X)} $$

El algoritmo Apriori sirve para reducir los candidatos a itemset frecuente. Se basa en la pripiedad de que si un itemset no es frecuente, tampoco lo serán sus supersets; mientras que si un itemset es frecuente, también lo serán sus subconjuntos.

http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/

http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

In [4]:
te = TransactionEncoder()
te_ary = te.fit(recipe_ingredients_clean).transform(recipe_ingredients_clean)

df_transactions = pd.DataFrame(te_ary, columns=te.columns_)
df_transactions.head()

Unnamed: 0,accent,acorn squash,across bone inch thick,acrylic paints,active dried yeast,active yeast,active yeast star,active yeast warm water,acuavit,additional butter,...,zampone sausage,zest,zinfandel reduction sauce,zinfandel wine,ziti,zucchini,zucchini pieces,zucchini sliced rings,zucchinis,zuchinni
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
frequent_itemsets = apriori(df_transactions, min_support=0.03, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets[frequent_itemsets.length > 1]

Unnamed: 0,support,itemsets,length
34,0.031565,"[baking powder, flour]",2
35,0.03358,"[baking powder, milk]",2
36,0.051041,"[baking powder, salt]",2
37,0.047011,"[baking powder, sugar]",2
38,0.044997,"[butter, eggs]",2
39,0.063801,"[butter, flour]",2
40,0.057085,"[butter, milk]",2
41,0.098724,"[butter, salt]",2
42,0.065816,"[butter, sugar]",2
43,0.034251,"[butter, vanilla]",2


In [6]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)
rules["antecedant_len"] = rules["antecedants"].apply(lambda x: len(x))
rules

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedant_len
0,(milk),(flour),0.143721,0.149093,0.044325,0.308411,2.068578,0.022897,1.230365,1
1,(flour),(sugar),0.149093,0.258563,0.065816,0.441441,1.707289,0.027266,1.327412,1
2,(vegetable),(salt),0.063801,0.391538,0.030222,0.473684,1.209804,0.005241,1.156078,1
3,(vanilla),(sugar),0.084621,0.258563,0.051713,0.611111,2.363492,0.029833,1.906553,1
4,(cinnamon),(salt),0.055071,0.391538,0.030222,0.54878,1.401602,0.008659,1.348483,1
5,(vanilla),(salt),0.084621,0.391538,0.041639,0.492063,1.256745,0.008507,1.19791,1
6,(cinnamon),(sugar),0.055071,0.258563,0.032908,0.597561,2.311086,0.018669,1.842359,1
7,(vanilla),(butter),0.084621,0.202149,0.034251,0.404762,2.002294,0.017145,1.34039,1
8,"(sugar, flour)",(salt),0.065816,0.391538,0.036938,0.561224,1.433385,0.011168,1.386727,2
9,"(flour, salt)",(sugar),0.084621,0.258563,0.036938,0.436508,1.688209,0.015058,1.31579,2


<br>
# Fish

In [7]:
fish_words = ['fish', 'seafood', 'snapper', 'tuna', 'bluefish', 
              'shark', 'croaker', 'flounder', 'trout']

fish_recipes_idx = [i for i, e in enumerate(recipe_title_clean) for x in fish_words if x in e]
fish_recipes_ingredients = list(np.asarray(recipe_ingredients_clean)[fish_recipes_idx])

In [8]:
te = TransactionEncoder()
te_ary = te.fit(fish_recipes_ingredients).transform(fish_recipes_ingredients)

df_transactions = pd.DataFrame(te_ary, columns=te.columns_)

In [10]:
frequent_itemsets = apriori(df_transactions, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

rules["antecedant_len"] = rules["antecedants"].apply(lambda x: len(x))
rules

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedant_len
0,(cornstarch),(salt),0.086957,0.326087,0.065217,0.75,2.3,0.036862,2.695652,1
1,(vinegar),(salt),0.108696,0.326087,0.065217,0.6,1.84,0.029773,1.684783,1
2,(leaf),(butter),0.065217,0.282609,0.065217,1.0,3.538462,0.046786,inf,1
3,(cornstarch),(vinegar),0.086957,0.108696,0.065217,0.75,6.9,0.055766,3.565217,1
4,(vinegar),(cornstarch),0.108696,0.086957,0.065217,0.6,6.9,0.055766,2.282609,1
5,(white wine),(butter),0.086957,0.282609,0.065217,0.75,2.653846,0.040643,2.869565,1
6,(fish stock),(butter),0.065217,0.282609,0.065217,1.0,3.538462,0.046786,inf,1
7,"(butter, flour)",(salt),0.086957,0.326087,0.065217,0.75,2.3,0.036862,2.695652,2
8,"(butter, salt)",(flour),0.108696,0.152174,0.065217,0.6,3.942857,0.048677,2.119565,2
9,"(flour, salt)",(butter),0.065217,0.282609,0.065217,1.0,3.538462,0.046786,inf,2
