In [113]:
import re
import xml.etree.ElementTree

# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

import pandas as pd

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Reading and cleaning data

In [103]:
# Parse the recipe file and keep its root element.
e = xml.etree.ElementTree.parse('recipeBaseCompulsory_clean.xml').getroot()

# Build one list of raw ingredient strings per <RECIPE> child of the root,
# in document order. An <IN> element without text has `.text is None`; those
# are skipped explicitly so that only strings reach the cleaning step (which
# calls `.lower()` on every entry). Empty strings are kept here and are
# dropped later, after cleaning.
recipe_ingredients = [
    [
        ingredient.text
        for ingredient in recipe.findall('IN')
        if ingredient.text is not None
    ]
    for recipe in e.findall('RECIPE')
]

In [107]:
tokenizer = RegexpTokenizer(r'\w+')
porter = PorterStemmer()  # Stemming is currently disabled; see the note in the loop below.
stop_words = stopwords.words("english")
# Extra words that describe quantity or preparation rather than the ingredient itself.
stop_words.extend(['ounce', 'skinless', 'boneless', 'halves', 'cold', 'sized',
                   'cooked', 'unseasoned', 'colored', 'light', 'medium', 'thinly',
                   'coarsely', 'crushed', 'whole', 'recipe', 'pitted', 'bing'])

# Set copy of the stop words: membership is tested once per token, and a set
# lookup does not scan the whole list each time.
stop_word_set = set(stop_words)

# Matches a bracketed or parenthesised span, e.g. "(8 ounce)" or "[optional]".
# Written as a raw string so the backslashes reach the regex engine unchanged.
bracketed_text = re.compile(r"[\(\[].*?[\)\]]")


def _keep_word(word):
    """Return True if `word` should be kept as part of an ingredient name.

    A word is kept when it is not a stop word, is not made up solely of
    digits, and is longer than three characters.
    """
    return word not in stop_word_set and not word.isdigit() and len(word) > 3


recipe_ingredients_clean = []

for rec_ing in recipe_ingredients:
    ingredients_clean = []

    for ing in rec_ing:
        if ing is None:
            # An ingredient element without text; there is nothing to clean.
            continue

        t = ing.lower()
        t = bracketed_text.sub("", t)  # Remove brackets and their contents
        # Keep only the text before the first ',', ';' or ' or '.
        t = t.split(',')[0]
        t = t.split(';')[0]
        t = t.split(' or ')[0]
        # Stemming (porter.stem) would reduce each word to its root or base;
        # it is not applied here.

        words = tokenizer.tokenize(t)

        ingredients_clean.append(' '.join(word for word in words if _keep_word(word)))

    # Drop ingredients that became empty strings after filtering.
    ingredients_clean = list(filter(None, ingredients_clean))
    recipe_ingredients_clean.append(ingredients_clean)

recipe_ingredients_clean

[['apples', 'oranges', 'lemons', 'dill pickles', 'saladitos'],
 ['hamburger',
  'mild',
  'onion',
  'cloves garlic',
  'kidney beans',
  'uncooked pasta',
  'mushrooms',
  'corn',
  'tomato sauce',
  'italian seasoning',
  'pepper',
  'pepper',
  'basil',
  'favorite seasonings'],
 ['onions',
  'lean beef brisket',
  'cloves garlic',
  'ground pork',
  'green chiles',
  'tomato sauce',
  'tomatoes',
  'cumin',
  'salt',
  'oregano',
  'mustard',
  'tequila',
  'beer',
  'chili powder',
  'beef bouillon cubes'],
 ['crescent rolls',
  'cream cheese',
  'mayo',
  'hidden valley ranch dressing',
  'following vegetables'],
 ['graham cracker crumbs',
  'butter',
  'powdered sugar',
  'cream cheese',
  'eggs',
  'sugar',
  'vanilla',
  'sour cream',
  'canned filling'],
 ['yeast', 'flour', 'salt', 'sugar', 'dough enhancer', 'eggs', 'water'],
 ['rice',
  'urad',
  'chana',
  'yellow split peas',
  'salt',
  'chili powder',
  'onion',
  'carrot',
  'coconut'],
 ['milk', 'eggs', 'flour', 'salt'

<br>
# Apriori

http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/

In [114]:
# One-hot encode the recipes: each row is a recipe, each column an ingredient,
# and a cell is True when that recipe lists that ingredient.
te = TransactionEncoder()
te.fit(recipe_ingredients_clean)  # learn the ingredient vocabulary
te_ary = te.transform(recipe_ingredients_clean)  # boolean recipe-by-ingredient array

df_transactions = pd.DataFrame(data=te_ary, columns=te.columns_)
df_transactions.head()

Unnamed: 0,accent,acorn squash,across bone inch thick,acrylic paints,active dried yeast,active yeast,active yeast star,active yeast warm water,acuavit,additional butter,...,zampone sausage,zest,zinfandel reduction sauce,zinfandel wine,ziti,zucchini,zucchini pieces,zucchini sliced rings,zucchinis,zuchinni
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [129]:
# Mine ingredient combinations whose support is at least 0.03; itemsets are
# reported with ingredient names rather than column indices.
frequent_itemsets = apriori(
    df_transactions,
    use_colnames=True,
    min_support=0.03,
)

In [131]:
# Store the number of ingredients in each itemset, then show only the
# combinations that contain more than one ingredient.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
frequent_itemsets[frequent_itemsets['length'] > 1]

Unnamed: 0,support,itemsets,length
34,0.031565,"[baking powder, flour]",2
35,0.03358,"[baking powder, milk]",2
36,0.051041,"[baking powder, salt]",2
37,0.047011,"[baking powder, sugar]",2
38,0.044997,"[butter, eggs]",2
39,0.063801,"[butter, flour]",2
40,0.057085,"[butter, milk]",2
41,0.098724,"[butter, salt]",2
42,0.065816,"[butter, sugar]",2
43,0.034251,"[butter, vanilla]",2


<br>
# Association Rules

http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

In [132]:
# Derive association rules from the frequent itemsets, keeping only rules
# whose confidence is at least 0.3.
rules = association_rules(
    frequent_itemsets,
    min_threshold=0.3,
    metric="confidence",
)
rules

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(water),(salt),0.204164,0.391538,0.098724,0.483553,1.235008,0.018786,1.178169
1,(baking powder),(flour),0.07186,0.149093,0.031565,0.439252,2.946156,0.020851,1.51745
2,(eggs),(sugar),0.129617,0.258563,0.064473,0.497409,1.923747,0.030959,1.475231
3,"(butter, sugar)",(salt),0.065816,0.391538,0.039624,0.602041,1.537631,0.013854,1.528956
4,"(sugar, salt)",(butter),0.126259,0.202149,0.039624,0.31383,1.552467,0.014101,1.162759
5,"(butter, salt)",(sugar),0.098724,0.258563,0.039624,0.401361,1.552275,0.014098,1.238537
6,(vanilla),(salt),0.084621,0.391538,0.041639,0.492063,1.256745,0.008507,1.19791
7,(milk),(sugar),0.143721,0.258563,0.057085,0.397196,1.536169,0.019924,1.229981
8,(butter),(sugar),0.202149,0.258563,0.065816,0.325581,1.259197,0.013548,1.099372
9,(cinnamon),(salt),0.055071,0.391538,0.030222,0.54878,1.401602,0.008659,1.348483


In [133]:
# Add the number of items on the left-hand side of each rule.
# Older mlxtend releases name the column "antecedants" (misspelled), while
# newer ones use "antecedents"; pick whichever is present so this cell does
# not raise KeyError after a library upgrade. The new column keeps its
# original name, "antecedant_len".
antecedent_col = "antecedents" if "antecedents" in rules.columns else "antecedants"
rules["antecedant_len"] = rules[antecedent_col].apply(len)
rules

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedant_len
0,(water),(salt),0.204164,0.391538,0.098724,0.483553,1.235008,0.018786,1.178169,1
1,(baking powder),(flour),0.07186,0.149093,0.031565,0.439252,2.946156,0.020851,1.51745,1
2,(eggs),(sugar),0.129617,0.258563,0.064473,0.497409,1.923747,0.030959,1.475231,1
3,"(butter, sugar)",(salt),0.065816,0.391538,0.039624,0.602041,1.537631,0.013854,1.528956,2
4,"(sugar, salt)",(butter),0.126259,0.202149,0.039624,0.31383,1.552467,0.014101,1.162759,2
5,"(butter, salt)",(sugar),0.098724,0.258563,0.039624,0.401361,1.552275,0.014098,1.238537,2
6,(vanilla),(salt),0.084621,0.391538,0.041639,0.492063,1.256745,0.008507,1.19791,1
7,(milk),(sugar),0.143721,0.258563,0.057085,0.397196,1.536169,0.019924,1.229981,1
8,(butter),(sugar),0.202149,0.258563,0.065816,0.325581,1.259197,0.013548,1.099372,1
9,(cinnamon),(salt),0.055071,0.391538,0.030222,0.54878,1.401602,0.008659,1.348483,1
