## Exploring CorEx on Recipe Topic Generation

This notebook follows the CorEx topic model example notebook ([guide](https://github.com/gregversteeg/corex_topic/blob/master/corextopic/example/corex_topic_example.ipynb)).



In [3]:
import numpy as np
import pandas as pd
import scipy.sparse as ss
import matplotlib.pyplot as plt

import corextopic.corextopic as ct
import corextopic.vis_topic as vt # jupyter notebooks will complain matplotlib is being loaded twice

from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

In [4]:
# Load the cleaned recipe sample used throughout this notebook.
recipes_csv = 'Recipe_Recommendation/dataset/Recipe_sample_dataset_small_heroclean.csv'
df = pd.read_csv(recipes_csv)

In [5]:
# Remove the leftover index columns written out by earlier CSV exports.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [6]:
# Preview the first five recipes to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,title,ingredients,directions,link,source,NER,clean_ingredients,clean_directions,clean_ingredients_spice_r,clean_directions_spice_r
0,Apple Dip,"[""1 (7 oz.) jar Marshmallow Creme"", ""1 (8 oz.)...","[""Bring cream cheese to room temperature. Mix ...",www.cookbooks.com/Recipe-Details.aspx?id=980881,Gathered,"[""Marshmallow Creme"", ""cream cheese""]",jar marshmallow creme pkg cream cheese philade...,bring cream cheese room temperature mix marshm...,jar marshmallow creme cream cheese philadel...,bring cream cheese temperature mix marshmallo...
1,Wasabi Whipped Potatoes,"[""1 1/2 lb. russet or Yukon gold potatoes, pee...","[""Place potatoes in large saucepan, cover with...",www.vegetariantimes.com/recipe/wasabi-whipped-...,Recipes1M,"[""russet"", ""low-fat milk"", ""butter"", ""wasabi p...",russet yukon gold potato inch cube low fat m...,place potato large saucepan cover cold water s...,russet yukon gold potato fat milk rice ...,place potato saucepan cold rinse away star...
2,Caramel Breakfast Rolls,"[""2 (1 lb.) loaves frozen bread dough, thawed""...","[""Cut 1 loaf of bread into small pieces; place...",www.cookbooks.com/Recipe-Details.aspx?id=537560,Gathered,"[""bread"", ""brown sugar"", ""regular vanilla pudd...",loaf bread dough brown sugar pkg regular van...,loaf bread small piece place piece greased x x...,loaf frozen bread dough firmly brown sugar...,loaf bread place greased combine brow...
3,Egg And Sausage Muffins,"[""1/4 cup cream (half & half)"", ""12 large eggs...","[""Preheat oven to 350 \u00b0F (175 \u00b0C). S...",www.epicurious.com/recipes/member/views/egg-an...,Gathered,"[""cream"", ""eggs"", ""pepper"", ""salt"", ""pork saus...",cream half half large egg pepper salt pork...,preheat oven f spray muffin tin spray like pa...,cream egg pork sausage,preheat oven f muffin tin pam bundt mu...
4,Taco Soup,"[""2 lb. lean ground beef"", ""1 small onion, cho...","[""Brown ground beef and onions; drain fat."", ""...",www.cookbooks.com/Recipe-Details.aspx?id=288957,Gathered,"[""lean ground beef"", ""onion"", ""pintos"", ""kidne...",lean ground beef small onion bean l bean t...,brown ground beef onion fat boil heat simmer ...,beef onion pinto kidney bean l lim...,brown beef onion fat remaining bring boil ...


In [10]:
# Drop the single record with index label 10538 before vectorizing.
# NOTE(review): the reason this row is excluded is not shown in this notebook —
# presumably a malformed/empty record; confirm and document it.
# errors='ignore' plus rebinding makes the cell safe to re-run (the original
# in-place drop raised KeyError on a second execution).
df = df.drop(index=10538, errors='ignore')

In [11]:
# Build a binary bag-of-words representation of the spice-stripped ingredient
# text: English stop words removed, vocabulary capped at 20,000 terms, and each
# entry recording presence/absence rather than a count.
vectorizer = CountVectorizer(
    binary=True,
    max_features=20000,
    stop_words='english',
)

# Fit the vocabulary and store the result as a SciPy CSR sparse matrix.
doc_word = ss.csr_matrix(
    vectorizer.fit_transform(df['clean_ingredients_spice_r'])
)

doc_word.shape # n_docs x m_words

(22310, 6719)

In [12]:
# Get words that label the columns (needed to extract readable topics and make anchoring easier).
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and only fall back on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = list(np.asarray(vectorizer.get_feature_names()))

In [13]:
# Discard vocabulary entries that consist solely of digits, keeping the
# matrix columns and the word labels aligned.
not_digit_inds = [pos for pos, token in enumerate(words) if not token.isdigit()]
doc_word = doc_word[:, not_digit_inds]
# Reuse the kept column positions so the labels match the sliced matrix.
words = [words[pos] for pos in not_digit_inds]

doc_word.shape # n_docs x m_words

(22310, 6719)

In [24]:
# Fit a 20-topic CorEx model on the binary document-word matrix.
# A fixed seed makes the fit reproducible across kernel restarts, matching the
# seeded 10-topic model fitted later in this notebook.
topic_model = ct.Corex(n_hidden=20, words=words, max_iter=200, verbose=False, seed=1)
topic_model.fit(doc_word, words=words);

In [25]:
# Print all topics from the CorEx topic model.
# Each topic is a sequence of tuples whose first element is the word; the
# remaining fields are not needed for display. Indexing the first element
# (instead of unpacking `zip(*topic)` into three names) avoids a ValueError
# when a topic has no words assigned and does not depend on the tuple width.
topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ', '.join(topic_words))

0: onion, chicken, green, tomato, soup, celery, mushroom, beef, broth, breast
1: flour, purpose, egg, soda, powder, butter, shortening, yeast, buttermilk, self
2: vanilla, chocolate, extract, chip, cocoa, graham, semi, powdered, unsweetened, pudding
3: oil, olive, vegetable, virgin, black, red, canola, wine, margarine, shallot
4: cheese, cheddar, parmesan, cream, jack, cottage, monterey, swiss, velveeta, sour
5: smith, granny, old, creme, liqueur, fraiche, fashioned, tequila, day, bitter
6: whip, cool, pineapple, orange, strawberry, jello, gelatin, jell, cherry, mandarin
7: cilantro, tortilla, jalapeno, salsa, chile, lettuce, taco, corn, fish, avocado
8: juice, lemon, lime, zest, juiced, rind, squeezed, wedge, peel, zested
9: freshly, artichoke, heart, stem, white, parmigiano, reggiano, sea, leek, pine
10: sauce, mustard, dijon, pork, steak, ketchup, hot, chili, roast, tabasco
11: shrimp, noodle, deveined, pasta, sesame, sprout, lasagna, scallion, penne, chinese
12: sugar, brown, granu

In [26]:
# Display the `tc` attribute of the fitted 20-topic model.
# NOTE(review): `tc` is understood to be CorEx's overall total correlation
# score — see the linked guide to confirm.
topic_model.tc

5.09000239940469

In [24]:
# Fit a smaller, seeded CorEx model with 10 latent topics on the same
# document-word matrix.
topic_model10 = ct.Corex(
    n_hidden=10,
    max_iter=200,
    seed=1,
    verbose=False,
    words=words,
)
topic_model10.fit(doc_word, words=words);

In [25]:
# Print all topics from the 10-topic CorEx model.
# Each topic is a sequence of tuples whose first element is the word; the
# remaining fields are not needed for display. Indexing the first element
# (instead of unpacking `zip(*topic)` into three names) avoids a ValueError
# when a topic has no words assigned and does not depend on the tuple width.
topics10 = topic_model10.get_topics()
for n,topic in enumerate(topics10):
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ', '.join(topic_words))

0: sugar, onion, vanilla, chicken, baking, flour, soda, green, chocolate, cheese
1: cup, tablespoons, teaspoon, tsp, cups, tbsp, tablespoon, teaspoons, ounces, oz
2: pepper, garlic, minced, black, ground, cloves, salt, tomatoes, sauce, chopped
3: olive, oil, fresh, virgin, freshly, extra, kosher, dried, cilantro, thyme
4: cut, leaves, bay, wine, leaf, inch, soy, pieces, sesame, stock
5: red, vinegar, butter, milk, beans, eggs, chili, mustard, peppers, melted
6: whip, pie, cool, pudding, graham, mix, whipped, cream, crust, cracker
7: purpose, room, temperature, packed, unsalted, brown, oats, granulated, firmly, yeast
8: juice, lemon, orange, lime, fat, low, zest, sodium, yogurt, reduced
9: thinly, sliced, peeled, large, spray, quartered, squash, vegetable, yellow, cored


In [26]:
# Display the `tc` attribute of the fitted 10-topic model.
# NOTE(review): `tc` is understood to be CorEx's overall total correlation
# score — see the linked guide to confirm.
topic_model10.tc

7.993253174446666

#### Notes:

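Trying hierarchical

*(Note: the next cell fits an anchored CorEx model via `anchors=` / `anchor_strength=`; it does not build a topic hierarchy.)*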

In [47]:
# Fit a 10-topic CorEx model in which three topics are nudged toward
# hand-picked anchor words (anchor_strength=2).
# A fixed seed keeps the fit reproducible across kernel restarts, matching the
# seeded 10-topic model fitted earlier in this notebook.
anchor_words = [
    ['sugar', 'soda', 'flour', 'powder', 'vanilla'],
    ['tofu'],
    ['chicken', 'beef', 'pork', 'seafood', 'shrimp', 'crab', 'fish'],
]
topic_model3 = ct.Corex(n_hidden=10, words=words, seed=1)
topic_model3.fit(doc_word, words=words, anchors=anchor_words, anchor_strength=2);

In [48]:
# Display the `tc` attribute of the fitted anchored model.
# NOTE(review): `tc` is understood to be CorEx's overall total correlation
# score — see the linked guide to confirm.
topic_model3.tc

9.457210363725316

In [49]:
# Print all topics from the CorEx topic model
topics3 = topic_model3.get_topics()
for n,topic in enumerate(topics3):
    topic_words,_,_ = zip(*topic)
    print('{}: '.format(n) + ', '.join(topic_words))

0: sugar, vanilla, flour, soda, baking, pepper, eggs, extract, powder, butter
1: olive, oil, fresh, red, black, cloves, virgin, freshly, leaves, extra
2: chicken, beef, garlic, sauce, pork, broth, minced, shrimp, ground, lb
3: fat, low, free, sodium, spray, reduced, cooking, use, recipe, follows
4: onion, green, chopped, tomatoes, sliced, onions, diced, celery, tomato, medium
5: whip, salt, cool, pudding, graham, mix, cake, whipped, cream, pineapple
6: juice, lemon, orange, lime, zest, rind, peel, juiced, mint, apple
7: cheese, cheddar, parmesan, shredded, grated, mozzarella, sharp, dressing, italian, jack
8: parsley, thyme, dried, leaf, bay, dijon, mustard, basil, flat, dill
9: cup, tablespoons, teaspoon, tsp, cups, tablespoon, tbsp, teaspoons, ounces, oz
