In [33]:
import os
import datetime
import pandas as pd
from pandas import json_normalize
import numpy as np
import re
import json
import requests
import time
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import random
import pprint
from string import punctuation
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD

In [34]:
# define functions

# Materialise the punctuation characters once as a set (fast membership
# tests) and bind the same set to both names used in this notebook.
tw_punct = punctuation = set(punctuation)


def descriptive_stats(tokens, verbose=True) :
    """
        Summarise a list of tokens.

        tokens: list of string tokens (may be empty).
        verbose: when True, print the four statistics and the five most
        common tokens with their counts.

        Returns a list [num_tokens, num_unique_tokens, lexical_diversity,
        num_characters], where lexical_diversity is the number of unique
        tokens divided by the number of tokens (0.0 for an empty list) and
        num_characters is the summed length of all tokens.
    """
    num_tokens = len(tokens)
    num_unique_tokens = len(set(tokens))
    # An empty token list has no diversity to measure; report 0.0 instead
    # of dividing by zero.
    lexical_diversity = num_unique_tokens / num_tokens if num_tokens else 0.0
    num_characters = sum(len(token) for token in tokens)

    if verbose :
        print(f"There are {num_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")

        # print the five most common tokens (fewer if the data has fewer)
        top_5_tokens = Counter(tokens).most_common(5)
        print("Top 5 most common tokens:")
        for token, count in top_5_tokens:
            print(f"{token}: {count} occurrences")

    return [num_tokens, num_unique_tokens,
            lexical_diversity,
            num_characters]
def remove_stop(tokens, stopwords=None) :
    """
        Return the tokens that are not stop words, keeping their order.

        tokens: iterable of tokens to filter.
        stopwords: optional collection of stop words to drop. When omitted,
        the notebook-level `sw` collection is used, so `sw` must be defined
        before calling the function without this argument.
    """
    if stopwords is None :
        stopwords = sw
    return [token for token in tokens if token not in stopwords]

def remove_punctuation(text, punct_set=tw_punct) :
    """
        Strip punctuation characters from a string.

        text: the input string, scanned one character at a time.
        punct_set: collection of characters to drop; defaults to tw_punct.

        Returns a new string built from the characters of text that are not
        in punct_set, in their original order.
    """
    kept = []
    for char in text :
        if char in punct_set :
            continue
        kept.append(char)
    return "".join(kept)

def tokenize(text) :
    """
        Break text into tokens by calling its split() method with no
        arguments (whitespace splitting for a str). This is used instead of
        the book's tokenize function, which would drop tokens such as
        '#hashtag' or '2A' that are needed for Twitter data.
    """
    return text.split()

def prepare(text, pipeline) :
    """
        Run text through a sequence of transforms.

        text: the value to process; it is converted with str() first.
        pipeline: iterable of one-argument callables, applied in order, each
        receiving the previous callable's result.

        Returns the result of the last transform (or str(text) when the
        pipeline is empty).
    """
    result = str(text)
    for step in pipeline :
        result = step(result)
    return result
def display_topics(model, features, no_top_words=5):
    """
    Print the highest-weighted features of every topic of a fitted model.

    model: object exposing `components_`, iterated one row (topic) at a time;
        each row must support sum() and argsort().
    features: sequence mapping a column index of `components_` to its name.
    no_top_words: maximum number of features to print per topic; fewer are
        printed when a topic row has fewer entries.

    Each printed line shows the feature name and the absolute value of its
    weight expressed as a percentage of the row sum.
    """
    for topic, words in enumerate(model.components_):
        total = words.sum()
        largest = words.argsort()[::-1] # invert sort order
        print("\nTopic %02d" % topic)
        # Never index past the end of the row when fewer features exist
        # than were requested.
        for i in range(0, min(no_top_words, len(largest))):
            print("  %s (%2.2f)" % (features[largest[i]], abs(words[largest[i]]*100.0/total)))

In [35]:
from matplotlib import pyplot as plt

def wordcloud(word_freq, title=None, max_words=200, stopwords=None):
    """
    Draw a word cloud of token frequencies on the current matplotlib axes.

    word_freq: token -> frequency mapping; a pandas Series is converted to a
        Counter with missing frequencies replaced by 0.
    title: text passed to plt.title().
    max_words: upper bound on the number of words handed to WordCloud.
    stopwords: optional collection of tokens to leave out of the cloud.
    """
    # normalise the input into a token -> frequency mapping
    if type(word_freq) == pd.Series:
        frequencies = Counter(word_freq.fillna(0).to_dict())
    else:
        frequencies = word_freq

    # drop stop words from the frequency mapping
    if stopwords is not None:
        frequencies = dict(
            (token, freq)
            for token, freq in frequencies.items()
            if token not in stopwords
        )

    cloud = WordCloud(
        width=800,
        height=400,
        background_color="black",
        colormap="Paired",
        max_font_size=150,
        max_words=max_words,
    )
    cloud.generate_from_frequencies(frequencies)

    # render the cloud without axis ticks
    plt.title(title)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")


def count_words(df, column='titles_tokens', preprocess=None, min_freq=2):
    """
    Count token occurrences across one column of a data frame.

    df: data frame holding the documents.
    column: name of the column to read; each value is counted as an iterable
        of tokens unless `preprocess` is given.
    preprocess: optional callable applied to each value to obtain its tokens.
    min_freq: tokens occurring fewer times than this are dropped.

    Returns a data frame indexed by 'token' with a single 'freq' column,
    sorted by descending frequency.
    """
    # tally the tokens of every document in the column
    counter = Counter()
    for doc in df[column]:
        counter.update(doc if preprocess is None else preprocess(doc))

    # turn the tally into a data frame and keep the frequent tokens only
    freq_df = pd.DataFrame.from_dict(counter, orient='index', columns=['freq'])
    freq_df = freq_df[freq_df['freq'] >= min_freq]
    freq_df.index.name = 'token'

    return freq_df.sort_values(by='freq', ascending=False)

In [36]:
# data import: read the recipe table from a CSV file given by a relative path
recipes = pd.read_csv("all_recipes.csv")
# Disabled leftovers below. The column names they use ('title',
# 'ingredients', 'step') do not match the columns displayed by
# recipes.head() in the next cell (Recipe, URL, Ingredients).
#recipes=pd.DataFrame(recipes)
#recipes['title']=recipes['title'].astype(str)
#recipes['ingredients']=recipes['ingredients'].astype(str)
#recipes['step']=recipes['step'].astype(str)
#recipes.dtypes

In [37]:
# preview the first five rows to check which columns were loaded
recipes.head(5)

Unnamed: 0,Recipe,URL,Ingredients
0,Michela’s tuna with cannellini beans (no cook),http://www.jamieoliver.com/recipes/fish-recipe...,1 x 400 g tin of cannellini beans\n1 x 80 g ti...
1,Haddock with cannellini beans & artichokes,https://www.bbcgoodfood.com/recipes/haddock-ca...,"400g can cannellini beans , drained and rinsed..."
2,Grilled Bruschetta - Cannellini Beans with Fet...,https://food52.com/recipes/10069-grilled-brusc...,1 loaf bread\n1 15 oz. can cannellini beans\n8...
3,Escarole with Cannellini Beans,https://www.epicurious.com/recipes/food/views/...,"1 sweet onion, halved\n1 head of garlic, halve..."
4,Broccoli Rabe with Cannellini Beans,http://www.eatingwell.com/recipe/255758/brocco...,"1 bunch broccoli rabe (1-1¼ pounds), trimmed a..."


In [38]:
# store tokens in new dataframe 'df'
df = pd.DataFrame()

# Build one '<column>_tokens' column per text column. Every value goes
# through the same steps: fold to lowercase, remove punctuation, then split
# on whitespace.
# NOTE(review): removing punctuation from URLs deletes every separator, so
# each URL ends up as a single token - confirm that this is intended.
for source_col in ['Recipe', 'Ingredients', 'URL']:
    df[f'{source_col}_tokens'] = (
        recipes[source_col]
        .fillna('')                  # a missing value becomes an empty token list
        .astype(str)
        .str.lower()
        .apply(remove_punctuation)
        .apply(tokenize)             # tokenize each string, not the .str accessor
    )

df

Unnamed: 0,Recipe_tokens,Ingredients_tokens,URL_tokens
0,"[michela’s, tuna, with, cannellini, beans, no,...","[1, x, 400, g, tin, of, cannellini, beans, 1, ...",[httpwwwjamieolivercomrecipesfishrecipesmichel...
1,"[haddock, with, cannellini, beans, artichokes]","[400g, can, cannellini, beans, drained, and, r...",[httpswwwbbcgoodfoodcomrecipeshaddockcannellon...
2,"[grilled, bruschetta, cannellini, beans, with,...","[1, loaf, bread, 1, 15, oz, can, cannellini, b...",[httpsfood52comrecipes10069grilledbruschettaca...
3,"[escarole, with, cannellini, beans]","[1, sweet, onion, halved, 1, head, of, garlic,...",[httpswwwepicuriouscomrecipesfoodviewsescarole...
4,"[broccoli, rabe, with, cannellini, beans]","[1, bunch, broccoli, rabe, 11¼, pounds, trimme...",[httpwwweatingwellcomrecipe255758broccolirabew...
...,...,...,...
1015,"[saucy, pita, dippers]","[1, wholewheat, pita, bread, ¼, cup, spaghetti...",[httpwwweatingwellcomrecipe259417saucypitadipp...
1016,"[parmesan, pita, crisps]","[1, 6inch, wholewheat, pita, split, 1, tablesp...",[httpswwwrealsimplecomfoodrecipesbrowseallreci...
1017,"[pita, toasts]","[3, pita, breads, 6, tablespoons, unsalted, bu...",[httpwwwcookstrcomrecipespitatoastsdebbiemeyer...
1018,"[pita, nachos]","[1, regular, size, 6inch, wholewheat, pita, ⅓,...",[httpwwwdelishcomcookingrecipeideasrecipesa169...


In [39]:
# flatten the per-recipe title token lists into one corpus-wide list
Recipe_combined_tokens = [
    tok
    for title_tokens in df['Recipe_tokens']
    for tok in title_tokens
]

# print and return summary statistics for the recipe titles
descriptive_stats(Recipe_combined_tokens)

There are 3771 tokens in the data.
There are 907 unique tokens in the data.
There are 22353 characters in the data.
The lexical diversity is 0.241 in the data.
Top 5 most common tokens:
with: 208 occurrences
and: 130 occurrences
recipes: 83 occurrences
chicken: 55 occurrences
beans: 51 occurrences


[3771, 907, 0.24051975603288253, 22353]

In [40]:
# number of rows in the token dataframe
len(df)

1020

In [41]:
# Fitting an LDA Model
# Both vectorizers are fitted on the raw recipes['Ingredients'] text (not on
# the token lists stored in df) and share the same settings: English stop
# words removed, terms kept only if they appear in at least 5 documents
# (min_df=5) and in at most 70% of the documents (max_df=0.7).

# Count Vectorizer
count_recipe_vectorizer = CountVectorizer(stop_words='english', min_df=5, max_df=0.7)
count_recipe_vectors = count_recipe_vectorizer.fit_transform(recipes['Ingredients'])

# TF-IDF Vectorizer
# NOTE(review): tfidf_recipe_vectors is not used by any cell visible here -
# confirm whether a later cell needs it.
tfidf_recipe_vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.7)
tfidf_recipe_vectors = tfidf_recipe_vectorizer.fit_transform(recipes['Ingredients'])

# LDA Model: 5 topics, fixed random_state so the fit is repeatable.
# W holds the per-document output of fit_transform, H the fitted components_
# (one row per topic).
lda_recipe_model = LatentDirichletAllocation(n_components=5, random_state=314)
W_lda_recipe_matrix = lda_recipe_model.fit_transform(count_recipe_vectors)
H_lda_recipe_matrix = lda_recipe_model.components_


# Display topics for Count Vectorizer
# NOTE(review): the printed topics are dominated by measurement words (cup,
# teaspoon, tsp, tbsp); consider adding them to the stop-word list.
print("Topics for Count Vectorizer:")
display_topics(lda_recipe_model, count_recipe_vectorizer.get_feature_names_out())


Topics for Count Vectorizer:

Topic 00
  butter (3.27)
  cup (3.07)
  chopped (2.54)
  inch (2.18)
  grated (2.13)

Topic 01
  cup (11.27)
  teaspoon (5.01)
  cups (4.94)
  sugar (4.42)
  tablespoons (3.45)

Topic 02
  tsp (5.34)
  tbsp (5.25)
  salt (2.73)
  oil (2.69)
  powder (1.74)

Topic 03
  cup (6.47)
  chopped (5.11)
  teaspoon (3.78)
  fresh (3.39)
  finely (2.26)

Topic 04
  pepper (3.91)
  salt (3.86)
  oil (3.57)
  teaspoon (3.42)
  olive (2.90)


In [42]:
# Fitting an LSA Model
# TruncatedSVD with 5 components is fitted on the count vectors built for the
# LDA model, with the same fixed random_state.
# NOTE(review): tfidf_recipe_vectors exists but is not used here - confirm
# that fitting LSA on raw counts rather than TF-IDF weights is intended.

svd_recipe_model = TruncatedSVD(n_components = 5, random_state=314)
W_svd_recipe_matrix = svd_recipe_model.fit_transform(count_recipe_vectors)
H_svd_recipe_matrix = svd_recipe_model.components_
# Display topics for Count Vectorizer
# NOTE(review): display_topics prints abs(weight * 100 / row sum); the values
# printed for Topic 04 add up to well over 100, so presumably the rows mix
# positive and negative weights and the numbers are not true shares - verify.
display_topics(svd_recipe_model, count_recipe_vectorizer.get_feature_names_out())


Topic 00
  cup (5.25)
  teaspoon (4.02)
  chopped (2.79)
  salt (2.73)
  tablespoons (2.41)

Topic 01
  cup (28.35)
  sugar (4.58)
  water (1.98)
  cups (1.80)
  coconut (1.72)

Topic 02
  teaspoon (31.33)
  tablespoon (4.22)
  seeds (3.98)
  ground (2.74)
  dried (2.66)

Topic 03
  chopped (31.56)
  finely (10.02)
  teaspoon (9.67)
  cup (6.55)
  red (4.86)

Topic 04
  tsp (78.47)
  tbsp (67.02)
  teaspoon (40.71)
  powder (26.63)
  cup (23.03)
