# read csv

In [1]:
import pandas as pd
# Load the full recipe dataset from the Kaggle input mount.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/recipe-dataset-over-2m/recipes_data.csv")

In [2]:
# Number of rows before any cleaning.
print(df.shape[0])

2231142


# remove rows that contain null values

In [3]:
# Drop every row that has at least one missing value in any column.
df = df.dropna(axis="index", how="any")

In [4]:
# Number of rows left after dropping incomplete records.
print(df.shape[0])

2231141


In [5]:
# Preview the first five cleaned rows.
df.head(n=5)

Unnamed: 0,title,ingredients,directions,link,source,NER,site
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""bite size shredded rice biscuits"", ""vanilla""...",www.cookbooks.com
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""cream of mushroom soup"", ""beef"", ""sour cream...",www.cookbooks.com
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""pepper"", ""cream cheese"", ""gar...",www.cookbooks.com
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken gravy"", ""cream of mushroom soup"", ""c...",www.cookbooks.com
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""graham cracker crumbs"", ""powdered sugar"", ""p...",www.cookbooks.com


# load fasttext model

In [6]:
# Download the pre-trained English fastText vectors (Common Crawl).
# https://fasttext.cc/docs/en/crawl-vectors.html

# Load the model with gensim so word mover's distance can be measured.
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html

import fasttext.util
import gensim

# download_model returns the name of the .bin file in the current working
# directory (and skips the download when it already exists), so use that
# instead of a hard-coded absolute path that only works in one environment.
bin_path = fasttext.util.download_model('en', if_exists='ignore')  # English
model = gensim.models.fasttext.load_facebook_model(bin_path)

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

# create counter

In [7]:
class Counter:
    """Singleton call counter whose running total lives on the class.

    Every ``Counter()`` call returns the same object, and the total is
    shared through the class attribute ``_count``.
    """

    _instance = None  # the one shared instance, created on first construction
    _count = 0        # running total, stored at class level

    def __new__(cls):
        """Return the shared instance, creating it on the first call."""
        existing = cls._instance
        if existing is not None:
            return existing
        cls._instance = super().__new__(cls)
        return cls._instance

    def increment(self):
        """Add one to the shared total and return the new value."""
        Counter._count = Counter._count + 1
        return Counter._count

    @classmethod
    def get_count(cls):
        """Return the current total without changing it."""
        return cls._count

    @classmethod
    def reset_count(cls):
        """Set the total back to zero and return it."""
        cls._count = 0
        return cls._count


# Module-wide counter used for progress reporting.
counter = Counter()

In [8]:
# pot library is necessary to execute wmdistance func
!pip install POT

Collecting POT
  Downloading POT-0.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Downloading POT-0.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (835 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m835.4/835.4 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: POT
Successfully installed POT-0.9.4


# define classification logic

In [9]:
from gensim.parsing.preprocessing import STOPWORDS
from typing import List

def preprocess(text: str) -> List[str]:
    """Lowercase and tokenize ``text``, then drop gensim stopwords.

    Returns the remaining tokens in their original order.
    """
    lowered = text.lower()
    return [
        token
        for token in gensim.utils.simple_preprocess(lowered)
        if token not in STOPWORDS
    ]
    
# Seed phrases that anchor each category for the distance comparison.
_CATEGORY_PHRASES = {
    "sweets": "sweets snacks desserts",
    "meals": "meals dishes food",
}
# Token lists for the seed phrases, filled on first use so each phrase is
# tokenized once rather than on every categorize() call.
_category_tokens = {}


def _tokens_for(label: str) -> List[str]:
    """Return the cached token list for the seed phrase of ``label``."""
    if label not in _category_tokens:
        _category_tokens[label] = preprocess(_CATEGORY_PHRASES[label])
    return _category_tokens[label]


def categorize(text: str) -> str:
    """Label ``text`` as "sweets" or "meals" by word mover's distance.

    The text is tokenized with ``preprocess`` and compared against the two
    seed phrases; the category with the smaller distance wins.

    Side effects: bumps the global ``counter`` and prints the running count
    (overwriting the same console line) every 1000 calls.

    Returns:
        "sweets" if the text is strictly closer to the sweets phrase,
        otherwise "meals".
    """
    # increment() already returns the updated count, so read it once.
    count = counter.increment()
    if count % 1000 == 0:
        print(count, end='\r')

    tokens = preprocess(text)

    # Compare distance to "sweets" and "meals"
    dist_sweets = model.wv.wmdistance(tokens, _tokens_for("sweets"))
    dist_meals = model.wv.wmdistance(tokens, _tokens_for("meals"))

    # Ties fall through to "meals".
    # NOTE(review): if `text` yields no tokens, both distances are presumably
    # equal (infinite), so such rows would be labelled "meals". Confirm.
    if dist_sweets < dist_meals:
        return "sweets"
    return "meals"

In [10]:
import time

In [11]:
# Restart the progress counter for this pass.
counter.reset_count()

# perf_counter is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments during this long-running pass.
start = time.perf_counter()
label_with_title = df["title"].apply(categorize)
end = time.perf_counter()
print(end - start)

3090.9321908950806


In [12]:
# Save the title-based labels; the index is not written to the file.
label_with_title.to_csv(path_or_buf="label_with_title.csv", index=False)

In [13]:
# Restart the progress counter for this pass.
counter.reset_count()

# perf_counter is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments during this long-running pass.
start = time.perf_counter()
label_with_directions = df["directions"].apply(categorize)
end = time.perf_counter()
print(end - start)

5347.410380125046


In [14]:
# Save the directions-based labels; the index is not written to the file.
label_with_directions.to_csv(path_or_buf="label_with_directions.csv", index=False)

In [15]:
# Restart the progress counter for this pass.
counter.reset_count()

# perf_counter is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments during this long-running pass.
start = time.perf_counter()
label_with_ner = df["NER"].apply(categorize)
end = time.perf_counter()
print(end - start)

3637.7533571720123


In [16]:
# Save the NER-based labels; the index is not written to the file.
label_with_ner.to_csv(path_or_buf="label_with_ner.csv", index=False)

In [17]:
# Put the three label Series side by side, one column per text source,
# then save the combined table without the index.
all_labels = pd.concat(
    [label_with_title, label_with_directions, label_with_ner],
    axis="columns",
)
all_labels.to_csv(path_or_buf="all_labels.csv", index=False)

In [18]:
# Titles of recipes where exactly two of the three labelings said "sweets"
# (first 30 shown).
df.loc[(all_labels == "sweets").sum(axis="columns") == 2, "title"].head(n=30)

2                           Creamy Corn
7                        Scalloped Corn
14         Pink Stuff(Frozen Dessert)  
16           Easy German Chocolate Cake
19     Eggless Milkless Applesauce Cake
29                       One Hour Rolls
35                        Artichoke Dip
38                      Pear-Lime Salad
39              Watermelon Rind Pickles
44                          Fruit Pizza
54                Dave'S Corn Casserole
66                  Frozen Peach Mousse
68                   Blueberry Surprise
69              Moist Devil'S Food Cake
70                 Dot'S Civil War Cake
79                         Mulled Cider
82         Cranberry-Apple-Orange Salad
91                           Pound Cake
96                 Sweet-N-Sour Chicken
97                             Ambrosia
100                            Pancakes
102              Red Cabbage And Apples
105                 Microwave Spice Tea
112                     Cheese-Ham Ball
118                           Chess Pie
