In [11]:
%time
%load_ext autotime
%load_ext autoreload
%autoreload 2

import os
import sys
parent_dir = os.path.abspath(os.getcwd()+'/..')+'/' 
sys.path.append(parent_dir) 

import pandas as pd
import numpy as np
from copy import deepcopy

from utils.path import dir_HugeFiles
from utils.preprocessing import load
from utils.save import make_dir, save_pickle, load_pickle, auto_save_csv, print_time, auto_save_pickle

from models.fmin2 import fmin2
from models.nested_validation import *
from models.features import fixed_makedata, salvador_wrap, pretrained_wrap, FastText, Word2Vec, glove_wrap, doc2vec_wrap
from models.display import pickle2df
import multiprocessing

# models
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

pd.set_option('display.max_colwidth', -1)

import warnings
warnings.filterwarnings("ignore")

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.06 µs
The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
time: 65.9 ms


In [2]:
### load the Recipe55k and derive Recipe54k
dic = load('../data/dic_20190819.pickle')

ls = [i for i,v in dic.items() if len(v['ingredients'])>1]
print('drop %d recipes with less than 2 ingredients' %(len(dic)-len(ls)))
ls = [i for i in ls if len(dic[i]['directions'])>1]
print('furthur drop %d recipes with less than 2 instructions' %(len(dic)-len(ls)))

exist
drop 46 recipes with less than 2 ingredients
furthur drop 1026 recipes with less than 2 instructions
time: 14.2 s


In [5]:
corp = fixed_makedata(dic, ls, tag = 'GI')

time: 3min 38s


In [6]:
space = {'params':{'window': [5, 15, 25, 40],
                'alpha': [0.025, 0.5],
                'iter': [25, 100]}
        }

time: 31.7 ms


In [7]:
def recipe_word_embedding(fn, modelname):
    '''
    fn: Word2Vec of gensim ... etc
    modelname: string: word2vec ...etc
    '''
    max_evals = 8
    np.random.seed(5)
    rd = np.random.randint(30, size=1000)
    historical = []
    j, num, best_loss = 0, 0, 0
    p2 = space['params']
    while num < max_evals:
        new_p2 = {}
        for k, v in p2.items():
            if type(v) == list:
                rand_idx = int(rd[j]%len(v))
                new_p2[k] = v[rand_idx]
                j+=1
            else:
                new_p2[k] = v
            if j == 900:
                j = 0
        if new_p2 not in historical:
            new_space = space
            new_space.update({'p2':new_p2})
            historical.append(new_p2)
            # use new_space and do something            
            num+=1
            print(num, new_p2)
            params_default = {'size': 300, 'window': 40, 'min_count': 5,'workers': max(1, multiprocessing.cpu_count() - 10)}
            params_default.update(new_p2)

            if modelname in ['glove']:
                model = fn(params_default)
                model.fit(corp.corpus_list)
                save_pickle('../data/%s_%.2d'%(modelname,num), model)
                
            if modelname in ['doc2vec']:
                params = {'main':params_default}
                model = fn(params)
                model.fit(corp.corpus_list)
                save_pickle('../data/%s_%.2d'%(modelname,num), model)
                
            if modelname in ['word2vec','fasttext']:
                model = fn(corp.corpus_list, **params_default)
                model.wv.save_word2vec_format('../data/%s_%.2d.bin'%(modelname,num), binary=True)

time: 90.2 ms


In [9]:
recipe_word_embedding(Word2Vec, 'word2vec')

1 {'window': 40, 'alpha': 0.025, 'iter': 100}
2 {'window': 15, 'alpha': 0.025, 'iter': 25}
3 {'window': 5, 'alpha': 0.5, 'iter': 25}
4 {'window': 5, 'alpha': 0.5, 'iter': 100}
5 {'window': 5, 'alpha': 0.025, 'iter': 100}
6 {'window': 15, 'alpha': 0.5, 'iter': 25}
7 {'window': 40, 'alpha': 0.5, 'iter': 100}
8 {'window': 25, 'alpha': 0.5, 'iter': 100}
time: 1h 34min 2s


In [12]:
recipe_word_embedding(FastText, 'fasttext')

1 {'window': 40, 'alpha': 0.025, 'iter': 100}
2 {'window': 15, 'alpha': 0.025, 'iter': 25}
3 {'window': 5, 'alpha': 0.5, 'iter': 25}
4 {'window': 5, 'alpha': 0.5, 'iter': 100}
5 {'window': 5, 'alpha': 0.025, 'iter': 100}
6 {'window': 15, 'alpha': 0.5, 'iter': 25}
7 {'window': 40, 'alpha': 0.5, 'iter': 100}
8 {'window': 25, 'alpha': 0.5, 'iter': 100}
time: 16h 52min 20s


In [13]:
recipe_word_embedding(glove_wrap, 'glove')

1 {'window': 40, 'alpha': 0.025, 'iter': 100}
2 {'window': 15, 'alpha': 0.025, 'iter': 25}
3 {'window': 5, 'alpha': 0.5, 'iter': 25}
4 {'window': 5, 'alpha': 0.5, 'iter': 100}
5 {'window': 5, 'alpha': 0.025, 'iter': 100}
6 {'window': 15, 'alpha': 0.5, 'iter': 25}
7 {'window': 40, 'alpha': 0.5, 'iter': 100}
8 {'window': 25, 'alpha': 0.5, 'iter': 100}
time: 31min 27s


In [14]:
recipe_word_embedding(doc2vec_wrap, 'doc2vec')

1 {'window': 40, 'alpha': 0.025, 'iter': 100}
2 {'window': 15, 'alpha': 0.025, 'iter': 25}
3 {'window': 5, 'alpha': 0.5, 'iter': 25}
4 {'window': 5, 'alpha': 0.5, 'iter': 100}
5 {'window': 5, 'alpha': 0.025, 'iter': 100}
6 {'window': 15, 'alpha': 0.5, 'iter': 25}
7 {'window': 40, 'alpha': 0.5, 'iter': 100}
8 {'window': 25, 'alpha': 0.5, 'iter': 100}
time: 1h 42min 50s


### prepare the skip thoughts vectors

In [9]:
def find_max_sentence_length():
    prev_max = 0
    for i, v in dic.items():
        if i in ls:
            recipe = v['name_UNK2_none'] + v['ingredients_UNK2_none'] + v['directions_UNK2_none']
            prev_max = max(prev_max, max([len(line) for line in recipe]))
    return prev_max
                       
find_max_sentence_length()

71

time: 39 s


In [10]:
def export(filename, max_sent, targets, overwrite = True):
    space = ' '.join(['NA']*max_sent)
    make_dir(filename)
    if os.path.isfile(filename) == True and overwrite == False:
        print('already exists'+filename)
    else:
        f = open(filename, 'w')
        for i, v in dic.items():
            recipe = []
            for tar in targets:
                recipe += v[tar]
            for line in recipe:
                f.write(' '.join(line)+'\n')
            f.write(space+'\n') 
        f.close()
        
filename = '../data/%s_%.2d.csv' % ('skip_thoughts_corpus',71)     
export(filename, 71, ['name_UNK2_none','ingredients_UNK2_none', 'directions_UNK2_none'], overwrite = True)

time: 2.09 s
