In [1]:
import pandas as pd
import dask.dataframe as dd
import json

# Read datasets
The datasets do not fit into memory, so using dask is essential.

In [2]:
# Read all Recipe1M CSV shards, drop the 'Unnamed: 0', 'id' and 'partition'
# columns, and rename 'instructions'/'url' to 'directions'/'link'.
recipes1M = (
    dd.read_csv('0-datasets/rec1M/rec1M-*.csv')
    .drop(['Unnamed: 0', 'id', 'partition'], axis=1)
    .rename(columns={"instructions": "directions", "url": "link"})
)

In [3]:
# The second source is stored as a single CSV file; it is read through dask too.
gathered_csv_path = '0-datasets/v-0.9.3-single.csv'
dataset = dd.read_csv(gathered_csv_path)

Parse the rows of the recipes1M dataset and convert them to the common format:  
1) deserialize to list  
2) map lists to remove dict-syntax  
3) serialize back to json-string

In [4]:
# Step 1: decode the JSON text stored in both columns into Python objects.
# `meta` declares the result as an object-dtype Series with the column's name.
for column in ('ingredients', 'directions'):
    recipes1M[column] = recipes1M[column].map(
        json.loads,
        meta=pd.Series([], dtype=object, name=column),
    )

In [5]:
# Step 2: replace every decoded entry by the value stored under its 'text' key,
# turning each cell into a plain list of those values.
for column in ('ingredients', 'directions'):
    recipes1M[column] = recipes1M[column].map(
        lambda entries: [entry['text'] for entry in entries],
        meta=pd.Series([], dtype=object, name=column),
    )

In [6]:
# Step 3: serialize the lists back into JSON strings.
for column in ('ingredients', 'directions'):
    recipes1M[column] = recipes1M[column].map(
        json.dumps,
        meta=pd.Series([], dtype=object, name=column),
    )

In [7]:
# Peek at the first rows to check the converted columns.
recipes1M_preview = recipes1M.head()
recipes1M_preview

Unnamed: 0,ingredients,directions,title,link
0,"[""6 ounces penne"", ""2 cups Beechers Flagship C...","[""Preheat the oven to 350 F. Butter or oil an ...",Worlds Best Mac and Cheese,http://www.epicurious.com/recipes/food/views/-...
1,"[""1 c. elbow macaroni"", ""1 c. cubed American c...","[""Cook macaroni according to package direction...",Dilly Macaroni Salad Recipe,http://cookeatshare.com/recipes/dilly-macaroni...
2,"[""8 tomatoes, quartered"", ""Kosher salt"", ""1 re...","[""Add the tomatoes to a food processor with a ...",Gazpacho,http://www.foodnetwork.com/recipes/gazpacho1.html
3,"[""2 12 cups milk"", ""1 12 cups water"", ""14 cup ...","[""Preheat oven to 350 degrees Fahrenheit."", ""S...",Crunchy Onion Potato Bake,http://www.food.com/recipe/crunchy-onion-potat...
4,"[""1 (3 ounce) package watermelon gelatin"", ""14...","[""Dissolve Jello in boiling water."", ""Allow to...",Cool 'n Easy Creamy Watermelon Pie,http://www.food.com/recipe/cool-n-easy-creamy-...


In [8]:
# Per-column summary of recipes1M; compute() evaluates the lazy result.
recipes1M_summary = recipes1M.describe()
recipes1M_summary.compute()

Unnamed: 0,ingredients,directions,title,link
unique,1021415,1008591,809994,1028642
count,1029720,1029720,1029720,1029720
top,"[""1 paper, serviette (large)"", ""1 cloth, servi...","[""Put ingredients into bread machine and press...",Banana Bread,http://www.kraftrecipes.com/recipes/surejell-a...
freq,63,249,247,13


In [9]:
# Per-column summary of the gathered dataset, for comparison with recipes1M.
dataset_summary = dataset.describe()
dataset_summary.compute()

Unnamed: 0,title,ingredients,directions,link
unique,904381,1703702,1684363,1727259
count,1727646,1727646,1727646,1727646
top,Chicken Casserole,"[""Ingredients""]","[""Mix all ingredients together.""]",https://www.allrecipes.com/recipe/215239/three...
freq,4129,55,259,4


# Some time with duplicates
Let's focus on the duplicates from recipes1M, since they seem to be the more interesting ones.

# Dataset too large

In [10]:
# Tag every row with the collection it came from, so the origin is still
# known once both frames are combined.
for frame, origin in ((dataset, "Gathered"), (recipes1M, "Recipes1M")):
    frame["source"] = origin

In [11]:
# Stack both sources into one lazy frame.
# `DataFrame.append` is deprecated and no longer available in current
# pandas/dask releases; `dd.concat` is the supported replacement. With its
# default outer join it matches columns by name, so the different column
# order of the two frames does not matter.
total_dataset = dd.concat([dataset, recipes1M])

In [12]:
# Per-column summary of the combined frame.
total_dataset_summary = total_dataset.describe()
total_dataset_summary.compute()

Unnamed: 0,title,ingredients,directions,link,source
unique,1457864,2613089,2582155,2755901,2
count,2757366,2757366,2757366,2757366,2757366
top,Chicken Casserole,"[""1 paper, serviette (large)"", ""1 cloth, servi...","[""Mix all ingredients together.""]",http://www.kraftrecipes.com/recipes/surejell-a...,Gathered
freq,4188,63,290,13,1727646


In [13]:
def _has_entries(serialized):
    """Return True when a JSON-encoded list contains at least one element."""
    return len(json.loads(serialized)) > 0


# Keep only recipes whose ingredient list and direction list are non-empty.
# Both columns hold JSON *strings*, so they must be decoded before measuring:
# `json.dumps` of a string always yields at least the two quote characters,
# which made the previous `len(...) > 0` test true for every row.
# An explicit boolean `meta` is passed so that dask does not have to probe the
# decoder with placeholder strings that are not valid JSON.
has_ingredients = total_dataset.ingredients.map(
    _has_entries, meta=pd.Series([], dtype=bool, name='ingredients'))
has_directions = total_dataset.directions.map(
    _has_entries, meta=pd.Series([], dtype=bool, name='directions'))
total_dataset = total_dataset.loc[has_ingredients & has_directions]

# Basic duplicates removal

In [14]:
# unify urls
def _strip_scheme(url):
    """Remove every 'http://' and afterwards every 'https://' from url."""
    return url.replace('http://', '').replace('https://', '')


total_dataset.link = total_dataset.link.map(_strip_scheme)

In [15]:
# Count how many rows share each link (materialized as a pandas Series).
link_duplicates = (
    total_dataset
    .groupby('link')
    .title
    .count()
    .compute()
)
# Links that occur on more than one row.
repeated_links = link_duplicates[link_duplicates > 1]
dup_links_set = set(repeated_links.index)
print('Duplicated values:', len(dup_links_set))
# Shape of the frame holding every row whose link is repeated.
shares_link = total_dataset.link.map(lambda link: link in dup_links_set)
total_dataset.loc[shares_link].compute().shape

Duplicated values: 351450


(703058, 5)

### Prepare training dataset of positive duplicate pairs

In [16]:
# Links that occur on exactly two rows: each such link yields one pair.
paired_links = link_duplicates[link_duplicates == 2]
dup_links_pair_set = set(paired_links.index)
print('Duplicated values:', len(dup_links_pair_set))
# Materialize the rows carrying those links and store them as a CSV file.
in_pair = total_dataset.link.map(lambda link: link in dup_links_pair_set)
train = total_dataset.loc[in_pair].compute()
train.to_csv('train-positive-pairs.csv', index=False)

Duplicated values: 351325


### Drop recipes with duplicated links - deduplication step 1

In [17]:
# Deduplication step 1: keep one row per link, then summarize the result.
link_column = 'link'
total_dataset_nd = total_dataset.drop_duplicates(subset=link_column)
link_dedup_summary = total_dataset_nd.describe()
link_dedup_summary.compute()

Unnamed: 0,title,ingredients,directions,link,source
unique,1363000,2332243,2344227,2405758,2
count,2405758,2405758,2405758,2405758,2405758
top,Chicken Casserole,"[""1 paper, serviette (large)"", ""1 cloth, servi...","[""Mix all ingredients together.""]",www.yummly.com/recipe/tortitas-de-papa_Potato-...,Gathered
freq,4176,63,278,1,1727259


In [18]:
# Count the rows sharing each (ingredients, directions) combination.
ing_inst_duplicates = (
    total_dataset_nd
    .groupby(['ingredients', 'directions'])
    .title
    .count()
    .compute()
)
# Each repeated combination is keyed by its two strings joined together.
repeated_pairs = ing_inst_duplicates[ing_inst_duplicates > 1].index
ing_inst_dup_set = {ingredients + directions for ingredients, directions in repeated_pairs}
print('Duplicated values:', len(ing_inst_dup_set))
# Shape of the frame holding every row whose combination is repeated.
combined_key = total_dataset_nd.ingredients + total_dataset_nd.directions
total_dataset_nd.loc[combined_key.map(lambda key: key in ing_inst_dup_set)].compute().shape

Duplicated values: 21196


(43558, 5)

### Drop recipes with duplicated instructions and ingredients - deduplication step 2

In [19]:
# Deduplication step 2: keep one row per (ingredients, directions)
# combination, then summarize what is left.
content_columns = ('ingredients', 'directions')
total_dataset_nd = total_dataset_nd.drop_duplicates(subset=content_columns)
content_dedup_summary = total_dataset_nd.describe()
content_dedup_summary.compute()

Unnamed: 0,title,ingredients,directions,link,source
unique,1361969,2332243,2344227,2383396,2
count,2383396,2383396,2383396,2383396,2383396
top,Chicken Casserole,"[""1 paper, serviette (large)"", ""1 cloth, servi...","[""Mix all ingredients together.""]",www.yummly.com/recipe/tortitas-de-papa_Potato-...,Gathered
freq,4175,63,277,1,1710935


## Store intermediate result

In [None]:
# Write the frame without literal duplicates to CSV; the '*' in the
# file name is filled in by dask for each written file.
intermediate_csv_pattern = '0-datasets/v-1.0.0-beta-h/dataset-*.csv'
total_dataset_nd.to_csv(intermediate_csv_pattern, index=False)