# Playing with the dataset


In [1]:
# Importing Python libraries
import pandas
import sys
import re
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Importing our own files
sys.path.append('../source')
import unpack

%matplotlib inline

## Unpacking the training data

In [2]:
# Load the training set through the project-local `unpack` module imported above.
# Later cells iterate over `train` and read dish['ingredients'] from each element.
# NOTE(review): 'train.json' is a relative path — confirm how unpack.unpack resolves it.
train = unpack.unpack('train.json')

## Putting data into pandas objects

In [3]:
# Flatten the per-dish ingredient lists into a single list, keeping duplicates
# so that every occurrence of an ingredient is counted.
ingredients = [name for dish in train for name in dish['ingredients']]

print('Number of ingredients: ', len(ingredients))

Number of ingredients:  428275


In [4]:
# Collapse the flat ingredient list into a set to obtain the distinct ingredient names.
unique = set(ingredients)
print('Unique ingredients: ', len(unique))

Unique ingredients:  6714


In [24]:
# Wrap the flat ingredient list in a pandas Series so it can be aggregated.
series = pandas.Series(ingredients)
# Occurrences per distinct ingredient (value_counts sorts most frequent first).
count = series.value_counts()
# Display only ingredients that occur strictly more than 50 and fewer than 100 times.
count[count.gt(50) & count.lt(100)]

gruyere cheese                     99
boiling potatoes                   99
old bay seasoning                  98
seasoned bread crumbs              98
broth                              97
sunflower oil                      97
fresh mozzarella                   97
pears                              97
fresh shiitake mushrooms           96
dill                               96
canned low sodium chicken broth    96
nori                               96
greens                             96
red kidney beans                   95
grated orange peel                 95
dried currants                     95
clams                              94
chicken drumsticks                 94
cognac                             94
adobo sauce                        94
whole wheat tortillas              93
corn husks                         93
orange bell pepper                 93
blanched almonds                   93
rice vermicelli                    93
vanilla ice cream                  93
leaves      

## Reducing the number of unique values

In [6]:
def apply_and_count(func, series):
    """Count the distinct values left in `series` after applying `func` to each element.

    Args:
        func: callable invoked on every element of `series`.
        series: pandas Series holding the values to transform.

    Returns:
        The number of distinct non-null values among the transformed elements.
    """
    transformed = series.apply(func)
    return transformed.nunique()

def reduction(original, new):
    """Return the percentage by which `new` is smaller than `original`.

    Args:
        original: the starting count (used as the denominator).
        new: the count after some transformation.

    Returns:
        The relative decrease as a percentage, rounded to two decimal places.

    Raises:
        ZeroDivisionError: if `original` is 0.
    """
    removed = original - new
    percent = 100 * removed / original
    return round(percent, 2)

def effectiveness(func, series=series, total=6714):
    """Print how many distinct values remain after normalising `series` with `func`.

    Args:
        func: callable applied to every element of `series`.
        series: pandas Series to normalise; defaults to the notebook-level
            `series` as it exists when this cell is run.
        total: baseline number of distinct values to compare against. The
            default of 6714 matches the unique-ingredient count printed
            earlier in the notebook; pass it explicitly for other data.

    Returns:
        None. The new total, the absolute reduction and the percentage
        reduction are printed.
    """
    remaining = apply_and_count(func, series)
    removed = total - remaining
    percent = reduction(total, remaining)
    print('New total: {} (reduced {}, {}%)'.format(remaining, removed, percent))

First, we can try converting all words to lower case.

In [7]:
# Lower-case every ingredient name and report how many distinct names remain.
effectiveness(str.lower)

New total: 6703 (reduced 11, 0.16%)


This is not especially useful. Let's try removing trademark (™), registered-trademark (®) and copyright (©) symbols.

In [8]:
# Strip trademark (™), registered (®) and copyright (©) marks, then report the effect.
# The pattern is a raw string so that `\w` reaches the regex engine untouched; in a
# plain string literal `\w` is an invalid escape sequence, which newer Python
# versions warn about.
# NOTE(review): the leading `\w+` also deletes the word characters directly before
# the symbol (e.g. 'brand® sauce' becomes 'sauce'), not just the symbol itself —
# confirm that dropping the preceding word is intended.
effectiveness(lambda s: re.sub(r'(?:\w+)(®|™|©)', '', s).strip())

New total: 6714 (reduced 0, 0.0%)


This is not a good approach: the number of unique ingredients is unchanged.

Let's try removing all measurements and numbers.

In [9]:
# Unit-of-measure words to strip from ingredient names.
# Every entry needs its own trailing comma: adjacent string literals are silently
# concatenated by Python ("oz." "handful" would become the single entry "oz.handful").
remove = [
    "cup", "cups", "teaspoon", "teaspoons", "tablespoon", "tablespoons",
    "ounce", "ounces", "lb", "lbs", "tbs", "tsp", "oz ", "oz.", "handful",
    "inch", "can"
]

# Normalise the entries (drop surrounding spaces/periods, de-duplicate, keep order)
# and compile them into one pattern that only matches whole words, optionally
# followed by a period. Whole-word matching keeps words such as "pecans" or
# "american" intact, which plain substring replacement would mangle.
_unit_words = list(dict.fromkeys(u.strip(' .') for u in remove if u.strip(' .')))
_UNIT_RE = re.compile(r'\b(?:' + '|'.join(map(re.escape, _unit_words)) + r')\b\.?')
# Integers and simple fractions such as "2" or "1/2".
_NUMBER_RE = re.compile(r'\d+/?\d*')


def drop_units(s):
    """Remove measurement units and numeric quantities from an ingredient name.

    Args:
        s: the ingredient name as a string.

    Returns:
        `s` with every whole-word unit from `remove` and every number or simple
        fraction deleted, runs of whitespace collapsed to a single space, and
        leading/trailing whitespace stripped.
    """
    s = _UNIT_RE.sub('', s)
    s = _NUMBER_RE.sub('', s)
    # Deleting a token from the middle leaves doubled spaces; collapse them so
    # otherwise-identical names compare equal.
    return ' '.join(s.split())

# Report how many distinct ingredient names remain after applying drop_units.
effectiveness(drop_units)

New total: 6713 (reduced 1, 0.01%)


This approach is also not especially great.

We will probably have to just drop ingredients that fall below a certain frequency threshold; say, 20 occurrences?

In [10]:
# Occurrences of each distinct ingredient across all dishes.
counts = series.value_counts()

# For each candidate cut-off n, record how many distinct ingredients occur
# strictly more than n times, i.e. how many would survive that threshold.
thresholds = {
    n: counts.loc[counts.gt(n)].count()
    for n in (10, 20, 50, 100, 250, 500, 1000)
}

