# Playing with the dataset


In [49]:
# Importing Python libraries
import pandas
import sys
import re
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Importing our own files
sys.path.append('../source')
import unpack

%matplotlib inline

## Unpacking the training data

In [6]:
train = unpack.unpack('train.json')

## Putting data into pandas objects

In [7]:
ingredients = []
for dish in train:
    ingredients.extend(dish['ingredients'])

print('Number of ingredients: ', len(ingredients))

Number of ingredients:  428275


In [8]:
unique = set(ingredients)
print('Unique ingredients: ', len(unique))

Unique ingredients:  6714


In [20]:
# put ingredients into pandas Series
series = pandas.Series(ingredients)

## Reducing the number of unique values

In [47]:
def apply_and_count(func, series):
    return series.apply(func).value_counts().count()

def reduction(original, new):
    return round(100 * (original - new) / original, 2)

def effectiveness(func, series=series, total=6714):
    new_total = apply_and_count(func, series)
    print('New total: {} (reduced {}, {}%)'.format(new_total, total - new_total, reduction(total, new_total)))

First, we can try converting all words to lower case.

In [48]:
effectiveness(str.lower)

New total: 6703 (reduced 11, 0.16%)


This is not especially useful. Let's try removing trademarks, registered symbols and copyrights.

In [52]:
effectiveness(lambda s: re.sub('(?:\w+)(®|™|©)', '', s).strip())

New total: 6714 (reduced 0, 0.0%)


This is not a good approach!

Let's try removing all measurements and numbers.

In [57]:
remove = [
    "cup", "cups", "teaspoon", "teaspoons", "tablespoon", "tablespoons",
    "ounce", "ounces", "lb", "lbs", "tbs", "tsp", "oz ", "oz." "handful", 
    "inch", "can"
]

def drop_units(s):
    for r in remove:
        s = s.replace(r, '')
    return re.sub('\d+/?\d*', '', s).strip()

effectiveness(drop_units)

New total: 6713 (reduced 1, 0.01%)


This approach is also not especially great.

We will probably have to just drop ingredients that fall beneath a certain threshold; say 20 incidences?

In [None]:
counts = series.value_counts()

thresholds = {}
for n in [10, 20, 50, 100, 250, 500, 1000]:
    thresholds[n] = counts[counts > n].count()

