In [101]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import ks_2samp

In [68]:
recipes = pd.read_csv("./data/RAW_recipes.csv")
interactions = pd.read_csv("./data/RAW_interactions.csv")

In [69]:
merged_df = recipes.merge(interactions,left_on="id",right_on="recipe_id",how="left")
subset_merged = merged_df[["id","rating"]]
subset_merged.loc[subset_merged["rating"] == 0,"rating"] = np.nan
right = subset_merged.groupby("id")["rating"].mean()
recipes = recipes.merge(right,on="id")

In [70]:
# Cleaning
recipes["tags"] = recipes["tags"].str.replace("\[|\]","",regex=True).str.split(",",expand=True) 
recipes["steps"] = recipes["steps"].str.replace("\[|\]","",regex=True).str.split(",",expand=True) 
recipes["ingredients"] = recipes["ingredients"].str.replace("\[|\]","",regex=True).str.split(",",expand=True) 
# recipes["nutrition"] = recipes["nutrition"].str[1:-1].str.split(', ')
recipes["submitted"] = pd.to_datetime(recipes["submitted"])

raw_recipes = recipes
# (calories (#), total fat (PDV), sugar (PDV) , sodium (PDV) , protein (PDV) , saturated fat (PDV) , and carbohydrates (PDV))
# break up nutrition into different measurements
raw_recipes[
    ['calories', 'total fat', 'sugar', 'sodium', 'protein', 'saturated fat', 'carbohydrates']
] = raw_recipes['nutrition'].str.replace("\[|\]","",regex=True).str.split(",",expand=True) 
raw_recipes[
    ['calories', 'total fat', 'sugar', 'sodium', 'protein', 'saturated fat', 'carbohydrates']
] =  raw_recipes[
    ['calories', 'total fat', 'sugar', 'sodium', 'protein', 'saturated fat', 'carbohydrates']
].astype(float)


In [100]:
raw_recipes

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,rating,calories,total fat,sugar,sodium,protein,saturated fat,carbohydrates
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,5.000000,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,4.666667,173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,4.000000,269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,4.500000,368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,5.000000,352.9,1.0,337.0,23.0,3.0,0.0,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231632,zydeco soup,486161,60,227978,2012-08-29,"['ham', '60-minutes-or-less', 'time-to-make', ...","[415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]",7,"['heat oil in a 4-quart dutch oven', 'add cele...",this is a delicious soup that i originally fou...,"['celery', 'onion', 'green sweet pepper', 'gar...",22,5.000000,415.2,26.0,34.0,26.0,44.0,21.0,15.0
231633,zydeco spice mix,493372,5,1500678,2013-01-09,"['15-minutes-or-less', 'time-to-make', 'course...","[14.8, 0.0, 2.0, 58.0, 1.0, 0.0, 1.0]",1,['mix all ingredients together thoroughly'],this spice mix will make your taste buds dance!,"['paprika', 'salt', 'garlic powder', 'onion po...",13,5.000000,14.8,0.0,2.0,58.0,1.0,0.0,1.0
231634,zydeco ya ya deviled eggs,308080,40,37779,2008-06-07,"['60-minutes-or-less', 'time-to-make', 'course...","[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]",7,"['in a bowl , combine the mashed yolks and may...","deviled eggs, cajun-style","['hard-cooked eggs', 'mayonnaise', 'dijon must...",8,5.000000,59.2,6.0,2.0,3.0,6.0,5.0,0.0
231635,cookies by design cookies on a stick,298512,29,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",9,['place melted butter in a large mixing bowl a...,"i've heard of the 'cookies by design' company,...","['butter', 'eagle brand condensed milk', 'ligh...",10,1.000000,188.0,11.0,57.0,11.0,7.0,21.0,9.0


In [102]:
def assess_missingness(data,focus_col,compare_col,stat="tvd",n_repetitions=1000):
    def tvd(data,missing_col,compare_col):
        pivoted = (
            shuffled
            .pivot_table(index=missing_col, columns=compare_col, aggfunc='size')
            .apply(lambda x: x / x.sum(), axis=1)
        )

        tvd = pivoted.diff().iloc[-1].abs().sum() / 2
        return tvd
    
    def ks_test(data,missing_col,compare_col):
        missing_data = data.loc[data[missing_col],compare_col]
        nonmissing_data = data.loc[~data[missing_col],compare_col]
        return ks_2samp(missing_data, nonmissing_data).pvalue
    
    shuffled = data.copy()
    missing_col = focus_col+'_missing'
    shuffled[missing_col] = shuffled[focus_col].isna()
    
    
    if stat == "ks":
        return ks_test(shuffled,missing_col,compare_col)
    
    if stat == "tvd":
        obs_tvd = tvd(data,missing_col,compare_col)
        tvds = []
        for _ in tqdm(range(n_repetitions)):

            # Shuffling genders and assigning back to the DataFrame
            shuffled[compare_col] = np.random.permutation(shuffled[compare_col])
            tvds.append(tvd(shuffled,missing_col,compare_col))
        pval = np.mean(np.array(tvds) >= obs_tvd)
        return pval

In [103]:
raw_recipes.columns

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients', 'rating', 'calories', 'total fat', 'sugar', 'sodium',
       'protein', 'saturated fat', 'carbohydrates'],
      dtype='object')

In [106]:
for col in raw_recipes.columns:
    if col != "rating" or col != "description":
        if raw_recipes[col].dtype == int or raw_recipes[col].dtype == float:
            print(col)
            print(assess_missingness(raw_recipes,"rating",col,stat="ks"))
            print(assess_missingness(raw_recipes,"description",col,stat="ks"))

id
4.552090006368089e-114
0.0
minutes
7.27677406374565e-24
2.3876263892488294e-75
contributor_id
8.99467336728369e-67
0.0
n_steps
7.2043963365857e-20
0.0002772193690374633
n_ingredients
0.007190498647391931
0.0009048506515904445
rating
0.0
1.5526464272308683e-35
calories
1.0226351856229951e-12
1.9039037380333315e-09
total fat
5.778009468681305e-11
1.291992921202802e-07
sugar
8.570102648362806e-11
4.4014661230176817e-07
sodium
0.013175027244109691
0.05919307931286144
protein
0.08853070963231202
0.018687566657613914
saturated fat
2.0581464338624003e-09
3.5677080863595074e-05
carbohydrates
1.743750392532296e-11
6.737880101242013e-10
