In [1]:
# Import all the packages that we need in this project
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.spatial import distance

In [2]:
#Import and adjust the Countvectorizer and TfidfVectorizer function
cv = CountVectorizer(analyzer='word',stop_words='english',ngram_range=(1,3),max_features=400)
tfidf = TfidfTransformer()

In [3]:
tf = TfidfVectorizer(analyzer='word',stop_words='english',ngram_range=(1,3),max_features=400)

In [4]:
#Import our training dataset and testing dataset
train = pd.read_csv('core-data-train_rating.csv')
test = pd.read_csv('test_10k_2.csv')

In [5]:
#Import the recipe csv file and try to make a connection with the train dataset and test dataset we already imported
#Then, we merge them based on the recipe id and make them into two new data set. 
recipe = pd.read_csv('core-data_recipe.csv')
recipe = recipe.drop('image_url',axis=1)

In [6]:
# Define a funcation to extract cooking time feature, and define as hours 
def cook_time(x):
    try:
        a = re.findall(r'\w+',x)[re.findall(r'\w+',x).index('nCook')+1:re.findall(r'\w+',x).index('nCook')+3]
        a1 = ''.join(a)[1:]
        if a1[-1] == 'm':
            a2 =int(a1[:-1])/60
            return(round(a2,2))
        else:
            return(int(a1[:-1])) 
    except ValueError:
        return (0)

In [7]:
# Define a funcation to extract prepare time feature,and define as hours 
def prep_time(x):
    try:
        b = re.findall(r'\w+',x)[re.findall(r'\w+',x).index('Prep')+1:re.findall(r'\w+',x).index('Prep')+3]
        b1 = (''.join(b)[1:])
        if b1[-1] == 'm':
            b2 =int(b1[:-1])/60
            return(round(b2,2))
        else:
            return(int(b1[:-1]))
    except IndexError:
        return(0)
    except ValueError:
        return (0)

In [8]:
# Define a funcation to extract ready time feature,and define as hours 
def ready_time(x):
    try:
        c = re.findall(r'\w+',x)[re.findall(r'\w+',x).index('nReady')+2:re.findall(r'\w+',x).index('nReady')+4]
        c1 = (''.join(c)[1:])
        if c1[-1] == 'm':
            c2 =int(c1[:-1])/60
            return(round(c2,2))
        else:
            return(int(c1[:-1]))
    except ValueError:
        return (0)

In [9]:
# Define a funcation to extract sugar feature
def sugar(x):
    try:
        d = re.search(r'Sugars\,\s\w+\:\s\w+',x)
        if d is not None: 
            sugar = d.group(0)
            sugar = sugar.replace("Sugars, amount: ", "")
            return int(sugar[-1])
        else:
            return 0 
    except ValueError:
        return (0)

In [10]:
# Define a funcation to extract calories feature
def calories(x):
    try:
        e = re.search(r'Calories\,\s\w+\:\s\w+',x)
        if e is not None:
            calories = e.group(0)
            calories = calories.replace("Calories, amount: ", "")
            return int(calories[-1])
        else: 
            return 0
    except ValueError:
        return (0)

In [11]:
# Define a funcation to extract fat feature
def fat(x):
    try:
        f = re.search(r'Fat\,\s\w+\:\s\w+',x)
        if f is not None:
            fat = f.group(0)
            fat = fat.replace("Fat, amount: ", "")
            return int(fat[-1])
        else:
            return 0
    except ValueError:
        return (0)

In [12]:
# Define a funcation to extract calfromfat feature
def calfromfat(x):
    try:
        g = re.search(r'Calories from Fat\,\s\w+\:\s\w+',x)
        if g is not None:
            calfromfat = g.group(0)
            calfromfat = calfromfat.replace("Calories from Fat, amount: ", "")
            return int(calfromfat[-1])
        else: 
            return 0
    except ValueError:
        return (0)

In [13]:
# Define a funcation to extract cholesterol feature
def cholesterol(x):
    try:
        h = re.search(r'Cholesterol\,\s\w+\:\s\w+',x)
        if h is not None:
            cholesterol = h.group(0)
            cholesterol = cholesterol.replace("Cholesterol, amount: ", "")
            return int(cholesterol[-1])
        else: 
            return 0
    except ValueError:
        return (0)

In [14]:
# Define a funcation to extract protein feature
def protein(x):
    try:
        j = re.search(r'Protein\,\s\w+\:\s\w+',x)
        if j is not None:
            protein = j.group(0)
            protein = protein.replace("Protein, amount: ", "")
            return int(protein[-1])
        else:
            return 0 
    except ValueError:
        return (0)

In [15]:

train1 = pd.merge(train,recipe,on='recipe_id')
train1['cook'] = train1['cooking_directions'].apply(cook_time)
train1['prep'] = train1['cooking_directions'].apply(prep_time)
train1['ready'] = train1['cooking_directions'].apply(ready_time)
train1['ingredients'] = train1['ingredients'].apply(lambda a:a.replace('^',' '))
train1['nutritions'] = train1['nutritions'].apply(lambda a:a.replace("'",""))

In [16]:
train1['Sugar'] = train1['nutritions'].apply(sugar)
train1['Calories'] = train1['nutritions'].apply(calories)
train1['Fat'] = train1['nutritions'].apply(fat)
train1['Calfromfat'] = train1['nutritions'].apply(calfromfat)
train1['Cholesterol'] = train1['nutritions'].apply(cholesterol)
train1['Protein'] = train1['nutritions'].apply(protein)

In [17]:
train1['Chicken'] = [i.count("Chicken") for i in train1['recipe_name']]

In [18]:
train1['Chocolate'] = [i.count("Chocolate") for i in train1['recipe_name']]
train1['Soup'] = [i.count("Soup") for i in train1['recipe_name']]
train1['Cookies'] = [i.count("Cookies") for i in train1['recipe_name']]
train1['Salad'] = [i.count("Salad") for i in train1['recipe_name']]
train1['Cake'] = [i.count("Cake") for i in train1['recipe_name']]
train1['Bread'] = [i.count("Bread") for i in train1['recipe_name']]
train1['Easy'] = [i.count("Easy") for i in train1['recipe_name']]

In [19]:
train1['salt'] = [i.count("salt") for i in train1['ingredients']]
train1['sugar'] = [i.count("sugar") for i in train1['ingredients']]
train1['pepper'] = [i.count("pepper") for i in train1['ingredients']]
train1['cheese'] = [i.count("cheese") for i in train1['ingredients']]
train1['flour'] = [i.count("flour") for i in train1['ingredients']]
train1['garlic'] = [i.count("garlic") for i in train1['ingredients']]

In [20]:
train1.tail()

Unnamed: 0,user_id,recipe_id,rating,dateLastModified,recipe_name,ingredients,cooking_directions,nutritions,cook,prep,...,Salad,Cake,Bread,Easy,salt,sugar,pepper,cheese,flour,garlic
283435,2679742,234369,5,2014-10-11T15:49:22.99\n,Baghdad Beef Stew,vegetable oil cubed beef stew meat figs draine...,{'directions': u'Prep\n20 m\nCook\n2 h 50 m\nR...,"{uniacin: {uhasCompleteData: False, uname: uNi...",2.0,0.33,...,0,0,0,0,1,0,1,0,0,0
283436,1223518,29907,5,2014-11-08T19:49:07.457\n,Sausage Dressing,fresh cranberries sausage bread cubes seasoned...,{'directions': u'Prep\n30 m\nCook\n25 m\nReady...,"{uniacin: {uhasCompleteData: False, uname: uNi...",0.42,0.5,...,0,0,0,0,1,0,1,0,0,1
283437,6582644,12886,5,2013-02-28T15:49:26.757\n,Cream of Broccoli Cheese Soup I,chopped onion margarine chicken broth wide egg...,{'directions': u'Prep\n5 m\nCook\n25 m\nReady ...,"{uniacin: {uhasCompleteData: False, uname: uNi...",0.42,0.08,...,0,0,0,0,0,0,0,1,0,1
283438,6582644,90027,5,2013-05-02T07:43:10.08\n,Tamra's Microwave Tilapia,tilapia fillets butter garlic clove chopped fr...,{'directions': u'Prep\n10 m\nCook\n4 m\nReady ...,"{uniacin: {uhasCompleteData: False, uname: uNi...",0.07,0.17,...,0,0,0,0,0,0,0,0,0,1
283439,2430357,245659,4,2017-03-08T00:15:43.077\n,Nacho-Flavored Roasted Chickpeas,chickpeas drained and rinsed nutritional yeast...,{'directions': u'Prep\n5 m\nCook\n30 m\nReady ...,"{uniacin: {uhasCompleteData: True, uname: uNia...",0.5,0.08,...,0,0,0,0,1,0,0,0,0,0


In [21]:
t = cv.fit(train1['ingredients']).transform(train1['ingredients'])
T= tfidf.fit(t).transform(t)

In [22]:
train1['tfidf'] = T
train1.head()

Unnamed: 0,user_id,recipe_id,rating,dateLastModified,recipe_name,ingredients,cooking_directions,nutritions,cook,prep,...,Cake,Bread,Easy,salt,sugar,pepper,cheese,flour,garlic,tfidf
0,5215572,55090,5,2015-01-09T18:05:22.95\n,Creamy Vanilla Fruit Dip,cream cheese confectioners' sugar vanilla extr...,{'directions': u'Prep\n10 m\nReady In\n10 m\nC...,"{uniacin: {uhasCompleteData: True, uname: uNia...",0.0,0.17,...,0,0,0,0,1,0,1,0,0,"(0, 369)\t0.22419008488206987\n (0, 368)\t0..."
1,41321,55090,5,2017-12-02T18:14:16.66\n,Creamy Vanilla Fruit Dip,cream cheese confectioners' sugar vanilla extr...,{'directions': u'Prep\n10 m\nReady In\n10 m\nC...,"{uniacin: {uhasCompleteData: True, uname: uNia...",0.0,0.17,...,0,0,0,0,1,0,1,0,0,"(0, 369)\t0.22419008488206987\n (0, 368)\t0..."
2,5215572,26317,4,2016-12-04T17:50:35.777\n,Chicken Pot Pie IX,skinless boneless chicken breast halves sliced...,{'directions': u'Prep\n20 m\nCook\n50 m\nReady...,"{uniacin: {uhasCompleteData: False, uname: uNi...",0.83,0.33,...,0,0,0,1,0,1,0,1,0,"(0, 369)\t0.22419008488206987\n (0, 368)\t0..."
3,3622615,26317,5,2013-06-14T17:39:06.547\n,Chicken Pot Pie IX,skinless boneless chicken breast halves sliced...,{'directions': u'Prep\n20 m\nCook\n50 m\nReady...,"{uniacin: {uhasCompleteData: False, uname: uNi...",0.83,0.33,...,0,0,0,1,0,1,0,1,0,"(0, 369)\t0.22419008488206987\n (0, 368)\t0..."
4,3276442,26317,5,2015-03-19T15:51:12.907\n,Chicken Pot Pie IX,skinless boneless chicken breast halves sliced...,{'directions': u'Prep\n20 m\nCook\n50 m\nReady...,"{uniacin: {uhasCompleteData: False, uname: uNi...",0.83,0.33,...,0,0,0,1,0,1,0,1,0,"(0, 369)\t0.22419008488206987\n (0, 368)\t0..."


In [23]:
tra = train1[['user_id','recipe_id','rating','tfidf']]

In [24]:
import string
list = []
for i in train1['ingredients']:
    l = i.split(" ")
    list.append(l)    

In [25]:
flat_list = []
for sublist in list:
    for item in sublist:
        flat_list.append(item)

In [26]:
from collections import Counter
c = Counter(flat_list)
c.most_common(30)

[('salt', 189491),
 ('ground', 178523),
 ('pepper', 175980),
 ('sugar', 147545),
 ('chopped', 124059),
 ('white', 123508),
 ('butter', 118077),
 ('oil', 107670),
 ('garlic', 102063),
 ('flour', 97361),
 ('cheese', 96141),
 ('onion', 96023),
 ('black', 90596),
 ('powder', 87158),
 ('fresh', 85392),
 ('all-purpose', 84203),
 ('and', 80782),
 ('chicken', 71261),
 ('sauce', 68919),
 ('to', 65719),
 ('water', 63914),
 ('dried', 63605),
 ('taste', 63588),
 ('baking', 62012),
 ('milk', 60022),
 ('cream', 59869),
 ('eggs', 53850),
 ('olive', 53174),
 ('extract', 47471),
 ('vanilla', 47132)]

In [27]:
import string
list1 = []
for i in train1['recipe_name']:
    l1 = i.split(" ")
    list1.append(l1) 

In [28]:
flat_list1 = []
for sublist in list1:
    for item in sublist:
        flat_list1.append(item)

In [29]:
from collections import Counter
c1 = Counter(flat_list1)
c1.most_common(20)

[('Chicken', 36288),
 ('and', 31298),
 ('with', 15414),
 ('Soup', 14188),
 ('Salad', 12647),
 ('Cooker', 10830),
 ('Slow', 10794),
 ('Bread', 10731),
 ('Chocolate', 10486),
 ('Cake', 10383),
 ('I', 10077),
 ('Cookies', 10076),
 ('Easy', 9947),
 ('Pie', 9423),
 ('II', 9365),
 ('Pork', 9337),
 ('Baked', 8840),
 ('Best', 8729),
 ('Sauce', 8372),
 ('Casserole', 8304)]

In [30]:
train2 = train1[['user_id','recipe_id','rating','ingredients','cook','prep','ready','Sugar','Calories',
                'Fat','Calfromfat','Cholesterol','Protein', 'Chicken','Chocolate','Soup','Cookies','Salad',
                 'Cake','Bread','Easy','salt','sugar','pepper','cheese','flour','garlic']]

In [31]:
test1 = pd.merge(test,recipe,on='recipe_id')
test1['cook'] = test1['cooking_directions'].apply(cook_time)
test1['prep'] = test1['cooking_directions'].apply(prep_time)
test1['ready'] = test1['cooking_directions'].apply(ready_time)
test1['ingredients'] = test1['ingredients'].apply(lambda a:a.replace('^',' '))
test1['nutritions'] = test1['nutritions'].apply(lambda a:a.replace("'",""))

In [32]:
test1['Sugar'] = test1['nutritions'].apply(sugar)
test1['Calories'] = test1['nutritions'].apply(calories)
test1['Fat'] = test1['nutritions'].apply(fat)
test1['Calfromfat'] = test1['nutritions'].apply(calfromfat)
test1['Cholesterol'] = test1['nutritions'].apply(cholesterol)
test1['Protein'] = test1['nutritions'].apply(protein)

In [33]:
test1['Chicken'] = [i.count("Chicken") for i in test1['recipe_name']]
test1['Chocolate'] = [i.count("Chocolate") for i in test1['recipe_name']]
test1['Soup'] = [i.count("Soup") for i in test1['recipe_name']]
test1['Cookies'] = [i.count("Cookies") for i in test1['recipe_name']]
test1['Salad'] = [i.count("Salad") for i in test1['recipe_name']]
test1['Cake'] = [i.count("Cake") for i in test1['recipe_name']]
test1['Bread'] = [i.count("Bread") for i in test1['recipe_name']]
test1['Easy'] = [i.count("Easy") for i in test1['recipe_name']]

In [34]:
test1['salt'] = [i.count("salt") for i in test1['ingredients']]
test1['sugar'] = [i.count("sugar") for i in test1['ingredients']]
test1['pepper'] = [i.count("pepper") for i in test1['ingredients']]
test1['cheese'] = [i.count("cheese") for i in test1['ingredients']]
test1['flour'] = [i.count("flour") for i in test1['ingredients']]
test1['garlic'] = [i.count("garlic") for i in test1['ingredients']]

In [35]:
s = cv.fit(test1['ingredients']).transform(test1['ingredients'])
S= tfidf.fit(s).transform(s)

In [36]:
test1['tfidf'] = S
test1['rating'] = 5
test1.head()

Unnamed: 0,user_id,recipe_id,recipe_name,ingredients,cooking_directions,nutritions,cook,prep,ready,Sugar,...,Bread,Easy,salt,sugar,pepper,cheese,flour,garlic,tfidf,rating
0,2554211,87507,Corn and Crab Bisque,butter onion chicken broth garlic bay leaves c...,{'directions': u'Prep\n15 m\nCook\n30 m\nReady...,"{uniacin: {uhasCompleteData: False, uname: uNi...",0.5,0.25,0.75,4,...,0,0,1,0,2,0,1,1,"(0, 350)\t0.14842803595677656\n (0, 305)\t0...",5
1,2942251,84214,Blender Hollandaise Sauce,egg yolks Dijon mustard lemon juice hot pepper...,{'directions': u'Prep\n5 m\nReady In\n5 m\nIn ...,"{uniacin: {uhasCompleteData: False, uname: uNi...",0.0,0.08,0.08,0,...,0,0,0,0,1,0,0,0,"(0, 350)\t0.14842803595677656\n (0, 305)\t0...",5
2,5670169,84214,Blender Hollandaise Sauce,egg yolks Dijon mustard lemon juice hot pepper...,{'directions': u'Prep\n5 m\nReady In\n5 m\nIn ...,"{uniacin: {uhasCompleteData: False, uname: uNi...",0.0,0.08,0.08,0,...,0,0,0,0,1,0,0,0,"(0, 350)\t0.14842803595677656\n (0, 305)\t0...",5
3,825478,6817,New York Rye Bread,warm water vegetable oil honey salt caraway se...,"{'directions': u""Prep\n5 m\nCook\n3 h\nReady I...","{uniacin: {uhasCompleteData: False, uname: uNi...",3.0,0.08,3.0,4,...,1,0,1,0,0,0,2,0,"(0, 350)\t0.14842803595677656\n (0, 305)\t0...",5
4,1504758,47374,Easy Chicken Enchiladas,cream cheese salsa chopped cooked chicken brea...,{'directions': u'Prep\n20 m\nCook\n30 m\nReady...,"{uniacin: {uhasCompleteData: False, uname: uNi...",0.5,0.33,0.83,2,...,0,1,0,0,0,2,1,0,"(0, 350)\t0.14842803595677656\n (0, 305)\t0...",5


In [37]:
tes = test1[['user_id','recipe_id','rating','tfidf']]

In [38]:
test2 = test1[['user_id','recipe_id','rating','ingredients','cook','prep','ready','Sugar','Calories',
                'Fat','Calfromfat','Cholesterol','Protein', 'Chicken','Chocolate','Soup','Cookies','Salad',
                 'Cake','Bread','Easy','salt','sugar','pepper','cheese','flour','garlic']]

In [39]:
test2.head()

Unnamed: 0,user_id,recipe_id,rating,ingredients,cook,prep,ready,Sugar,Calories,Fat,...,Salad,Cake,Bread,Easy,salt,sugar,pepper,cheese,flour,garlic
0,2554211,87507,5,butter onion chicken broth garlic bay leaves c...,0.5,0.25,0.75,4,8,6,...,0,0,0,0,1,0,2,0,1,1
1,2942251,84214,5,egg yolks Dijon mustard lemon juice hot pepper...,0.0,0.08,0.08,0,2,7,...,0,0,0,0,0,0,1,0,0,0
2,5670169,84214,5,egg yolks Dijon mustard lemon juice hot pepper...,0.0,0.08,0.08,0,2,7,...,0,0,0,0,0,0,1,0,0,0
3,825478,6817,5,warm water vegetable oil honey salt caraway se...,3.0,0.08,3.0,4,6,2,...,0,0,1,0,1,0,0,0,2,0
4,1504758,47374,5,cream cheese salsa chopped cooked chicken brea...,0.5,0.33,0.83,2,4,4,...,0,0,0,1,0,0,0,2,1,0


In [40]:
l = []
for i in range(len(test2)):
    a = (test2.cook.iloc[i], test2.prep.iloc[i], test2.ready.iloc[i],test2['Sugar'].iloc[i],test2['Calories'].iloc[i],
        test2['Fat'].iloc[i],test2['Calfromfat'].iloc[i],test2['Cholesterol'].iloc[i],test2['Protein'].iloc[i],
         test2['Chicken'].iloc[i],test2['Chocolate'].iloc[i],test2['Soup'].iloc[i],test2['Cookies'].iloc[i],
         test2['Salad'].iloc[i],test2['Cake'].iloc[i],test2['Bread'].iloc[i],test2['Easy'].iloc[i],
         test2['salt'].iloc[i],test2['sugar'].iloc[i],test2['pepper'].iloc[i],test2['cheese'].iloc[i],
         test2['flour'].iloc[i],test2['garlic'].iloc[i] )
    l.append(a)

In [41]:
tr = []
for i in range(len(train2)):
    a = (train2.cook.iloc[i], train2.prep.iloc[i], train2.ready.iloc[i],train2['Sugar'].iloc[i],train2['Calories'].iloc[i],
        train2['Fat'].iloc[i],train2['Calfromfat'].iloc[i],train2['Cholesterol'].iloc[i],train2['Protein'].iloc[i],
         train2['Chicken'].iloc[i],train2['Chocolate'].iloc[i],train2['Soup'].iloc[i],train2['Cookies'].iloc[i],
         train2['Salad'].iloc[i],train2['Cake'].iloc[i],train2['Bread'].iloc[i],train2['Easy'].iloc[i], 
         train2['salt'].iloc[i],train2['sugar'].iloc[i],train2['pepper'].iloc[i],train2['cheese'].iloc[i],
         train2['flour'].iloc[i],train2['garlic'].iloc[i])
    tr.append(a)

In [42]:
test2['Array'] = l
test2['Array'] = test2['Array'].apply(np.array)
test2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,user_id,recipe_id,rating,ingredients,cook,prep,ready,Sugar,Calories,Fat,...,Cake,Bread,Easy,salt,sugar,pepper,cheese,flour,garlic,Array
0,2554211,87507,5,butter onion chicken broth garlic bay leaves c...,0.5,0.25,0.75,4,8,6,...,0,0,0,1,0,2,0,1,1,"[0.5, 0.25, 0.75, 4.0, 8.0, 6.0, 6.0, 8.0, 4.0..."
1,2942251,84214,5,egg yolks Dijon mustard lemon juice hot pepper...,0.0,0.08,0.08,0,2,7,...,0,0,0,0,0,1,0,0,0,"[0.0, 0.08, 0.08, 0.0, 2.0, 7.0, 7.0, 3.0, 1.0..."
2,5670169,84214,5,egg yolks Dijon mustard lemon juice hot pepper...,0.0,0.08,0.08,0,2,7,...,0,0,0,0,0,1,0,0,0,"[0.0, 0.08, 0.08, 0.0, 2.0, 7.0, 7.0, 3.0, 1.0..."
3,825478,6817,5,warm water vegetable oil honey salt caraway se...,3.0,0.08,3.0,4,6,2,...,0,1,0,1,0,0,0,2,0,"[3.0, 0.08, 3.0, 4.0, 6.0, 2.0, 0.0, 0.0, 7.0,..."
4,1504758,47374,5,cream cheese salsa chopped cooked chicken brea...,0.5,0.33,0.83,2,4,4,...,0,0,1,0,0,0,2,1,0,"[0.5, 0.33, 0.83, 2.0, 4.0, 4.0, 7.0, 9.0, 2.0..."


In [43]:
train2['Array'] = tr
train2['Array'] = train2['Array'].apply(np.array)
train2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,user_id,recipe_id,rating,ingredients,cook,prep,ready,Sugar,Calories,Fat,...,Cake,Bread,Easy,salt,sugar,pepper,cheese,flour,garlic,Array
0,5215572,55090,5,cream cheese confectioners' sugar vanilla extr...,0.0,0.17,0.17,5,7,6,...,0,0,0,0,1,0,1,0,0,"[0.0, 0.17, 0.17, 5.0, 7.0, 6.0, 1.0, 0.0, 0.0..."
1,41321,55090,5,cream cheese confectioners' sugar vanilla extr...,0.0,0.17,0.17,5,7,6,...,0,0,0,0,1,0,1,0,0,"[0.0, 0.17, 0.17, 5.0, 7.0, 6.0, 1.0, 0.0, 0.0..."
2,5215572,26317,4,skinless boneless chicken breast halves sliced...,0.83,0.33,1.0,3,5,4,...,0,0,0,1,0,1,0,1,0,"[0.83, 0.33, 1.0, 3.0, 5.0, 4.0, 6.0, 5.0, 8.0..."
3,3622615,26317,5,skinless boneless chicken breast halves sliced...,0.83,0.33,1.0,3,5,4,...,0,0,0,1,0,1,0,1,0,"[0.83, 0.33, 1.0, 3.0, 5.0, 4.0, 6.0, 5.0, 8.0..."
4,3276442,26317,5,skinless boneless chicken breast halves sliced...,0.83,0.33,1.0,3,5,4,...,0,0,0,1,0,1,0,1,0,"[0.83, 0.33, 1.0, 3.0, 5.0, 4.0, 6.0, 5.0, 8.0..."


In [44]:
test3 = test2

In [45]:
test3

Unnamed: 0,user_id,recipe_id,rating,ingredients,cook,prep,ready,Sugar,Calories,Fat,...,Cake,Bread,Easy,salt,sugar,pepper,cheese,flour,garlic,Array
0,2554211,87507,5,butter onion chicken broth garlic bay leaves c...,0.50,0.25,0.75,4,8,6,...,0,0,0,1,0,2,0,1,1,"[0.5, 0.25, 0.75, 4.0, 8.0, 6.0, 6.0, 8.0, 4.0..."
1,2942251,84214,5,egg yolks Dijon mustard lemon juice hot pepper...,0.00,0.08,0.08,0,2,7,...,0,0,0,0,0,1,0,0,0,"[0.0, 0.08, 0.08, 0.0, 2.0, 7.0, 7.0, 3.0, 1.0..."
2,5670169,84214,5,egg yolks Dijon mustard lemon juice hot pepper...,0.00,0.08,0.08,0,2,7,...,0,0,0,0,0,1,0,0,0,"[0.0, 0.08, 0.08, 0.0, 2.0, 7.0, 7.0, 3.0, 1.0..."
3,825478,6817,5,warm water vegetable oil honey salt caraway se...,3.00,0.08,3.00,4,6,2,...,0,1,0,1,0,0,0,2,0,"[3.0, 0.08, 3.0, 4.0, 6.0, 2.0, 0.0, 0.0, 7.0,..."
4,1504758,47374,5,cream cheese salsa chopped cooked chicken brea...,0.50,0.33,0.83,2,4,4,...,0,0,1,0,0,0,2,1,0,"[0.5, 0.33, 0.83, 2.0, 4.0, 4.0, 7.0, 9.0, 2.0..."
5,6751689,47374,5,cream cheese salsa chopped cooked chicken brea...,0.50,0.33,0.83,2,4,4,...,0,0,1,0,0,0,2,1,0,"[0.5, 0.33, 0.83, 2.0, 4.0, 4.0, 7.0, 9.0, 2.0..."
6,2393124,47374,5,cream cheese salsa chopped cooked chicken brea...,0.50,0.33,0.83,2,4,4,...,0,0,1,0,0,0,2,1,0,"[0.5, 0.33, 0.83, 2.0, 4.0, 4.0, 7.0, 9.0, 2.0..."
7,3279571,56391,5,bratwurst links browned and cut into 1/2 inch ...,3.00,0.25,3.00,3,3,0,...,0,0,0,0,0,1,1,0,0,"[3.0, 0.25, 3.0, 3.0, 3.0, 0.0, 3.0, 2.0, 4.0,..."
8,1385526,56927,5,peeled and diced potatoes diced celery finely ...,0.42,0.33,0.75,3,4,0,...,0,0,0,1,0,0,0,1,0,"[0.42, 0.33, 0.75, 3.0, 4.0, 0.0, 4.0, 9.0, 6...."
9,4044589,56927,5,peeled and diced potatoes diced celery finely ...,0.42,0.33,0.75,3,4,0,...,0,0,0,1,0,0,0,1,0,"[0.42, 0.33, 0.75, 3.0, 4.0, 0.0, 4.0, 9.0, 6...."


In [46]:
from tqdm import tqdm

In [47]:
from scipy.spatial import distance

In [48]:
# for this part, we first find out a user with certain recipe id in test csv, and extract features of this user. Then, 
# we find out the same user in train data set and all of the recipe array from this user. After that, we calculate the
# distance between the only array in test dataset and each array in train data set with the same userid. Finally, we 
# extract the array with minimun distance and give the rating from this array to the test data.
Rating = []

for i in tqdm(range(len(test3))):
    user = test3['user_id'].iloc[i] #find out the test users
    array1 = test3['Array'].iloc[i] #find out the test users' array
    train_user = train2[train2['user_id']==user] #find out the users in train dataset with the same userid
    d = []
    for x in range(len(train_user)):
        array2 = train_user['Array'].iloc[x] # find out each array under certain user-id
        distance1 = distance.euclidean(array1,array2) #calculate the euclidean distance between them
        d.append(distance1)
    train_user['distance'] = d
    a = train_user[train_user['distance'] == min(train_user['distance'])]['rating'].iloc[0] # Extract the minimum one and rating
    Rating.append(a)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
100%|██████████| 10000/10000 [57:13<00:00,  2.67it/s]   


In [49]:
test3['p_rating'] = Rating

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [50]:
# Calculate the MAE
mean = sum(test3['rating']) / len(test3)
test3['difference']=test3['p_rating'].apply(lambda x: abs(x-mean))

sum(test3['difference'])/len(test3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


0.4929

In [51]:
# Export the dataframe into csv file
test4 = test3[['user_id','recipe_id','p_rating']]

test4["user_rep_id"] = test4['user_id'].apply(str) + test4['recipe_id'].apply(str)

ktest = test4[['user_rep_id','p_rating']]

ktest.rename(columns={'p_rating':"rating"},inplace = True)

ktest.to_csv('ktest3.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
