In [57]:
import pandas as pd
import numpy as np

In [58]:
#The BMJ recipe table is tab-separated.
BMJ_data_all__b_new = pd.read_csv('../input/BMJ-data-all--b-new.csv', sep='\t')

#The three item-profile tables share the same semicolon-separated layout.
item_profiles1, item_profiles2, item_profiles3 = (
    pd.read_csv(profile_path, sep=';')
    for profile_path in (
        '../input/item-profiles1.csv',
        '../input/item-profiles2.csv',
        '../input/item-profiles3.csv',
    )
)

#The ratings file is tab-separated; its column names are supplied explicitly via `names`.
ratings = pd.read_csv(
    '../input/user-item-rating.csv',
    sep='\t',
    names=['user_id', 'item_id', 'rating'],
)

In [87]:
#Number of ratings per user, smallest counts first
#(normalize=False and sort=True are the pandas defaults, so only the order is spelled out)
user_rating_count = ratings['user_id'].value_counts(ascending=True)
user_rating_count

18066     20
37924     20
51419     20
57153     20
73891     20
        ... 
48719    236
78714    243
25694    248
76151    293
10088    313
Name: user_id, Length: 1273, dtype: int64

In [88]:
#Number of ratings per item, smallest counts first
#(normalize=False and sort=True are the pandas defaults, so only the order is spelled out)
item_rating_count = ratings['item_id'].value_counts(ascending=True)
item_rating_count

346      20
1924     20
2957     20
4445     20
1171     20
       ... 
134     290
140     291
50      308
137     321
58      358
Name: item_id, Length: 1031, dtype: int64

In [86]:
#Display the ratings frame (columns user_id, item_id, rating)
ratings

Unnamed: 0,user_id,item_id,rating
0,455,50,3.0
1,455,457,4.0
2,455,28,5.0
3,455,458,3.0
4,455,459,5.0
...,...,...,...
50676,84839,131,3.0
50677,84839,109,5.0
50678,84839,145,5.0
50679,84839,133,5.0


In [61]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

#X is a copy of the full ratings table; y is its user_id column, used only for stratification.
X = ratings.copy()
y = ratings['user_id']

#Hold out 25% of the rows for testing, stratified on user_id, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [62]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Root mean squared error (RMSE) between true and predicted values
def rmse(y_true, y_pred):
    """Return the square root of scikit-learn's mean squared error of the two inputs."""
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)

In [63]:
#Evaluate a rating predictor on the held-out test split
def score(cf_model):
    """Return the RMSE of ``cf_model`` over the rows of the global ``X_test``.

    ``cf_model`` is called as ``cf_model(user_id, item_id)`` once per test row
    and its return value is taken as the predicted rating for that pair.
    """
    #One prediction per (user, item) pair, in test-row order
    predictions = np.array([
        cf_model(uid, iid)
        for uid, iid in zip(X_test['user_id'], X_test['item_id'])
    ])

    #Ratings actually given for the same rows
    actual = np.array(X_test['rating'])

    return rmse(actual, predictions)

In [64]:
#Baseline predictor: ignores both arguments and always answers 3.
def baseline(user_id, item_id):
    """Return the constant rating 3.0 for any user/item pair."""
    constant_rating = 3.0
    return constant_rating

In [65]:
#RMSE of the constant baseline on the test split
score(baseline)

1.6191040450231147

In [85]:
#Import the required classes and methods from the surprise library
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate

#The Reader tells Surprise how to parse the ratings.
#(1, 5) is Surprise's default rating scale, written out here so the assumption is visible.
#NOTE(review): assumes ratings lie in 1-5 -- confirm against the data.
reader = Reader(rating_scale=(1, 5))

#Wrap the ratings dataframe as the Surprise dataset the filter is built from
data = Dataset.load_from_df(ratings, reader)

In [67]:
#Import SVD
from surprise import SVD

#Seed NumPy's global RNG before fitting. With no explicit random_state, Surprise
#draws both the SVD factor initialisation and the cross-validation fold shuffling
#from it, so without a seed every run of this cell reports different RMSE values.
#42 matches the seed already used for train_test_split.
np.random.seed(42)

#Define the SVD algorithm object
svd = SVD()

#Evaluate the performance in terms of RMSE, using cross_validate's default folds
cross_validate(svd, data, measures=['RMSE'])

{'test_rmse': array([0.76902932, 0.77222682, 0.77527975, 0.77991108, 0.76695119]),
 'fit_time': (1.816852331161499,
  1.8984355926513672,
  1.8479726314544678,
  1.7949259281158447,
  1.7876570224761963),
 'test_time': (0.08915090560913086,
  0.0708315372467041,
  0.05485343933105469,
  0.08840513229370117,
  0.05736279487609863)}

In [68]:
def _threshold_points(percent, low, mid, high):
    """Score a percentage Series on a 0-3 scale using three ascending cut-offs.

    Returns a float array in the same row order as ``percent``: 3 where the
    value is ``<= low``, 2 where ``<= mid``, 1 where ``<= high`` and 0 where
    ``> high``. Entries that satisfy none of the comparisons (NaN) become NaN.
    """
    #np.select takes the first matching condition, so the strictest cut-off is listed first.
    return np.select(
        [percent <= low, percent <= mid, percent <= high, percent > high],
        [3.0, 2.0, 1.0, 0.0],
        default=np.nan,
    )


def healthiness(itemsDataframe):
    """Add health-score columns to ``itemsDataframe`` in place; returns None.

    Reads the columns 'Fat (g)', 'Sugar (g)', 'Saturated Fat (g)' and
    'Calories (kCal)' and writes, in this order:

    * 'Unhealtiness' -- sum of the three percentages computed below
      (the column name's spelling is kept for compatibility);
    * 'fatPoints', 'satFatPoints', 'sugarPoints' -- 0-3 points each;
    * 'Healthiness' -- sum of the three point columns (0-9).

    Each percentage is grams * factor / calories * 100, with factor 9 for fat
    and saturated fat and 4 for sugar. Rows where a percentage is NaN
    (e.g. 0 / 0) get NaN points. Each point column is rewritten in full on
    every call, so re-running on an already-scored frame never leaves stale
    values behind.
    """
    calories = itemsDataframe['Calories (kCal)']
    fat = ((itemsDataframe['Fat (g)'] * 9) / calories) * 100
    sugar = ((itemsDataframe['Sugar (g)'] * 4) / calories) * 100
    saturatedFat = ((itemsDataframe['Saturated Fat (g)'] * 9) / calories) * 100

    itemsDataframe['Unhealtiness'] = fat + sugar + saturatedFat

    #Cut-offs per criterion: (3 points, 2 points, 1 point); above the last one scores 0.
    itemsDataframe['fatPoints'] = _threshold_points(fat, 3, 15, 30)
    itemsDataframe['satFatPoints'] = _threshold_points(saturatedFat, 1, 5, 10)
    itemsDataframe['sugarPoints'] = _threshold_points(sugar, 1, 5, 10)

    itemsDataframe['Healthiness'] = (
        itemsDataframe['fatPoints']
        + itemsDataframe['satFatPoints']
        + itemsDataframe['sugarPoints']
    )

In [69]:
#Add the health-score columns to both recipe tables (healthiness writes into the frame it is given)
for recipe_frame in (BMJ_data_all__b_new, item_profiles2):
    healthiness(recipe_frame)

In [153]:
#Index every profile table by 'Recipe ID', then join tables 2 and 3 onto table 1
#(DataFrame.join defaults to a left join on the index).
profiles_by_recipe = [
    profile.set_index('Recipe ID')
    for profile in (item_profiles1, item_profiles2, item_profiles3)
]
item_profiles = (
    profiles_by_recipe[0]
    .join(profiles_by_recipe[1])
    .join(profiles_by_recipe[2])
)

In [154]:
#Display the joined item profiles, indexed by 'Recipe ID'
item_profiles

Unnamed: 0_level_0,Category,Directions,Name,Fiber (g),Sodium (g),Carbohydrates (g),Fat (g),Protein (g),Sugar (g),Saturated Fat (g),...,Year of Publishing,Unhealtiness,fatPoints,satFatPoints,sugarPoints,Healthiness,Ingredient ID,Amount (g),Quantity,Ingredient
Recipe ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,Home > Recipes > Main Dish,"In a medium bowl, mix teriyaki sauce, red wine...",Fiery Pork Skewers,0.1,0.39,3.0,6.5,18.1,2.4,1.5,...,2001,55.510204,0.0,1.0,1.0,2.0,5547,36.000000,2 tablespoons,teriyaki sauce
4,Home > Recipes > Main Dish,"In a medium bowl, mix teriyaki sauce, red wine...",Fiery Pork Skewers,0.1,0.39,3.0,6.5,18.1,2.4,1.5,...,2001,55.510204,0.0,1.0,1.0,2.0,18888,16.071430,1 tablespoon,red wine vinegar
4,Home > Recipes > Main Dish,"In a medium bowl, mix teriyaki sauce, red wine...",Fiery Pork Skewers,0.1,0.39,3.0,6.5,18.1,2.4,1.5,...,2001,55.510204,0.0,1.0,1.0,2.0,6305,13.625000,1 tablespoon,vegetable oil
4,Home > Recipes > Main Dish,"In a medium bowl, mix teriyaki sauce, red wine...",Fiery Pork Skewers,0.1,0.39,3.0,6.5,18.1,2.4,1.5,...,2001,55.510204,0.0,1.0,1.0,2.0,1525,4.621849,1 teaspoon,brown sugar
4,Home > Recipes > Main Dish,"In a medium bowl, mix teriyaki sauce, red wine...",Fiery Pork Skewers,0.1,0.39,3.0,6.5,18.1,2.4,1.5,...,2001,55.510204,0.0,1.0,1.0,2.0,20244,1.500000,1/2 teaspoon,red pepper flakes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15746,Home > Recipes > Main Dish > Stuffed Mai...,Preheat an oven to 350 degrees F (175 degrees ...,Roast Beef Stuffed Shells,2.0,1.90,42.3,28.2,33.6,1.6,13.7,...,2009,68.974820,0.0,0.0,2.0,2.0,20717,336.000000,12 ounces,jumbo pasta shells
15746,Home > Recipes > Main Dish > Stuffed Mai...,Preheat an oven to 350 degrees F (175 degrees ...,Roast Beef Stuffed Shells,2.0,1.90,42.3,28.2,33.6,1.6,13.7,...,2009,68.974820,0.0,0.0,2.0,2.0,9358,681.000000,1 1/2 pounds,thinly sliced deli roast beef
15746,Home > Recipes > Main Dish > Stuffed Mai...,Preheat an oven to 350 degrees F (175 degrees ...,Roast Beef Stuffed Shells,2.0,1.90,42.3,28.2,33.6,1.6,13.7,...,2009,68.974820,0.0,0.0,2.0,2.0,7407,1160.000000,4 (10.25 ounce) cans,beef gravy
15746,Home > Recipes > Main Dish > Stuffed Mai...,Preheat an oven to 350 degrees F (175 degrees ...,Roast Beef Stuffed Shells,2.0,1.90,42.3,28.2,33.6,1.6,13.7,...,2009,68.974820,0.0,0.0,2.0,2.0,7427,283.500000,1 (10 ounce) bottle,cream-style horseradish sauce


In [121]:
#Stack the BMJ recipes on top of item_profiles2; each frame keeps its own row labels
merged = pd.concat((BMJ_data_all__b_new, item_profiles2))
merged.head(n=2)

Unnamed: 0,URL,Name,Fiber (g),Sodium (g),Carbohydrates (g),Fat (g),Protein (g),Sugar (g),Saturated Fat (g),Size (g),...,Average Sentiment,Number of Ratings,Number of Bookmarks,Year of Publishing,Unhealtiness,fatPoints,satFatPoints,sugarPoints,Healthiness,Recipe ID
0,http://allrecipes.com/recipe/frosted-apricot-c...,Frosted Apricot Cookies,0.7,0.2,24.6,11.2,2.2,11.1,7.3,610.2,...,1.5,2,14,2004,103.891626,0.0,0.0,0.0,0.0,
1,http://allrecipes.com/recipe/italian-nachos-re...,Italian Nachos Restaurant-Style,2.2,1.13,22.5,31.7,19.3,1.9,11.1,1305.7,...,2.0,11,195,2008,86.710817,0.0,0.0,2.0,2.0,


In [130]:
#Display item_profiles2
item_profiles2

Unnamed: 0,Recipe ID,Name,Fiber (g),Sodium (g),Carbohydrates (g),Fat (g),Protein (g),Sugar (g),Saturated Fat (g),Size (g),...,Average Rating,Average Sentiment,Number of Ratings,Number of Bookmarks,Year of Publishing,Unhealtiness,fatPoints,satFatPoints,sugarPoints,Healthiness
0,2622,Slow Cooker Tender and Yummy Round Steak,4.5,0.83,33.1,13.6,33.8,5.0,4.6,2599.35,...,4.32,1.79,81,2271,2000,46.768448,0.0,0.0,1.0,1.0
1,722,Chicken Pot Pie II,6.2,1.06,47.8,29.5,51.4,6.4,11.4,2137.86,...,4.66,2.02,116,1200,2000,59.114114,0.0,0.0,2.0,2.0
2,1137,Chicken in a Pot,1.0,0.40,6.9,6.6,28.7,1.8,1.4,819.37,...,4.29,1.98,83,1779,2001,38.446602,1.0,1.0,2.0,4.0
3,2502,Erin's Indonesian Chicken,6.4,0.32,58.1,18.6,35.4,7.8,3.8,1972.13,...,4.39,1.99,80,872,2005,43.924528,0.0,1.0,1.0,2.0
4,2714,Bubble Pizza,3.0,1.96,45.4,36.4,28.5,8.7,13.4,2375.00,...,4.35,1.96,117,2204,2001,77.403846,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1026,1353,Beef and Biscuit,0.9,0.63,15.5,18.9,18.4,3.7,8.6,1536.45,...,3.99,1.52,79,811,2001,85.718954,0.0,0.0,2.0,2.0
1027,987,Creamy Pesto Shrimp,2.7,0.44,43.0,42.5,23.1,0.2,24.3,1655.88,...,4.58,1.98,225,3134,2000,93.188854,0.0,0.0,3.0,3.0
1028,2903,Boiled Chicken,1.2,0.06,4.5,11.1,16.3,1.9,3.0,1780.00,...,4.70,1.20,74,1083,2001,72.311828,0.0,0.0,2.0,2.0
1029,2136,Mushroom Sauce Baked Pork Chops,2.5,1.11,28.1,14.3,19.0,6.5,6.1,2241.75,...,4.45,1.86,121,1188,2001,66.329114,0.0,0.0,1.0,1.0


In [132]:
#How many rows item_profiles3 holds for each 'Recipe ID', smallest counts first
duplicate_count = item_profiles3['Recipe ID'].value_counts(ascending=True)
duplicate_count

53       2
8265     2
3800     2
864      2
1224     2
        ..
481     26
3399    28
3923    29
93      29
3242    30
Name: Recipe ID, Length: 1031, dtype: int64

In [70]:
#Peek at the first two BMJ recipes
BMJ_data_all__b_new.head(n=2)

Unnamed: 0,URL,Name,Fiber (g),Sodium (g),Carbohydrates (g),Fat (g),Protein (g),Sugar (g),Saturated Fat (g),Size (g),...,Average Rating,Average Sentiment,Number of Ratings,Number of Bookmarks,Year of Publishing,Unhealtiness,fatPoints,satFatPoints,sugarPoints,Healthiness
0,http://allrecipes.com/recipe/frosted-apricot-c...,Frosted Apricot Cookies,0.7,0.2,24.6,11.2,2.2,11.1,7.3,610.2,...,3.0,1.5,2,14,2004,103.891626,0.0,0.0,0.0,0.0
1,http://allrecipes.com/recipe/italian-nachos-re...,Italian Nachos Restaurant-Style,2.2,1.13,22.5,31.7,19.3,1.9,11.1,1305.7,...,4.73,2.0,11,195,2008,86.710817,0.0,0.0,2.0,2.0


In [102]:
#Peek at the first two rows of item_profiles2
item_profiles2.head(n=2)

Unnamed: 0,Recipe ID,Name,Fiber (g),Sodium (g),Carbohydrates (g),Fat (g),Protein (g),Sugar (g),Saturated Fat (g),Size (g),...,Average Rating,Average Sentiment,Number of Ratings,Number of Bookmarks,Year of Publishing,Unhealtiness,fatPoints,satFatPoints,sugarPoints,Healthiness
0,2622,Slow Cooker Tender and Yummy Round Steak,4.5,0.83,33.1,13.6,33.8,5.0,4.6,2599.35,...,4.32,1.79,81,2271,2000,46.768448,0.0,0.0,1.0,1.0
1,722,Chicken Pot Pie II,6.2,1.06,47.8,29.5,51.4,6.4,11.4,2137.86,...,4.66,2.02,116,1200,2000,59.114114,0.0,0.0,2.0,2.0


In [80]:
#Number of BMJ recipes at each Healthiness total, rarest totals first
Healthiness_rating_count = BMJ_data_all__b_new['Healthiness'].value_counts(ascending=True)
Healthiness_rating_count

9.0       78
8.0      112
7.0      232
5.0     1979
6.0     2698
4.0     3611
3.0     7608
1.0    11487
2.0    14554
0.0    15896
Name: Healthiness, dtype: int64

In [82]:
#Drop the BMJ recipes whose Healthiness total is exactly 0, then recount the totals
BMJ_data_all__b_healthy = BMJ_data_all__b_new.loc[
    lambda recipes: recipes['Healthiness'] != 0
]
Healthiness_rating_count_2 = BMJ_data_all__b_healthy['Healthiness'].value_counts(ascending=True)
Healthiness_rating_count_2

9.0       78
8.0      112
7.0      232
5.0     1979
6.0     2698
4.0     3611
3.0     7608
1.0    11487
2.0    14554
Name: Healthiness, dtype: int64

In [140]:
#Drop the item_profiles2 recipes whose Healthiness total is exactly 0, then recount the totals
item_profiles2_healthy = item_profiles2.loc[
    lambda recipes: recipes['Healthiness'] != 0
]
Healthiness_rating_count_5 = item_profiles2_healthy['Healthiness'].value_counts(ascending=True)
Healthiness_rating_count_5

7.0      2
6.0      9
5.0     25
4.0     72
1.0    183
3.0    195
2.0    424
Name: Healthiness, dtype: int64

In [84]:
#Keep only BMJ recipes with a non-zero score on all three criteria (fat, saturated fat, sugar)
BMJ_data_all__b_very_healthy = BMJ_data_all__b_new[
    (BMJ_data_all__b_new['fatPoints'] != 0)
    & (BMJ_data_all__b_new['satFatPoints'] != 0)
    & (BMJ_data_all__b_new['sugarPoints'] != 0)
]
Healthiness_rating_count_3 = BMJ_data_all__b_very_healthy['Healthiness'].value_counts(ascending=True)
Healthiness_rating_count_3

9.0      78
8.0     112
7.0     232
6.0     546
3.0     709
5.0    1146
4.0    1413
Name: Healthiness, dtype: int64

In [139]:
#Keep only item_profiles2 recipes with a non-zero score on all three criteria (fat, saturated fat, sugar)
item_profiles2_very_healthy = item_profiles2[
    (item_profiles2['fatPoints'] != 0)
    & (item_profiles2['satFatPoints'] != 0)
    & (item_profiles2['sugarPoints'] != 0)
]
Healthiness_rating_count_4 = item_profiles2_very_healthy['Healthiness'].value_counts(ascending=True)
Healthiness_rating_count_4

7.0     2
3.0     6
6.0     9
5.0    25
4.0    40
Name: Healthiness, dtype: int64

In [100]:
#Assign X as the original ratings dataframe and y as the user_id column of ratings.
X = ratings.copy()
y = ratings['user_id']

#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=y, random_state=42)

#Seed NumPy's global RNG before cross-validating. With no explicit random_state,
#Surprise draws the SVD initialisation and the fold shuffling from it, so without
#a seed this cell reports different RMSE values on every run.
#42 matches the seed used for train_test_split above.
np.random.seed(42)

#Rebuild the Surprise dataset from ratings and re-run the cross-validation
data = Dataset.load_from_df(ratings, reader)
cross_validate(svd, data, measures=['RMSE'])

{'test_rmse': array([0.77256833, 0.78283884, 0.76130622, 0.7679657 , 0.77750719]),
 'fit_time': (1.8567185401916504,
  1.7990124225616455,
  1.797804355621338,
  1.7939670085906982,
  1.8477959632873535),
 'test_time': (0.07390069961547852,
  0.055928945541381836,
  0.05914044380187988,
  0.056310176849365234,
  0.05687689781188965)}