### Imports

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore')
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import VotingRegressor, BaggingRegressor, RandomForestRegressor, StackingRegressor
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import  SVC
from sklearn.metrics import accuracy_score, mean_squared_error, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn import preprocessing, utils
from random import shuffle
import joblib
from joblib import dump, load
import requests      # Для запросов по API
import json
from time import time
from math import sqrt
import tabula
from bs4 import BeautifulSoup
from urllib import request

### Data Load

In [58]:
df = pd.read_csv('epi_r.csv')

In [59]:
df.head(10)

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,The Best Blts,4.375,948.0,19.0,79.0,1042.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Ham and Spring Vegetable Salad with Shallot Vi...,4.375,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Spicy-Sweet Kumquats,3.75,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Korean Marinated Beef,4.375,170.0,7.0,10.0,1272.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Ham Persillade with Mustard Potato Salad and M...,3.75,602.0,23.0,41.0,1696.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
df.shape

(20052, 680)

### Pre-Processing

In [61]:
df.duplicated('title', keep='first').sum()  # количество дубликатов

2316

In [62]:
df.drop_duplicates('title', keep='first', inplace=True)  # удаляем дубликаты

In [63]:
df.dropna(axis=0, inplace=True)  # удаляем пустые значения

Нам нужно прогнозировать оценку или категорию оценки, используя только ингридиенты и ничего больше. Если модель обучится на других данных, которые не будут передаваться на вход, то точность прогнозов будет заведомо ниже.

Поэтому удалим столбцы, которые не относятся к названиями ингридиентов

In [64]:
rec_df = pd.read_json('full_format_recipes.json', orient='columns')
rec_df.head()

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium
0,"[1. Place the stock, lentils, celery, carrot, ...",7.0,2006-09-01 04:00:00+00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",426.0,,30.0,2.5,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",559.0
1,[Combine first 9 ingredients in heavy medium s...,23.0,2004-08-20 04:00:00+00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",403.0,This uses the same ingredients found in boudin...,18.0,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",1439.0
2,[In a large heavy saucepan cook diced fennel a...,7.0,2004-08-20 04:00:00+00:00,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",165.0,,6.0,3.75,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",165.0
3,[Heat oil in heavy large skillet over medium-h...,,2009-03-27 04:00:00+00:00,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",,The Sicilian-style tomato sauce has tons of Me...,,5.0,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",
4,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,2004-08-20 04:00:00+00:00,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",547.0,,20.0,3.125,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",452.0


In [65]:
list(df.columns)

['title',
 'rating',
 'calories',
 'protein',
 'fat',
 'sodium',
 '#cakeweek',
 '#wasteless',
 '22-minute meals',
 '3-ingredient recipes',
 '30 days of groceries',
 'advance prep required',
 'alabama',
 'alaska',
 'alcoholic',
 'almond',
 'amaretto',
 'anchovy',
 'anise',
 'anniversary',
 'anthony bourdain',
 'aperitif',
 'appetizer',
 'apple',
 'apple juice',
 'apricot',
 'arizona',
 'artichoke',
 'arugula',
 'asian pear',
 'asparagus',
 'aspen',
 'atlanta',
 'australia',
 'avocado',
 'back to school',
 'backyard bbq',
 'bacon',
 'bake',
 'banana',
 'barley',
 'basil',
 'bass',
 'bastille day',
 'bean',
 'beef',
 'beef rib',
 'beef shank',
 'beef tenderloin',
 'beer',
 'beet',
 'bell pepper',
 'berry',
 'beverly hills',
 'birthday',
 'biscuit',
 'bitters',
 'blackberry',
 'blender',
 'blue cheese',
 'blueberry',
 'boil',
 'bok choy',
 'bon appétit',
 'bon app��tit',
 'boston',
 'bourbon',
 'braise',
 'bran',
 'brandy',
 'bread',
 'breadcrumbs',
 'breakfast',
 'brie',
 'brine',
 'brisk

In [66]:
ingredients_initial = []

for i, row in rec_df.iterrows():
    temp_cats = row.categories
    temp_ingr = row.ingredients
    if type(temp_cats) == list and type(temp_ingr) == list:
        for cat in temp_cats:
            cat = cat.lower()
            cat_title = cat.title()
            for ingr in temp_ingr:
                ingr = ingr.lower()
                if ingr.find(cat) != -1 and cat_title not in ingredients_initial:
                    ingredients_initial.append(cat_title)

ingredients_initial = sorted(ingredients_initial)

print('Начальное число ингредиентов:', len(ingredients_initial))

Начальное число ингредиентов: 398


Уберем все, что не относится к ингредиентам

In [67]:
not_ingredients = ['Bake', 'Blender', 'Boil', 'Bon Appétit', 'Braise', 'Breakfast', 'Broil', 
                   'California', 'Cocktail', 'Deep-Fry', 'Fruit', 'Dessert', 'Dinner', 'Drink', 'Easter', 
                   'Fall', 'Food Processor', 'Fry', 'Game', 'Gourmet', 'Grill', 'Healthy', 
                   'Ice Cream Machine', 'Juicer', 'Kosher', 'Mandoline', 'Marinate', 'Microwave', 
                   'Mixer', 'Mortar And Pestle', 'New York', 'Passover', 'Pasta Maker', 'Pastry', 
                   'Pie', 'Pizza', 'Poach', 'Pressure Cooker', 'Ramekin', 'Raw', 'Salad', 
                   'Sandwich', 'Sauté', 'Seafood', 'Side', 'Simmer', 'Skewer', 'Slow Cooker', 
                   'Smoker', 'Snack', 'Spring', 'Steam', 'Stew', 'Stir-Fry', 'Summer', 'Tart', 
                   'Thanksgiving', 'Vegan', 'Vegetarian', 'Winter']

ingredients = [ingr for ingr in ingredients_initial if ingr not in not_ingredients]

print('Число ингредиентов:', len(ingredients))

Число ингредиентов: 338


In [68]:
ingredients

['Almond',
 'Amaretto',
 'Anchovy',
 'Anise',
 'Apple',
 'Apple Juice',
 'Apricot',
 'Artichoke',
 'Arugula',
 'Asian Pear',
 'Asparagus',
 'Avocado',
 'Bacon',
 'Banana',
 'Barley',
 'Basil',
 'Bass',
 'Bean',
 'Beef',
 'Beef Rib',
 'Beef Shank',
 'Beef Tenderloin',
 'Beer',
 'Beet',
 'Bell Pepper',
 'Berry',
 'Bitters',
 'Blackberry',
 'Blue Cheese',
 'Blueberry',
 'Bok Choy',
 'Bourbon',
 'Bran',
 'Brandy',
 'Bread',
 'Breadcrumbs',
 'Brie',
 'Brine',
 'Brisket',
 'Broccoli',
 'Broccoli Rabe',
 'Brown Rice',
 'Brussel Sprout',
 'Buffalo',
 'Bulgur',
 'Butter',
 'Buttermilk',
 'Butternut Squash',
 'Cabbage',
 'Cake',
 'Calvados',
 'Campari',
 'Candy',
 'Candy Thermometer',
 'Cantaloupe',
 'Capers',
 'Caraway',
 'Cardamom',
 'Carrot',
 'Cashew',
 'Cauliflower',
 'Celery',
 'Chambord',
 'Champagne',
 'Chard',
 'Chartreuse',
 'Cheddar',
 'Cheese',
 'Cherry',
 'Chestnut',
 'Chicken',
 'Chickpea',
 'Chile',
 'Chile Pepper',
 'Chili',
 'Chill',
 'Chive',
 'Chocolate',
 'Cilantro',
 'Cinnam

Создадим датафрейм, оставив только необходимые данные

In [69]:
cols1 = ["title","rating"]

for ing in ingredients:
    ing = ing.lower()
    if ing in list(df.columns):
        cols1.append(ing)

        
df1 = df[cols1]
df1.head()


Unnamed: 0,title,rating,almond,amaretto,anchovy,anise,apple,apple juice,apricot,artichoke,...,whiskey,white wine,whole wheat,wild rice,wine,wok,yellow squash,yogurt,yuca,zucchini
0,"Lentil, Apple, and Turkey Wrap",2.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,The Best Blts,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Делим датафрейм на обучающую и тестовую выборки

In [70]:
X = df1.drop(columns='rating', axis=1)
y = df1['rating']
X = X.set_index('title') 
X

Unnamed: 0_level_0,almond,amaretto,anchovy,anise,apple,apple juice,apricot,artichoke,arugula,asian pear,...,whiskey,white wine,whole wheat,wild rice,wine,wok,yellow squash,yogurt,yuca,zucchini
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Lentil, Apple, and Turkey Wrap",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Boudin Blanc Terrine with Red Onion Confit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Potato and Fennel Soup Hodge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spinach Noodle Casserole,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Best Blts,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Chinese Barbecued Spareribs,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Artichoke and Parmesan Risotto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Turkey Cream Puff Pie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Snapper on Angel Hair with Citrus Cream,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
lab = preprocessing.LabelEncoder()
y = lab.fit_transform(y)

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                       test_size=0.2, random_state=21, stratify=y)

In [17]:
X_train.shape, y_train.shape

((11212, 338), (11212,))

### Regression

Для прогнозирования рейтинга опробуем различные алгоритмы, ансамбли и их гиперпараметры. Затем выберем лучшее решение на основе gridsearch и кроссвалидации и оценим RMSE на тестовой подвыборке.

#### Linear regression

In [18]:
lin_params = {'fit_intercept':[True,False], 
              'normalize':[True,False]}

In [19]:
lin_grid = GridSearchCV(LinearRegression(), lin_params, cv=5, n_jobs=-1)
lin_grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LinearRegression(), n_jobs=-1,
             param_grid={'fit_intercept': [True, False],
                         'normalize': [True, False]})

In [20]:
lin_grid.best_score_ 

-6.924729523635896e+21

In [21]:
lin_grid.best_params_ 

{'fit_intercept': False, 'normalize': True}

In [28]:
lin = LinearRegression(fit_intercept= False,
                       normalize= True)
lin.fit(X_train, y_train)
lin_y_pred = lin.predict(X_test)

In [29]:
"RMSE:", np.sqrt(mean_squared_error(y_test, lin_y_pred))

('RMSE:', 135119.70299386815)

#### Decision tree

In [25]:
tr_params = {'max_depth': range(1, 27, 2),
             'max_features': range(5, 31, 5),
             'random_state':[21]
}

In [26]:
tr_grid = GridSearchCV(DecisionTreeRegressor(), tr_params, cv=5, n_jobs=-1) 

In [27]:
tr_grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'max_depth': range(1, 27, 2),
                         'max_features': range(5, 31, 5),
                         'random_state': [21]})

In [30]:
tr_grid.best_params_

{'max_depth': 9, 'max_features': 30, 'random_state': 21}

In [31]:
tr = DecisionTreeRegressor(max_depth=9,
                           max_features = 30,
                           random_state=21)

In [32]:
tr.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=9, max_features=30, random_state=21)

In [33]:
tr_y_pred = tr.predict(X_test)

In [35]:
"RMSE:", np.sqrt(mean_squared_error(y_test, tr_y_pred))

('RMSE:', 1.774382208356212)

#### RandomForest

In [36]:
rf_params = { 
    'n_estimators': (50, 140, 20),
    'max_features': ['sqrt','log2'],
    'max_depth' : (4, 12, 2),
    'random_state' : [21]
}

In [37]:
rf_grid = GridSearchCV(RandomForestRegressor(), rf_params,cv=5, n_jobs=-1)
rf_grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': (4, 12, 2),
                         'max_features': ['sqrt', 'log2'],
                         'n_estimators': (50, 140, 20), 'random_state': [21]})

In [38]:
rf_grid.best_params_

{'max_depth': 12,
 'max_features': 'sqrt',
 'n_estimators': 140,
 'random_state': 21}

In [39]:
rf = RandomForestRegressor(max_depth= 12,
                           max_features= 'sqrt', 
                           n_estimators= 140,
                           random_state = 21)

In [40]:
rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=12, max_features='sqrt', n_estimators=140,
                      random_state=21)

In [41]:
rf_y_pred = rf.predict(X_test)

In [42]:
"RMSE:", np.sqrt(mean_squared_error(y_test, rf_y_pred))

('RMSE:', 1.7295680234044488)

#### Ensembles

##### VotingRegressor

In [43]:
vot = VotingRegressor(estimators=[('lin', lin),
                            ('tr', tr),
                            ('rf', rf)], 
                       weights = [1, 2, 3])
vot.fit(X_train, y_train)
vot_pred = vot.predict(X_test)

In [44]:
"RMSE:", np.sqrt(mean_squared_error(y_test, vot_pred))

('RMSE:', 22519.93729012002)

##### BaggingRegressor

In [46]:
bgclf = BaggingRegressor(base_estimator=RandomForestRegressor(),
                                  random_state = 21)
bgclf.fit(X_train, y_train)
bgclf_pred = bgclf.predict(X_test)

In [47]:
"RMSE:", np.sqrt(mean_squared_error(y_test, bgclf_pred))

('RMSE:', 1.81451995757827)

##### StackingRegressor

In [48]:
streg_params = {'passthrough':[True, False]
}

In [49]:
streg = StackingRegressor(estimators=[('lin', LinearRegression()),
                                    ('tr', DecisionTreeRegressor()),
                                     ('rf', RandomForestRegressor())])

In [50]:
streg_grid = GridSearchCV(streg, streg_params, cv=5, n_jobs=-1)
streg_grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=StackingRegressor(estimators=[('lin',
                                                      LinearRegression()),
                                                     ('tr',
                                                      DecisionTreeRegressor()),
                                                     ('rf',
                                                      RandomForestRegressor())]),
             n_jobs=-1, param_grid={'passthrough': [True, False]})

In [51]:
streg_grid.best_params_

{'passthrough': False}

In [52]:
streg = StackingRegressor(estimators=[('lin', LinearRegression()),
                                    ('tr', DecisionTreeRegressor()),
                                     ('rf', RandomForestRegressor())],
                            passthrough= False)

In [53]:
streg.fit(X_train, y_train)
streg_pred = streg.predict(X_test)

In [54]:
"RMSE:", np.sqrt(mean_squared_error(y_test, streg_pred))

('RMSE:', 125831.77654513922)

#### Лучшая модель регрессии

RandomForestRegressor

In [55]:
"RMSE:", np.sqrt(mean_squared_error(y_test, rf.predict(X_test)))

('RMSE:', 1.7295680234044488)

In [56]:
naive = pd.Series(df.rating.sum()/len(df.rating) for i in range(len(rf_y_pred)))

In [57]:
"RMSE for naive:", np.sqrt(mean_squared_error(y_test, naive))

('RMSE for naive:', 2.2348763745910847)

BaggingRegressor среди ансамблей

In [58]:
"RMSE:", np.sqrt(mean_squared_error(y_test, bgclf.predict(X_test)))

('RMSE:', 1.81451995757827)

In [59]:
"RMSE for naive:", np.sqrt(mean_squared_error(y_test, naive))

('RMSE for naive:', 2.2348763745910847)

### Classification

Бинаризируем значения целевой переменной путем округления рейтингов до ближайшего целого числа. Это и будут наши классы.

#### Binarization 1

In [73]:
def set_rating_group(rating):
    if rating is not None:
        return round(rating)
    else:
        return 'no rating'
    
df1['rating_group'] = df1['rating'].apply(set_rating_group)
df1

Unnamed: 0,title,rating,almond,amaretto,anchovy,anise,apple,apple juice,apricot,artichoke,...,white wine,whole wheat,wild rice,wine,wok,yellow squash,yogurt,yuca,zucchini,rating_group
0,"Lentil, Apple, and Turkey Wrap",2.500,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,Boudin Blanc Terrine with Red Onion Confit,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,Potato and Fennel Soup Hodge,3.750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
4,Spinach Noodle Casserole,3.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
5,The Best Blts,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20046,Chinese Barbecued Spareribs,3.750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
20048,Artichoke and Parmesan Risotto,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
20049,Turkey Cream Puff Pie,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
20050,Snapper on Angel Hair with Citrus Cream,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


In [23]:
X = df1.drop(columns=['rating', 'rating_group'], axis=1)
y = df1['rating_group']
X = X.set_index('title') 
X

Unnamed: 0_level_0,almond,amaretto,anchovy,anise,apple,apple juice,apricot,artichoke,arugula,asian pear,...,whiskey,white wine,whole wheat,wild rice,wine,wok,yellow squash,yogurt,yuca,zucchini
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Lentil, Apple, and Turkey Wrap",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Boudin Blanc Terrine with Red Onion Confit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Potato and Fennel Soup Hodge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spinach Noodle Casserole,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Best Blts,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Chinese Barbecued Spareribs,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Artichoke and Parmesan Risotto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Turkey Cream Puff Pie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Snapper on Angel Hair with Citrus Cream,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                       test_size=0.2, random_state=21, stratify=y)

##### Logreg

In [64]:
log_params = {'penalty':['none', 'l1', 'l2'],
             'solver': ['liblinear', 'saga']}

In [65]:
log_grid = GridSearchCV(LogisticRegression(), log_params, n_jobs=-1)

In [66]:
log_grid.fit(X_train, y_train)

GridSearchCV(estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'penalty': ['none', 'l1', 'l2'],
                         'solver': ['liblinear', 'saga']})

In [67]:
log_grid.best_params_

{'penalty': 'l1', 'solver': 'liblinear'}

In [68]:
lr = LogisticRegression(solver='liblinear',
                        fit_intercept=False,
                        random_state=21, 
                        penalty='l1')
lr.fit(X_train, y_train)
lr_y_pred = lr.predict(X_test)

In [69]:
accuracy_score(y_test, lr_y_pred)

0.6881912236889047

##### SVC

In [70]:
svc_params = {'kernel': ['linear', 'rbf', 'sigmoid'],
            'gamma': ['scale', 'auto'],
            'class_weight':['balanced', None]
}

In [74]:
# svc_grid = GridSearchCV(sklearn.svm.SVC(), 
#                         svc_params, 
#                         n_jobs=-1, 
#                         scoring='accuracy')
# svc = svc_grid.fit(X_train, y_train)

In [None]:
svc_grid.best_params_

In [72]:
svc = sklearn.svm.SVC(C= 10, 
                      class_weight=None, 
                      gamma='auto', 
                      kernel='rbf',
                      probability=True, 
                      random_state=21)

In [73]:
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_test)

In [75]:
accuracy_score(y_test, svc_pred)

0.691402069211559

##### DecisionTree

In [76]:
tr_params = {'max_depth': range(1, 31),
           'criterion': ['entropy', 'gini'],
            'class_weight':['balanced', None]
}

In [77]:
tr_grid = GridSearchCV(DecisionTreeClassifier(random_state=21),
                       tr_params, 
                       n_jobs=-1, 
                       scoring='accuracy') 

In [78]:
tr = tr_grid.fit(X_train, y_train)

In [79]:
tr_grid.best_params_

{'class_weight': None, 'criterion': 'entropy', 'max_depth': 3}

In [80]:
tr.fit(X_train, y_train)
tr_pred = tr.predict(X_test)

In [81]:
accuracy_score(y_test, tr_pred)

0.691402069211559

##### RandomForest

In [21]:
rf_params = {'max_depth': range(1, 31),
             'n_estimators': [80, 100, 120],
             'criterion': ['entropy', 'gini'],
             'class_weight':['balanced', None]
}

In [22]:
rf_grid = GridSearchCV(RandomForestClassifier(random_state=21), 
                       rf_params, 
                       n_jobs=-1, 
                       scoring='accuracy')

In [23]:
rf = rf_grid.fit(X_train, y_train)

In [24]:
rf_grid.best_params_

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': 30,
 'n_estimators': 120}

In [25]:
rf = RandomForestClassifier(class_weight=None, criterion= 'gini',
                            max_depth=30,
                            n_estimators=120,
                            random_state=21, n_jobs=-1)

In [26]:
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

In [27]:
accuracy_score(y_test, rf_pred)

0.6924723510524438

##### Лучшая модель классификации

RandomForestClassifier

Рассчитаем значение accuracy наивного классификатора, в котором вы для всех наблюдений прогнозируете наиболее распространенный класс. Сравним метрики.

In [30]:
naive = pd.Series(df1['rating_group'].sum()/len(df1['rating_group']) for i in range(len(rf_pred)))

In [32]:
accuracy_score(y_test, round(naive))

0.6835533357117374

#### Binarization 2

Снова проведем бинаризацию целевого столбца, преобразовав теперь целые числа в классы bad (0, 1) (невкусное), so-so (2, 3) (нормальное), great (4, 5) (вкусное).

In [74]:
def set_rating_group(rating):
    if 0 <= rating <= 1:
        return 'bad'
    elif 2 <= rating <= 3:
        return 'so-so'
    elif 4 <= rating <= 5:
        return 'great'
    else:
        return 'no rating'
    
df1['rating_group'] = df1['rating_group'].apply(set_rating_group)
df1

Unnamed: 0,title,rating,almond,amaretto,anchovy,anise,apple,apple juice,apricot,artichoke,...,white wine,whole wheat,wild rice,wine,wok,yellow squash,yogurt,yuca,zucchini,rating_group
0,"Lentil, Apple, and Turkey Wrap",2.500,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,so-so
1,Boudin Blanc Terrine with Red Onion Confit,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
2,Potato and Fennel Soup Hodge,3.750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
4,Spinach Noodle Casserole,3.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,so-so
5,The Best Blts,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20046,Chinese Barbecued Spareribs,3.750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
20048,Artichoke and Parmesan Risotto,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
20049,Turkey Cream Puff Pie,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
20050,Snapper on Angel Hair with Citrus Cream,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great


In [75]:
X = df1.drop(columns=['rating', 'rating_group'], axis=1)
y = df1['rating_group']
X = X.set_index('title') 
X

Unnamed: 0_level_0,almond,amaretto,anchovy,anise,apple,apple juice,apricot,artichoke,arugula,asian pear,...,whiskey,white wine,whole wheat,wild rice,wine,wok,yellow squash,yogurt,yuca,zucchini
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Lentil, Apple, and Turkey Wrap",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Boudin Blanc Terrine with Red Onion Confit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Potato and Fennel Soup Hodge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spinach Noodle Casserole,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Best Blts,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Chinese Barbecued Spareribs,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Artichoke and Parmesan Risotto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Turkey Cream Puff Pie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Snapper on Angel Hair with Citrus Cream,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                       test_size=0.2, random_state=21, stratify=y)

##### SVC

In [36]:
svc_params = {'kernel': ['linear', 'rbf', 'sigmoid'],
              'C':[0.1, 1, 1.5, 5],
            'gamma': ['scale', 'auto'],
            'class_weight':['balanced', None]
}

In [39]:
svc = sklearn.svm.SVC(probability=True,random_state=21)

In [41]:
# svc_grid = GridSearchCV(svc, svc_params, cv=5, n_jobs=-1, scoring='accuracy')

# svc_grid.fit(X_train, y_train)

In [None]:
# svc_grid.best_params_

In [42]:
svc = sklearn.svm.SVC(probability=True, C=1,gamma='auto', class_weight='balanced',random_state=21)

In [43]:
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_test)

In [44]:
accuracy_score(y_test, svc_pred)

0.3831608990367463

##### DecisionTree

In [45]:
tr_params = {'max_depth': range(1, 31, 2),
           'criterion': ['entropy', 'gini'],
            'class_weight':['balanced', None]
}

In [46]:
tr = DecisionTreeClassifier(random_state=21) 

In [47]:
tr_grid = GridSearchCV(tr, tr_params, n_jobs=-1, cv=5, scoring='accuracy') 

In [48]:
tr = tr_grid.fit(X_train, y_train)

In [49]:
tr_grid.best_params_

{'class_weight': None, 'criterion': 'gini', 'max_depth': 7}

In [50]:
tr = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7, random_state=21) 

In [51]:
tr.fit(X_train, y_train)
tr_pred = tr.predict(X_test)

In [52]:
accuracy_score(y_test, tr_pred)

0.8148412415269354

##### RandomForest

In [53]:
rf_params = {'max_depth': range(1, 31),
             'n_estimators': [80, 100, 120],
             'criterion': ['entropy', 'gini'],
             'class_weight':['balanced', None]
}

In [54]:
rf = RandomForestClassifier(random_state=21, n_jobs=-1)

In [55]:
rf_grid = GridSearchCV(rf, rf_params, n_jobs=-1, cv=5, scoring='accuracy')

In [56]:
rf_grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1, random_state=21),
             n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['entropy', 'gini'],
                         'max_depth': range(1, 31),
                         'n_estimators': [80, 100, 120]},
             scoring='accuracy')

In [57]:
rf_grid.best_params_

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': 29,
 'n_estimators': 100}

In [77]:
rf2 = RandomForestClassifier(random_state=21)

In [80]:
rf2.fit(X_train, y_train)
rf2_pred = rf2.predict(X_train)
accuracy_score(y_train, rf2_pred)

0.9779700321084552

In [81]:
rf2_pred = rf2.predict(X_test)
accuracy_score(y_test, rf2_pred)

0.7895112379593293

In [72]:
rf1 = RandomForestClassifier(class_weight=None, criterion= 'gini',
                            max_depth=29,
                            n_estimators=100,
                            random_state=21, n_jobs=-1)

In [73]:
rf1.fit(X_train, y_train)
rf1_pred = rf1.predict(X_test)

In [74]:
accuracy_score(y_test, rf1_pred)

0.8180520870495898

##### Лучшая модель


RandomForest

accuracy наивного классификатора

In [61]:
naive = pd.Series('great' for i in range(len(rf_pred)))

In [62]:
accuracy_score(y_test, naive)

0.8102033535497681

#### Other metrics

Лучше спрогнозировать плохой рейтинг, который на самом деле окажется хорошим, чем спрогнозировать хороший рейтинг, который на самом деле окажется плохим.

Заменим метрику accuracy на precision

In [64]:
precision_score(y_test, rf_pred, average="weighted")

0.7276171830600905

#### Ensembles

Для прогнозирования классов с precision попробуем различные алгоритмы, ансамбли и их гиперпараметры.

In [65]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
svc = sklearn.svm.SVC(probability=True,random_state=21)
tr = DecisionTreeClassifier(random_state=21)
rf = RandomForestClassifier(random_state=21)

In [70]:
classifiers = {
    "Voting": VotingClassifier(estimators=[('svc', svc), ('tr', tr), ('rf', rf)], voting='hard',weights = [3, 1, 2]),
    "Bagging": BaggingClassifier(base_estimator = rf, 
                      random_state = 21)
}

In [71]:
nfast = 14      # Don't run the very slow ones
head = list(classifiers.items())[:nfast]

for name, classifier in head:
    start = time()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    precision = precision_score(y_test, y_pred, average="weighted")
    print("{:<15} | precision = {:.3f}".format(name, precision))

Voting          | precision = 0.707
Bagging         | precision = 0.731


сохраним модель

In [75]:
joblib.dump(rf1, 'best_model')

['best_model']

In [76]:
check = joblib.load('best_model')

In [77]:
y_pred = check.predict(X_test)

In [78]:
accuracy_score(y_test, y_pred)

0.8180520870495898

In [79]:
precision_score(y_test, y_pred, average="weighted")

0.7276171830600905

### Пищевая ценность

In [24]:
ing = df1.drop(columns=['title','rating', 'rating_group'])

Соберем в датафрейм всю информацию о пищевой ценности продуктов из подготовленного нами и отфильтрованного набора данных (только столбцы с продуктами)

In [25]:
nutrients_data = pd.DataFrame(columns = ['ingredient', 'nutrientName', 'value', 'unitName'])

for ingredient in ing.columns:
    apiKey = 'FuTBF1NTk2GxtiEKjz53VecxMOhzEPqYcEvhwP4G'
    url = f'https://api.nal.usda.gov/fdc/v1/foods/search?api_key={apiKey}'
    response = requests.get(url, params = {'query': ingredient})
    json = response.json()
    
    try:
        info = json['foods'][0]['foodNutrients']
        for i in info:
            nutrients_data = nutrients_data.append({'ingredient':ingredient, 
                                                    'nutrientName': i.get('nutrientName'), 
                                                    'value': i.get('value'), 
                                                    'unitName':i.get('unitName')},
                                                  ignore_index=True)
    except IndexError:
        pass
    continue

In [26]:
nutrients_data

Unnamed: 0,ingredient,nutrientName,value,unitName
0,almond,Protein,21.0,G
1,almond,Total lipid (fat),55.5,G
2,almond,"Carbohydrate, by difference",18.8,G
3,almond,Energy,614.0,KCAL
4,almond,"Alcohol, ethyl",0.0,G
...,...,...,...,...
12883,zucchini,"Vitamin C, total ascorbic acid",12.6,MG
12884,zucchini,"Sugars, added",0.0,G
12885,zucchini,Cholesterol,0.0,MG
12886,zucchini,"Fatty acids, total trans",0.0,G


 Сохраним только те ингридиенты, которые находятся в этих таблицах.

In [28]:
rdi1 = tabula.read_pdf("Daily-Reference-Values-_DRVs_-under-the-New-NFL.pdf", pages='all')
rdi2 = tabula.read_pdf("Reference-Daily-Intakes-_RDIs_-in-the-New-Nutrition-Facts-Label.pdf", pages='all')

In [29]:
rdi1 = pd.DataFrame(rdi1[0])
rdi1.columns =rdi1.iloc[0]
rdi1.drop(labels=0, axis=0, inplace=True)
rdi1.reset_index(drop=True, inplace=True)
rdi1

Unnamed: 0,Food\rComponent,Unit of measure,Adults and Children ? 4\ryears,Infants through 12\rmonths,Children 1 through 3\ryears,Pregnant women and\rlactating women
0,Fat,Grams (g),178,30.0,239,178.0
1,Saturated fat,Grams (g),120,,210,120.0
2,Cholesterol,Milligrams (mg),300,,300,300.0
3,Total\rcarbohydrates,Grams (g),1275,95.0,2150,1275.0
4,Sodium,Milligrams (mg),2300,,1500,2300.0
5,Dietary Fiber,Grams (g),128,,214,128.0
6,Protein,Grams (g),150,,213,
7,Added sugars,Grams (g),150,,225,150.0


In [30]:
rdi2 = pd.DataFrame(rdi2[0])
rdi2.columns = rdi1.columns
rdi2.drop(labels=0, axis=0, inplace=True)
rdi2.reset_index(drop=True, inplace=True)
rdi2

Unnamed: 0,Food\rComponent,Unit of measure,Adults and Children ? 4\ryears,Infants through 12\rmonths,Children 1 through 3\ryears,Pregnant women and\rlactating women
0,Vitamin A,Micrograms RAE2 (mcg),900.0,500.0,300.0,1300.0
1,Vitamin C,Milligrams (mg),90.0,50.0,15.0,120.0
2,Calcium,Milligrams (mg),1300.0,260.0,700.0,1300.0
3,Iron,Milligrams (mg),18.0,11.0,7.0,27.0
4,Vitamin D,Micrograms (mcg)3,20.0,10.0,15.0,15.0
5,Vitamin E,Milligrams (mg)4,15.0,5.0,6.0,19.0
6,Vitamin K,Micrograms (mcg),120.0,2.5,30.0,90.0
7,Thiamin,Milligrams (mg),1.2,0.3,0.5,1.4
8,Riboflavin,Milligrams (mg),1.3,0.4,0.5,1.6
9,Niacin,Milligrams NE5 (mg),16.0,4.0,6.0,18.0


In [31]:
rdi = pd.concat([rdi1, rdi2])

In [32]:
rdi.reset_index(drop=True, inplace=True)
rdi

Unnamed: 0,Food\rComponent,Unit of measure,Adults and Children ? 4\ryears,Infants through 12\rmonths,Children 1 through 3\ryears,Pregnant women and\rlactating women
0,Fat,Grams (g),178.0,30.0,239.0,178.0
1,Saturated fat,Grams (g),120.0,,210.0,120.0
2,Cholesterol,Milligrams (mg),300.0,,300.0,300.0
3,Total\rcarbohydrates,Grams (g),1275.0,95.0,2150.0,1275.0
4,Sodium,Milligrams (mg),2300.0,,1500.0,2300.0
5,Dietary Fiber,Grams (g),128.0,,214.0,128.0
6,Protein,Grams (g),150.0,,213.0,
7,Added sugars,Grams (g),150.0,,225.0,150.0
8,Vitamin A,Micrograms RAE2 (mcg),900.0,500.0,300.0,1300.0
9,Vitamin C,Milligrams (mg),90.0,50.0,15.0,120.0


In [33]:
n_list = nutrients_data['nutrientName'].unique()
rdi_list = rdi[['Food\rComponent', 'Adults and Children ? 4\ryears']]
nut_df = pd.DataFrame(columns = ['nutrient', 'nutrien_api', 'day_value'])

for i,k in rdi_list.iterrows():
    for j in n_list:
        if k[0] in str(j):
             nut_df = nut_df.append({'nutrient':k[0],
                                    'nutrien_api':str(j),
                                    'day_value': k[1]},
                                   ignore_index=True)
        elif "Vitamin B" in k[0] and "Vitamin B" in str(j):
            nut_df = nut_df.append({'nutrient':k[0],
                                    'nutrien_api':str(j),
                                   'day_value': k[1]},
                                   ignore_index=True)

In [34]:
nut_df

Unnamed: 0,nutrient,nutrien_api,day_value
0,Fat,"Fatty acids, total saturated",178.0
1,Fat,"Fatty acids, total monounsaturated",178.0
2,Fat,"Fatty acids, total polyunsaturated",178.0
3,Fat,"Fatty acids, total trans",178.0
4,Fat,"Fatty acids, total trans-monoenoic",178.0
5,Fat,"Fatty acids, total trans-polyenoic",178.0
6,Cholesterol,Cholesterol,300.0
7,Sodium,"Sodium, Na",2300.0
8,Protein,Protein,150.0
9,Vitamin A,"Vitamin A, RAE",900.0


подкорректируем fat

In [35]:
nut_df.loc[0]['nutrient'] = 'Saturated fat'
nut_df.loc[0]['day_value'] = 120
nut_df.loc[1]['nutrien_api'] = 'Total lipid (fat)'

удалим лишние строки

In [36]:
nut_df.drop([2,3,4,5, 10, 15,16,17,19, 21, 22, 27, 28, 29,31], inplace=True)

In [37]:
nut_df

Unnamed: 0,nutrient,nutrien_api,day_value
0,Saturated fat,"Fatty acids, total saturated",120.0
1,Fat,Total lipid (fat),178.0
6,Cholesterol,Cholesterol,300.0
7,Sodium,"Sodium, Na",2300.0
8,Protein,Protein,150.0
9,Vitamin A,"Vitamin A, RAE",900.0
11,Vitamin C,"Vitamin C, total ascorbic acid",90.0
12,Calcium,"Calcium, Ca",1300.0
13,Iron,"Iron, Fe",18.0
14,Vitamin D,Vitamin D (D2 + D3),20.0


In [38]:
list(nut_df['nutrien_api'])

['Fatty acids, total saturated',
 'Total lipid (fat)',
 'Cholesterol',
 'Sodium, Na',
 'Protein',
 'Vitamin A, RAE',
 'Vitamin C, total ascorbic acid',
 'Calcium, Ca',
 'Iron, Fe',
 'Vitamin D (D2 + D3)',
 'Vitamin E (alpha-tocopherol)',
 'Vitamin K (phylloquinone)',
 'Thiamin',
 'Riboflavin',
 'Niacin',
 'Vitamin B-6',
 'Vitamin B-12',
 'Biotin',
 'Pantothenic acid',
 'Phosphorus, P',
 'Magnesium, Mg',
 'Zinc, Zn',
 'Selenium, Se',
 'Copper, Cu',
 'Manganese, Mn',
 'Potassium, K',
 'Choline, total']

In [39]:
nutrients_data = nutrients_data.loc[nutrients_data['nutrientName'].isin(list(nut_df['nutrien_api']))]

In [40]:
nutrients_data

Unnamed: 0,ingredient,nutrientName,value,unitName
0,almond,Protein,21.00,G
1,almond,Total lipid (fat),55.50,G
10,almond,"Calcium, Ca",347.00,MG
11,almond,"Iron, Fe",3.49,MG
12,almond,"Magnesium, Mg",279.00,MG
...,...,...,...,...
12880,zucchini,"Potassium, K",222.00,MG
12881,zucchini,"Sodium, Na",0.00,MG
12883,zucchini,"Vitamin C, total ascorbic acid",12.60,MG
12885,zucchini,Cholesterol,0.00,MG


Конвертируем все значения в % от суточной нормы потребления.

In [41]:
nut_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27 entries, 0 to 41
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   nutrient     27 non-null     object
 1   nutrien_api  27 non-null     object
 2   day_value    27 non-null     object
dtypes: object(3)
memory usage: 864.0+ bytes


In [42]:
nut_df['day_value'] = nut_df['day_value'].apply(lambda x: float(str(x).replace(',','')))

In [43]:
nutrients_data.value = nutrients_data['value'].apply(lambda x: round(x / nut_df['day_value'] * 100, 2))
nutrients_data

Unnamed: 0,ingredient,nutrientName,value,unitName
0,almond,Protein,17.50,G
1,almond,Total lipid (fat),46.25,G
10,almond,"Calcium, Ca",289.17,MG
11,almond,"Iron, Fe",2.91,MG
12,almond,"Magnesium, Mg",232.50,MG
...,...,...,...,...
12880,zucchini,"Potassium, K",185.00,MG
12881,zucchini,"Sodium, Na",0.00,MG
12883,zucchini,"Vitamin C, total ascorbic acid",10.50,MG
12885,zucchini,Cholesterol,0.00,MG


Сохраним измененный датафрейм в CSV-файл, который мы будем использовать в своей основной программе.

In [45]:
nutrients_data.to_csv('Nutrients.csv')

### Похожие рецепты


Для каждого рецепта из набора данных найдем ссылку на сайте epicurious.com и подробную информацию о нем (название рецепта, рейтинг на платформе и URl). 

Для начала проверим все ли названия рецептов написаны корректно

In [20]:
rec_df

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium
0,"[1. Place the stock, lentils, celery, carrot, ...",7.0,2006-09-01 04:00:00+00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",426.0,,30.0,2.500,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",559.0
1,[Combine first 9 ingredients in heavy medium s...,23.0,2004-08-20 04:00:00+00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",403.0,This uses the same ingredients found in boudin...,18.0,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",1439.0
2,[In a large heavy saucepan cook diced fennel a...,7.0,2004-08-20 04:00:00+00:00,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",165.0,,6.0,3.750,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",165.0
3,[Heat oil in heavy large skillet over medium-h...,,2009-03-27 04:00:00+00:00,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",,The Sicilian-style tomato sauce has tons of Me...,,5.000,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",
4,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,2004-08-20 04:00:00+00:00,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",547.0,,20.0,3.125,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",452.0
...,...,...,...,...,...,...,...,...,...,...,...
20125,[Beat whites in a bowl with an electric mixer ...,2.0,2004-08-20 04:00:00+00:00,"[Mixer, Cheese, Egg, Fry, Cocktail Party, Parm...",28.0,,2.0,3.125,Parmesan Puffs,"[2 large egg whites, 3 oz Parmigiano-Reggiano,...",64.0
20126,[Bring broth to simmer in saucepan.Remove from...,28.0,2008-02-28 22:06:54+00:00,"[Side, Kid-Friendly, High Fiber, Dinner, Parme...",671.0,Cooking the artichokes with the rice infuses t...,22.0,4.375,Artichoke and Parmesan Risotto,"[5 1/2 cups (or more) low-salt chicken broth, ...",583.0
20127,"[Using a sharp knife, cut a shallow X in botto...",38.0,2005-10-21 18:21:20+00:00,"[Onion, Poultry, turkey, Vegetable, Bake, Kid-...",563.0,,31.0,4.375,Turkey Cream Puff Pie,"[1 small tomato, 1 small onion, finely chopped...",652.0
20128,[Heat 2 tablespoons oil in heavy medium skille...,24.0,2004-08-20 04:00:00+00:00,"[Milk/Cream, Citrus, Dairy, Fish, Garlic, Past...",631.0,"Sharon Hooykaas of Los Alamitos, California, w...",45.0,4.375,Snapper on Angel Hair with Citrus Cream,"[4 tablespoons olive oil, 4 shallots, thinly s...",517.0


In [21]:
rec_df.drop_duplicates(subset='title', keep='first', inplace=True)

In [22]:
set(df1.title).difference(set(rec_df.title))

{'Barbecue Pork Kebabs With\r\n                Blistered-Chile–Pumpkin Seed Salsa ',
 'Coconut-Marinated Short Rib Kebabs\r\n                With Peanut-Chile Oil ',
 'Lamb Chops Scottadito\r\n                With Charred Cherry Tomatoes ',
 'Patatine e Carciofi Arrosto\r\n                roasted Potatoes and Artichokes '}

видим разрывы строк в некоторых рецептах

In [23]:
df1[df1['title'].str.contains("Coconut-Marinated Short Rib Kebab")]

Unnamed: 0,title,rating,almond,amaretto,anchovy,anise,apple,apple juice,apricot,artichoke,...,white wine,whole wheat,wild rice,wine,wok,yellow squash,yogurt,yuca,zucchini,rating_group
1368,Coconut-Marinated Short Rib Kebabs\r\n ...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great


In [24]:
recipe = 'Coconut-Marinated%20Short%20Rib%20Kebabs%20With%20Peanut-Chile%20Oil%20'

In [25]:
def pre_processing(title):
    recipe = title.replace('\r\n                ', '%20')
    recipe = recipe.rstrip()
    recipe = recipe.replace(' ', '%20')
    recipe = recipe.replace(',', '%2C')

    return recipe

In [26]:
pre_processing('Coconut-Marinated Short Rib Kebabs\r\n                With Peanut-Chile Oil ')

'Coconut-Marinated%20Short%20Rib%20Kebabs%20With%20Peanut-Chile%20Oil'

In [27]:
def get_url(recipe):
    url = f'https://www.epicurious.com/search/{recipe}?content=recipe'
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36", 
              "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"}            
    r = request.Request(url, headers=headers)
    soup = BeautifulSoup(request.urlopen(r).read(), 'html.parser')
    try:
        rating = soup.select('div.results-group article.recipe-content-card a.view-complete-item')
        rating = [i['href']for i in rating]
        return f'https://www.epicurious.com{rating[0]}'
    except (HTTPError, URLError):
        return []

In [28]:
get_url(recipe)

'https://www.epicurious.com/recipes/food/views/coconut-marinated-short-rib-kebabs-with-peanut-chile-oil'

In [29]:
def get_rating(recipe):
    url = f'https://www.epicurious.com/search/{recipe}?content=recipe'
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36", 
              "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"}            
    r = request.Request(url, headers=headers)
    soup = BeautifulSoup(request.urlopen(r).read(), 'html.parser')
    try:
        rating = soup.select('div.results-group article.recipe-content-card dl.recipes-ratings-summary')
        rating = [i['data-reviews-rating']for i in rating]
        return rating[0]
    except (HTTPError, URLError):
        return []

In [30]:
def pre_processing_for_google(title):
    recipe = title.replace('\r\n                ', '+')
    recipe = recipe.rstrip()
    recipe = recipe.replace(' ', '+')
    recipe = recipe.replace('-', '+')
    return recipe

In [31]:
def pre_processing_for_google(title):
    recipe = title.replace('\r\n                ', '+')
    recipe = recipe.rstrip()
    recipe = recipe.replace(' ', '+')
    recipe = recipe.replace(',', '%2C')
    recipe = recipe.encode('ascii', 'ignore').decode('ascii')
    recipe = recipe + '+recipe'
    return f"https://google.com/search?q={recipe}"

In [32]:
gathered_df = pd.DataFrame(df1['title'],columns=['title'])
gathered_df['rating'] = 0
gathered_df['url']=0

In [33]:
gathered_df.reset_index(drop=True, inplace=True)

In [39]:
result = pd.read_csv('result.csv',index_col='Unnamed: 0')
result

Unnamed: 0,title,url,rating,almond,amaretto,anchovy,anise,apple,apple juice,apricot,...,white wine,whole wheat,wild rice,wine,wok,yellow squash,yogurt,yuca,zucchini,rating_group
0,"Lentil, Apple, and Turkey Wrap",https://www.epicurious.com/recipes/food/views/...,2.500,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,so-so
1,Boudin Blanc Terrine with Red Onion Confit,https://www.epicurious.com/recipes/food/views/...,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
2,Potato and Fennel Soup Hodge,https://www.epicurious.com/recipes/food/views/...,3.750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
3,Spinach Noodle Casserole,https://www.epicurious.com/recipes/food/views/...,3.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,so-so
4,The Best Blts,https://www.epicurious.com/recipes/food/views/...,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14010,Chinese Barbecued Spareribs,0,3.750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
14011,Artichoke and Parmesan Risotto,0,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
14012,Turkey Cream Puff Pie,0,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
14013,Snapper on Angel Hair with Citrus Cream,0,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great


In [49]:
for i in range(14000, 14015):
    recipe = result.title[i]
    recipe = pre_processing(recipe)
    try:
        result.loc[i,'url'] = get_url(recipe)
    except:
        result.loc[i,'url'] = pre_processing_for_google(recipe)

Сохраним новый датафрейм в CSV-файл, который мы будем использовать в своей основной программе.

In [55]:
result.to_csv('result.csv',index=False)

In [56]:
pd.read_csv('result.csv')

Unnamed: 0,title,url,rating,almond,amaretto,anchovy,anise,apple,apple juice,apricot,...,white wine,whole wheat,wild rice,wine,wok,yellow squash,yogurt,yuca,zucchini,rating_group
0,"Lentil, Apple, and Turkey Wrap",https://www.epicurious.com/recipes/food/views/...,2.500,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,so-so
1,Boudin Blanc Terrine with Red Onion Confit,https://www.epicurious.com/recipes/food/views/...,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
2,Potato and Fennel Soup Hodge,https://www.epicurious.com/recipes/food/views/...,3.750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
3,Spinach Noodle Casserole,https://www.epicurious.com/recipes/food/views/...,3.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,so-so
4,The Best Blts,https://www.epicurious.com/recipes/food/views/...,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14010,Chinese Barbecued Spareribs,https://www.epicurious.com/recipes/food/views/...,3.750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
14011,Artichoke and Parmesan Risotto,https://www.epicurious.com/recipes/food/views/...,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
14012,Turkey Cream Puff Pie,https://www.epicurious.com/recipes/food/views/...,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
14013,Snapper on Angel Hair with Citrus Cream,https://www.epicurious.com/recipes/food/views/...,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,great
