<span style='color: #021497; font-size: 3em'><b>I. Предварительная настройка</b></span>

## Подключение необходимых библиотек

In [1]:
#Библиотеки для работы с парсингом сайта
import requests
from io import StringIO
import json

#Библиотеки для работы с данными
import pandas as pd
import numpy as np
import math
import statistics
import warnings
from tqdm.notebook import tqdm
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

#Библиотеки для работы с моделями ML
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, BaggingRegressor, StackingRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, make_scorer, root_mean_squared_error
from sklearn.base import is_classifier, is_regressor

## Настройка подключения к `FoodData Central`

In [2]:
proxy_ip = "191.102.181.40"
proxy_port = "9584"
proxy_user = "CK3LWQ"
proxy_pass = "c0wg7U"

proxy_url = f"http://{proxy_user}:{proxy_pass}@{proxy_ip}:{proxy_port}"
proxies = {
    "http": proxy_url,
    "https": proxy_url,
}

api_key = 'O6EQ2mEKz27UlgihujBjt1RqJRc8ZteNiNLWBdQT'

<span style='color: #021497; font-size: 3em'><b>II. Создание модели прогнозирования рейтинга блюда исходя из его ингредиентов</b></span>

<span style='color: #BD2467; font-size: 3em;'><b>1. Подготовка данных</b></span>

<span style='color: #03C568; font-size: 2em;'><b>1.1. Считаем данные из файла `epic_r.csv`</b></span>

In [3]:
input_file_epi_r = 'epi_r.csv'
df = pd.read_csv(input_file_epi_r)
display(df.info())
display(df.head(2))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20052 entries, 0 to 20051
Columns: 680 entries, title to turkey
dtypes: float64(679), object(1)
memory usage: 104.0+ MB


None

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<span style='color: #03C568; font-size: 2em;'><b>1.2. Оставляем только столбцы с ингредиентами</b></span>

## Сократим названия колонок, где используется символ `/` и слово `or`

In [4]:
for column in df.columns:
    if '/' in column:
        new_name_column = column.split('/')[0]
        df.rename(columns={column: new_name_column}, inplace=True)
    if ' or ' in column:
        new_name_column = column.split(' or ')[0]
        df.rename(columns={column: new_name_column}, inplace=True)

## Теперь найдём все названия колонок, которые являются ингридиентами в `FoodData Central`

In [5]:
list_ingredients = []
session = requests.Session()
my_params = {
    "api_key": api_key,
    "query": ''
}
for column in tqdm(df.columns):
    if column in ['avocado', 'chile pepper', 'wok']:
        list_ingredients.append(column)
        continue
    my_params['query'] = column
    my_url = f'https://api.nal.usda.gov/fdc/v1/foods/search'
    response = session.get(url=my_url, proxies=proxies, params=my_params, timeout=10)
    json_tmp = response.json()
    if 'foods' in json_tmp:
        for food in json_tmp['foods']:
            if 'description' in food:
                if column.lower() == food['description'].lower():
                    list_ingredients.append(column)
                    break
list_ingredients = list(set(list_ingredients))

  0%|          | 0/680 [00:00<?, ?it/s]

## Удаляем все лишние столбцы

In [6]:
for column in df.columns:
    if column not in list_ingredients and column != 'rating' and column in df.columns:
        df.drop(columns=column, inplace=True)

<span style='color: #BA3F0D; font-size: 2em; background: #EAF785;'><b>Сделаем проверку</b></span>

In [7]:
df.info()
display(df.head(2))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20052 entries, 0 to 20051
Columns: 241 entries, rating to turkey
dtypes: float64(241)
memory usage: 36.9 MB


Unnamed: 0,rating,apple,apple juice,apricot,artichoke,arugula,asparagus,avocado,bacon,banana,...,whiskey,whole wheat,wild rice,wok,yellow squash,yogurt,yuca,zucchini,snack,turkey
0,2.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
columns_name_for_check = ['title', 'leftovers', 'california', 'dominican republic', 'protein', 'low cholesterol',
                         'pan-fry', 'fruit', 'stew', 'braise', 'milk', 'jam']
for column in columns_name_for_check:
    if column in list(df.columns):
        print(f"'{column}' in list(df.columns)")
    else:
        print(f"'{column}' not in list(df.columns)")

'title' not in list(df.columns)
'leftovers' not in list(df.columns)
'california' not in list(df.columns)
'dominican republic' not in list(df.columns)
'protein' not in list(df.columns)
'low cholesterol' not in list(df.columns)
'pan-fry' not in list(df.columns)
'fruit' not in list(df.columns)
'stew' not in list(df.columns)
'braise' not in list(df.columns)
'milk' in list(df.columns)
'jam' in list(df.columns)


<span style='color: #03C568; font-size: 2em;'><b>1.3. Реализовать это можно также используя Pipeline</b></span>

In [9]:
class ColumnCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        for column in X.columns:
            if '/' in column:
                X.rename(columns={column: column.split('/')[0]}, inplace=True)
            if ' or ' in column:
                X.rename(columns={column: column.split(' or ')[0]}, inplace=True)
        return X

class IngredientFilter(BaseEstimator, TransformerMixin):
    def __init__(self, api_key, proxies=None):
        self.api_key = api_key
        self.proxies = proxies
        self.session = requests.Session()
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        list_ingredients = []
        my_params = {"api_key": self.api_key, "query": ''}
        for column in tqdm(X.columns):
            if column in ['avocado', 'chile pepper', 'wok']:
                list_ingredients.append(column)
                continue
            my_params['query'] = column
            my_url = 'https://api.nal.usda.gov/fdc/v1/foods/search'
            try:
                response = self.session.get(url=my_url, proxies=self.proxies, params=my_params, timeout=10)
                json_tmp = response.json()
                if 'foods' in json_tmp:
                    for food in json_tmp['foods']:
                        if 'description' in food:
                            if column.lower() == food['description'].lower():
                                list_ingredients.append(column)
                                break
            except Exception:
                continue
        self.list_ingredients_ = list(set(list_ingredients))
        return X[self.list_ingredients_ + ['rating']]

df_input = pd.read_csv(input_file_epi_r)
pipeline = Pipeline([
    ('clean_columns', ColumnCleaner()),
    ('filter_ingredients', IngredientFilter(api_key=api_key, proxies=proxies))
])

df_pipeline = pipeline.fit_transform(df_input)
display(df_pipeline)

  0%|          | 0/680 [00:00<?, ?it/s]

Unnamed: 0,broccoli,margarita,sausage,tarragon,sesame oil,beef tenderloin,cantaloupe,caraway,fruit juice,onion,...,quiche,tapioca,cookie,buffalo,octopus,tomato,salad dressing,salad,milk,rating
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.500
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.375
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.750
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20047,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.125
20048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.375
20049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.375
20050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.375


<span style='color: #BD2467; font-size: 3em;'><b>2. Модели регрессии</b></span>

##  Разделим наши данные на обучающую выборку и тестовую

In [10]:
X = df.drop(columns=['rating'])
X = X.fillna(0)
y = df['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## Напишем функцию для поиска лучших гиперпараметров при перекрёстной проверки

In [11]:
def crossval_regressor(n_splits, X, y, model, params, scoring):
    gs_model = GridSearchCV(estimator=model, param_grid=params, scoring=scoring, cv=n_splits, n_jobs=-1)
    gs_model.fit(X, y.values.ravel())
    y_predict = gs_model.predict(X)
    rmse = root_mean_squared_error(y.values, y_predict)
    return gs_model, rmse

## Создадим DataFrame для сбора статистики наших моделей

In [12]:
df_regression = pd.DataFrame(columns=['model', 'rmse'])

<span style='color: #03C568; font-size: 2em;'><b>2.1. Рассмотрим различные алгоритмы регрессии и протестируем разные параметры для них</b></span>

## Линейная регрессия / Linear Regression

In [13]:
linear_regression_params = {
    'fit_intercept': [True, False]
}
model_linear_regression = LinearRegression()
model_linear_regression, rmse_m_l_r = crossval_regressor(10, X_train, y_train, model_linear_regression, linear_regression_params, 'r2')
df_regression.loc[len(df_regression)] = ['Linear Regression', rmse_m_l_r]

## Регрессия решающего дерева / Decision Tree Regressor

In [14]:
decision_tree_regressor_params = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'ccp_alpha': [0.0, 0.01, 0.1]
}
model_decision_tree_regressor = DecisionTreeRegressor()
model_decision_tree_regressor, rmse_d_t_r = crossval_regressor(10, X_train, y_train, model_decision_tree_regressor, decision_tree_regressor_params, 'r2')
df_regression.loc[len(df_regression)] = ['Decision Tree Regressor', rmse_d_t_r]

## Регрессия рандомного леса / RandomForestRegressor

In [15]:
random_forest_regressor_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10]          
}
model_random_forest_regressor = RandomForestRegressor()
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    model_random_forest_regressor, rmse_r_f_r = crossval_regressor(10, X_train, y_train, 
                                                                   model_random_forest_regressor, random_forest_regressor_params, 'r2')
df_regression.loc[len(df_regression)] = ['Random Forest Regressor', rmse_r_f_r]

## Подведём итоги:

In [16]:
df_regression = df_regression.sort_values(by=['rmse'])
display(df_regression)

Unnamed: 0,model,rmse
2,Random Forest Regressor,1.183183
0,Linear Regression,1.244967
1,Decision Tree Regressor,1.252544


## Лучшая регрессионная модель `Random Forest Regressor`. Найдём её RMSE на тестовой выборке.

In [17]:
y_predict = model_random_forest_regressor.predict(X_test)
rmse_best_model_regressor = root_mean_squared_error(y_test.values, y_predict)
print(rmse_best_model_regressor)

1.2845109405623631


<span style='color: #03C568; font-size: 2em;'><b>2.2. Рассмотрим различные ансамбли для алгоритмов регрессии и протестируем разные параметры для них</b></span>

## Voting Regressor

In [18]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    model_vot_regressor = VotingRegressor(estimators=[('lin_reg', model_linear_regression), ('dt', model_decision_tree_regressor),('rf', model_random_forest_regressor)], n_jobs=-1)
    model_vot_regressor.fit(X_train, y_train.values.ravel())
    y_predict_vot_regressor = model_vot_regressor.predict(X_train)
    rmse_vot_regressor = root_mean_squared_error(y_train.values, y_predict_vot_regressor)
    df_regression.loc[len(df_regression)] = ['Voting Regressor', rmse_vot_regressor]

## Bagging Regressor

In [19]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    model_bagging_regressor = BaggingRegressor(model_linear_regression, n_estimators=10, random_state=21, n_jobs=-1)
    model_bagging_regressor.fit(X_train, y_train.values.ravel())
    y_predict_baggingt_regressor = model_bagging_regressor.predict(X_train)
    rmse_bagging_regressor = root_mean_squared_error(y_train.values, y_predict_baggingt_regressor)
    df_regression.loc[len(df_regression)] = ['Bagging Regressor', rmse_bagging_regressor]

## Stacking Regressor

In [20]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    estimators=[('lin_reg', model_linear_regression), ('dt', model_decision_tree_regressor)]
    model_skacking_regressor = StackingRegressor(estimators=estimators, final_estimator=model_random_forest_regressor)
    model_skacking_regressor.fit(X_train, y_train.values.ravel())
    y_predict_skacking_regressor = model_skacking_regressor.predict(X_train)
    rmse_skacking_regressor = root_mean_squared_error(y_train.values, y_predict_skacking_regressor)
    df_regression.loc[len(df_regression)] = ['Bagging Regressor', rmse_skacking_regressor]

## Подведём итоги:

In [21]:
df_regression = df_regression.sort_values(by=['rmse'])
display(df_regression)

Unnamed: 0,model,rmse
2,Random Forest Regressor,1.183183
3,Voting Regressor,1.217217
5,Bagging Regressor,1.243658
0,Linear Regression,1.244967
4,Bagging Regressor,1.246511
1,Decision Tree Regressor,1.252544


## Лучший ансамбль для алгоритмов регрессии `VotingRegressor`. Найдём его RMSE на стестовой выборке.

In [22]:
y_predict = model_vot_regressor.predict(X_test)
rmse_best_ensemble_regressor = root_mean_squared_error(y_test.values, y_predict)
print(rmse_best_ensemble_regressor)

1.279615715370315


<span style='color: #03C568; font-size: 2em;'><b>2.3. Рассчитаем RMSE для наивного регрессора, который предсказывает средний рейтинг.</b></span>

In [23]:
y_predict = [statistics.mean(y_test)] * len(y_test)
rmse_naive_regressor = root_mean_squared_error(y_test.values, y_predict)
print(rmse_naive_regressor)

1.3406047250383304


<span style='color: #BD2467; font-size: 3em;'><b>3. Модели классификации</b></span>

<span style='color: #03C568; font-size: 2em;'><b>3.1. Бинаризируем целевую колонку окгрулив значения до ближайщего целого.</b></span>

In [24]:
y2 = pd.DataFrame([int(math.floor(el + 0.5)) for el in y])
y2_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
y2 = pd.DataFrame(y2_encoder.fit_transform(y2), columns=['0', '1', '2', '3', '4', '5'])
display(y2)
X_train, X_test, y_train, y_test = train_test_split(X, y2, test_size=0.2, random_state=21, stratify=y)

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...
20047,0.0,0.0,0.0,1.0,0.0,0.0
20048,0.0,0.0,0.0,0.0,1.0,0.0
20049,0.0,0.0,0.0,0.0,1.0,0.0
20050,0.0,0.0,0.0,0.0,1.0,0.0


<span style='color: #03C568; font-size: 2em;'><b>3.2. Рассмотрим различные алгоритмы классификации и протестируем разные параметры для них</b></span>

## Метод опорных векторов / SVM

In [25]:
svm_params = {
    'kernel': ['linear', 'rbf']
}
y_train_labels = np.argmax(y_train.values, axis=1)
model_svm = SVC()
model_svm = GridSearchCV(estimator=model_svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=-1)
model_svm.fit(X_train, y_train_labels)
y_predict_svm = model_svm.predict(X_train)
accuracy_svm = accuracy_score(y_train_labels, y_predict_svm)
print(f'SVM (accuracy) = {accuracy_svm}')

SVM (accuracy) = 0.7040084782744218


## Decision Tree Classifier / Классификатор решающего дерева

In [26]:
decision_tree_grid = {
    'max_depth': [i for i in range(1, 10)],
    'class_weight' : ['balanced', None],
    'criterion' : ['entropy', 'gini'],
    'random_state': [21]
}

model_decision_tree = DecisionTreeClassifier()
model_decision_tree = GridSearchCV(estimator=model_decision_tree, param_grid=decision_tree_grid, scoring='accuracy', cv=2, n_jobs=-1)
model_decision_tree.fit(X_train, y_train_labels)
y_predict_decision_tree = model_decision_tree.predict(X_train)
accuracy_decision_tree = accuracy_score(y_train_labels, y_predict_decision_tree)
print(f'Decision Tree (accuracy) = {accuracy_decision_tree}')

Decision Tree (accuracy) = 0.6744591983043451


## Random Forest Classifier / Классификатор случайного леса

In [27]:
decision_random_forest = {
    'n_estimators': [5, 10, 50],
    'max_depth': [i for i in range(1, 10)],
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
    'random_state': [21]
}

model_random_forest = RandomForestClassifier()
model_random_forest = GridSearchCV(estimator=model_random_forest, param_grid=decision_random_forest, scoring='accuracy', cv=2, n_jobs=-1)
model_random_forest.fit(X_train, y_train_labels)
y_predict_random_forest = model_random_forest.predict(X_train)
accuracy_random_forest = accuracy_score(y_train_labels, y_predict_random_forest)
print(f'Decision Tree (accuracy) = {accuracy_random_forest}')

Decision Tree (accuracy) = 0.6657939031232467


## Лучшая модель классификатора - `SVM`

In [28]:
y_predict = model_svm.predict(X_test)
y_test_labels = np.argmax(y_test.values, axis=1)
accuracy_best = accuracy_score(y_test_labels, y_predict)
print(accuracy_best)

0.6741460982298678


<span style='color: #03C568; font-size: 2em;'><b>3.3. Рассчитаем accuracy для наивного классификатора, который предсказывает средний рейтинг.</b></span>

In [29]:
y_predict_naive = [statistics.mode(y_train_labels)] * len(y_train_labels)
accuracy_naive = accuracy_score(y_train_labels, y_predict_naive)
print(accuracy_naive)

0.6576896702200611


<span style='color: #03C568; font-size: 2em;'><b>3.4. Бинаризируем целевую колонку в классы ‘bad’, ‘so-so’, ‘great’</b></span>

In [30]:
y3 = [int(math.floor(el + 0.5)) for el in y]
y3 = pd.Series(['great' if el >=4 else ('so-so' if el >=2 else 'bad') for el in y3])
X_train, X_test, y_train, y_test = train_test_split(X, y3, test_size=0.2, random_state=21, stratify=y)

<span style='color: #03C568; font-size: 2em;'><b>3.5. Рассмотрим различные алгоритмы классификации и протестируем разные параметры для них</b></span>

## Метод опорных векторов / SVM

In [31]:
svm_params = {
    'kernel': ['linear', 'rbf']
}
model_svm = SVC()
model_svm = GridSearchCV(estimator=model_svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=-1)
model_svm.fit(X_train, y_train)
y_predict_svm = model_svm.predict(X_train)
accuracy_svm = accuracy_score(y_train, y_predict_svm)
print(f'SVM (accuracy) = {accuracy_svm}')

SVM (accuracy) = 0.8172807181597157


## Decision Tree Classifier / Классификатор решающего дерева

In [32]:
decision_tree_grid = {
    'max_depth': [i for i in range(1, 10)],
    'class_weight' : ['balanced', None],
    'criterion' : ['entropy', 'gini'],
    'random_state': [21]
}

model_decision_tree = DecisionTreeClassifier()
model_decision_tree = GridSearchCV(estimator=model_decision_tree, param_grid=decision_tree_grid, scoring='accuracy', cv=2, n_jobs=-1)
model_decision_tree.fit(X_train, y_train)
y_predict_decision_tree = model_decision_tree.predict(X_train)
accuracy_decision_tree = accuracy_score(y_train, y_predict_decision_tree)
print(f'Decision Tree (accuracy) = {accuracy_decision_tree}')

Decision Tree (accuracy) = 0.8000124680506203


## Random Forest Classifier / Классификатор случайного леса

In [33]:
decision_random_forest = {
    'n_estimators': [5, 10, 50],
    'max_depth': [i for i in range(1, 10)],
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
    'random_state': [21]
}

model_random_forest = RandomForestClassifier()
model_random_forest = GridSearchCV(estimator=model_random_forest, param_grid=decision_random_forest, scoring='accuracy', cv=2, n_jobs=-1)
model_random_forest.fit(X_train, y_train)
y_predict_random_forest = model_random_forest.predict(X_train)
accuracy_random_forest = accuracy_score(y_train, y_predict_random_forest)
print(f'Random Forest (accuracy) = {accuracy_random_forest}')

Random Forest (accuracy) = 0.799389065519606


## Лучшая модель классификатора - SVM

In [34]:
y_predict = model_svm.predict(X_test)
accuracy_best = accuracy_score(y_test, y_predict)
print(accuracy_best)

0.8007978060334081


<span style='color: #03C568; font-size: 2em;'><b>3.6. Рассчитаем accuracy для наивного классификатора</b></span>

In [35]:
y_predict_naive = [statistics.mode(y3)] * len(y_test)
accuracy_naive = accuracy_score(y_test, y_predict_naive)
print(statistics.mode(y3))
print(accuracy_naive)

great
0.7933183744702069


<span style='color: #03C568; font-size: 2em;'><b>3.7. Ответим на вопрос что хуже: предсказать плохой рейтинг для блюда, которое хорошо в реальной жизни, или предсказать хороший рейтинг для блюда, которое плохое в реальной жизни? </b></span>

## Если мы оценим хорошее блюдо как плохое, то пользователи не попробуют хорошее блюдо. Это неприятно. Но если мы оценим плохое блюдо как хорошее, то пользователи его попробуют и будут сильно огорчены. И это важнее избежать. Поэтому мы заменим метрику `accuracy` на метрику `precision`, чтобы правильно предсказывать `great`

In [36]:
precision_great = make_scorer(precision_score, labels=['great'], average='macro')

<span style='color: #03C568; font-size: 2em;'><b>3.8. Рассмотрим различные алгоритмы классификации и протестируем разные параметры для них</b></span>

## Метод опорных векторов / SVM

In [37]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    svm_params = {
        'kernel': ['linear', 'rbf']
    }
    model_svm = SVC()
    model_svm = GridSearchCV(estimator=model_svm, param_grid=svm_params, scoring=precision_great, cv=2, n_jobs=-1)
    model_svm.fit(X_train, y_train)
    y_predict_svm = model_svm.predict(X_train)
    precision_svm = precision_score(y_train, y_predict_svm, labels=['great'], average='macro')
    print(f'SVM (precision) = {precision_svm}')

SVM (precision) = 0.8037519872813991


## Decision Tree Classifier / Классификатор решающего дерева

In [38]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    decision_tree_grid = {
        'max_depth': [i for i in range(1, 10)],
        'class_weight' : ['balanced', None],
        'criterion' : ['entropy', 'gini'],
        'random_state': [21]
    }
    
    model_decision_tree = DecisionTreeClassifier()
    model_decision_tree = GridSearchCV(estimator=model_decision_tree, param_grid=decision_tree_grid, scoring=precision_great, cv=2, n_jobs=-1)
    model_decision_tree.fit(X_train, y_train)
    y_predict_decision_tree = model_decision_tree.predict(X_train)
    precision_decision_tree = precision_score(y_train, y_predict_decision_tree, labels=['great'], average='macro')
    print(f'Decision Tree (precision) = {precision_decision_tree}')

Decision Tree (precision) = 0.8111491829204006


## Random Forest Classifier / Классификатор случайного леса

In [39]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    decision_random_forest = {
        'n_estimators': [5, 10, 50],
        'max_depth': [i for i in range(1, 10)],
        'class_weight': ['balanced', None],
        'criterion': ['entropy', 'gini'],
        'random_state': [21]
    }
    
    model_random_forest = RandomForestClassifier()
    model_random_forest = GridSearchCV(estimator=model_random_forest, param_grid=decision_random_forest, scoring=precision_great, cv=2, n_jobs=-1)
    model_random_forest.fit(X_train, y_train)
    y_predict_random_forest = model_random_forest.predict(X_train)
    precision_random_forest = precision_score(y_train, y_predict_random_forest, labels=['great'], average='macro')
    print(f'Random Forest (precision) = {precision_random_forest}')

Random Forest (precision) = 0.8402625820568927


## Лучшая модель классификатора оказалась `Random Forest Classifier`

In [40]:
y_predict = model_random_forest.predict(X_test)
precision_best = precision_score(y_test, y_predict, labels=['great'], average='macro')
print(precision_best)

0.8356807511737089


<span style='color: #03C568; font-size: 2em;'><b>3.9. Рассмотрим различные ансамбли для алгоритмов классификации и протестируем разные параметры для них</b></span>

## VotingClassifier

In [41]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    model_voting_clf = VotingClassifier(
        estimators=[('model1', model_svm), ('model2', model_decision_tree), ('model3', model_random_forest)],
        voting='hard'
    )
    model_voting_clf.fit(X_train, y_train)
    y_pred_voting = model_voting_clf.predict(X_train)
    precision_voting = precision_score(y_train, y_pred_voting, labels=['great'], average='macro')
    print(f'VotingClassifier (precision) = {precision_voting}')

VotingClassifier (precision) = 0.8155707732198657


## BaggingClassifier

In [42]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    model_bagging_clf = BaggingClassifier(
        model_svm,
        n_estimators=10,
        random_state=21,
        n_jobs=-1
    )
    model_bagging_clf.fit(X_train, y_train)
    y_pred_bagging = model_bagging_clf.predict(X_train)
    precision_bagging = precision_score(y_train, y_pred_bagging, labels=['great'], average='macro')
    print(f'BaggingClassifier (precision) = {precision_bagging}')

BaggingClassifier (precision) = 0.8038542262926922


## StackingClassifier

In [43]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    estimators = [
        ('model1', model_svm),
        ('model2', model_decision_tree)
    ]
    
    model_stacking_clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(),
        n_jobs=-1
    )
    model_stacking_clf.fit(X_train, y_train)
    y_pred_stacking = model_stacking_clf.predict(X_train)
    precision_stacking = precision_score(y_train, y_pred_stacking, labels=['great'], average='macro')
    print(f'StackingClassifier (precision) = {precision_stacking}')

StackingClassifier (precision) = 0.8030736013208866


## Лучший ансамбль для классификатора `VotingClassifier`

In [44]:
y_predict = model_voting_clf.predict(X_test)
precision_best = precision_score(y_test, y_predict, labels=['great'], average='macro')
print(precision_best)

0.8134646088344865


<span style='color: #BD2467; font-size: 3em;'><b>4. Принятие решения</b></span>

## Мы выбрали лучшей моделью `VotingClassifier`

## Сохраним её

In [45]:
best_model_file = 'best_model.joblib'
joblib.dump(model_voting_clf, best_model_file)

['best_model.joblib']

<span style='color: #BD2467; font-size: 3em;'><b>5. Факты о питании</b></span>

<span style='color: #03C568; font-size: 2em;'><b>5.1. Соберём все факты об ингредиентах</b></span>

## Определим данные с дневной нормой

In [46]:
daily_dict = {'Vitamin A': 900, 'Vitamin C': 90, 'Calcium': 1300, 'Iron': 18, 'Vitamin D': 20, 'Vitamin E': 15,
              'Vitamin K': 120, 'Thiamin': 1.2, 'Riboflavin': 1.3, 'Niacin': 16, 'Vitamin B6': 1.7, 'Folate': 400,
              'Vitamin B12': 2.4, 'Biotin': 30, 'Pantothenic acid': 5, 'Phosphorus': 1250, 'Iodine': 150,
              'Magnesium': 420, 'Zinc': 11, 'Selenium': 55, 'Copper': 0.9, 'Manganese': 2.3, 'Chromium': 35,
              'Molybdenum': 45, 'Chloride': 2300, 'Potassium':	4700, 'Choline': 550, 'Total lipid (fat)': 78,
              'Saturated fat': 20, 'Cholesterol': 300, 'Carbohydrate': 275, 'Sodium': 2300, 'Fiber': 28,
              'Protein': 50, 'Added sugars': 50}
daily_df = pd.DataFrame([daily_dict]).T.reset_index().rename(columns={'index': 'Nutrient', 0:'Daily Value'})
daily_df

Unnamed: 0,Nutrient,Daily Value
0,Vitamin A,900.0
1,Vitamin C,90.0
2,Calcium,1300.0
3,Iron,18.0
4,Vitamin D,20.0
5,Vitamin E,15.0
6,Vitamin K,120.0
7,Thiamin,1.2
8,Riboflavin,1.3
9,Niacin,16.0


## Считаем данные для каждого ингредиента

In [47]:
session = requests.Session()
session.proxies.update(proxies)
nutrition_url = "https://api.nal.usda.gov/fdc/v1/foods/search"
nutrition_data = []

for ingredient in tqdm(list_ingredients):
    try:
        response = session.get(
            nutrition_url,
            params={"query": ingredient, "api_key": api_key},
            timeout=10
        )
        if response.status_code != 200:
            print(f"Сервер вернул {response.status_code} для {ingredient}")
            continue

        nutrition_info = response.json()
        foods = nutrition_info.get("foods", [])
        if not foods:
            print(f"Нет данных для {ingredient}")
            continue

        for food in foods:
            desc = food.get("description", "").lower()
            if ingredient.lower() not in desc:
                continue

            nutrients = food.get("foodNutrients", [])
            nutrient_dict = {"ingredient": ingredient}
            
            for nutrient in nutrients:
                name = nutrient.get("nutrientName", "").lower()
                value = nutrient.get("value")
                if value is None:
                    continue
                for _, row in daily_df.iterrows():
                    if row["Nutrient"].lower() in name:
                        try:
                            nutrient_dict[row["Nutrient"]] = round(value / row["Daily Value"] * 100)
                        except ZeroDivisionError:
                            nutrient_dict[row["Nutrient"]] = None
                        break

            nutrition_data.append(nutrient_dict)
            break 

    except Exception as e:
        continue
nutrition_df = pd.DataFrame(nutrition_data)
display(nutrition_df)

  0%|          | 0/239 [00:00<?, ?it/s]

Unnamed: 0,ingredient,Protein,Total lipid (fat),Carbohydrate,Fiber,Calcium,Iron,Potassium,Sodium,Vitamin D,...,Vitamin E,Vitamin C,Thiamin,Riboflavin,Niacin,Folate,Choline,Vitamin K,Manganese,Pantothenic acid
0,broccoli,5.0,0.0,2.0,12.0,5.0,5.0,5.0,1.0,0.0,...,,,,,,,,,,
1,margarita,0.0,0.0,6.0,0.0,0.0,0.0,0.0,9.0,0.0,...,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,,
2,sausage,21.0,22.0,4.0,5.0,2.0,20.0,,87.0,,...,,2.0,,,,,,,,
3,tarragon,0.0,0.0,0.0,,,,,0.0,,...,,,,,,,,,,
4,sesame oil,0.0,128.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,octopus,26.0,17.0,5.0,2.0,4.0,27.0,6.0,23.0,0.0,...,0.0,3.0,14.0,8.0,16.0,12.0,10.0,10.0,,
235,tomato,,,,,,,,,,...,,,,,,,,,,
236,salad dressing,0.0,43.0,1.0,0.0,0.0,0.0,,38.0,,...,,9.0,,,,,,,,
237,salad,5.0,6.0,10.0,9.0,5.0,5.0,,8.0,,...,,44.0,,,,,,,,


<span style='color: #03C568; font-size: 2em;'><b>5.2. Сохраним все преобразованные данные в файл</b></span>

In [48]:
nutrition_df.to_csv("nutrition_facts.csv", index=False)

<span style='color: #BD2467; font-size: 3em;'><b>6. Похожие рецепты</b></span>

<span style='color: #03C568; font-size: 2em;'><b>6.1. Найдём все ссылки на блюда на сайте `www.epicurious.com`</b></span>

In [49]:
recipe_urls = []
input_file_epi_r = 'epi_r.csv'
df = pd.read_csv(input_file_epi_r)
for _, row in df.iterrows():
    try:
        title = row.get("title")
        rating = row.get("rating", None)

        if pd.notna(title):
            clean_title = str(title).replace(' ', '-').replace(',', '').replace('"', '').lower()
            url = f"https://www.epicurious.com/recipes/food/views/{clean_title}"
            recipe_urls.append({
                "title": str(title),
                "rating": rating,
                "url": url
            })
    except Exception as e:
        continue
recipe_urls_df = pd.DataFrame(recipe_urls)
display(recipe_urls_df)

Unnamed: 0,title,rating,url
0,"Lentil, Apple, and Turkey Wrap",2.500,https://www.epicurious.com/recipes/food/views/...
1,Boudin Blanc Terrine with Red Onion Confit,4.375,https://www.epicurious.com/recipes/food/views/...
2,Potato and Fennel Soup Hodge,3.750,https://www.epicurious.com/recipes/food/views/...
3,Mahi-Mahi in Tomato Olive Sauce,5.000,https://www.epicurious.com/recipes/food/views/...
4,Spinach Noodle Casserole,3.125,https://www.epicurious.com/recipes/food/views/...
...,...,...,...
20047,Parmesan Puffs,3.125,https://www.epicurious.com/recipes/food/views/...
20048,Artichoke and Parmesan Risotto,4.375,https://www.epicurious.com/recipes/food/views/...
20049,Turkey Cream Puff Pie,4.375,https://www.epicurious.com/recipes/food/views/...
20050,Snapper on Angel Hair with Citrus Cream,4.375,https://www.epicurious.com/recipes/food/views/...


<span style='color: #03C568; font-size: 2em;'><b>6.2. Сохраним эти ссылки на блюда в файл</b></span>

In [50]:
recipe_urls_df.to_csv("similar_recipes.csv", index=False)