In [1]:
import lightgbm as lgbm
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import numpy as np
import joblib


In [2]:
recipes_df = pd.read_parquet("~/resources/food/recipes.parquet")
user_df = pd.read_parquet("~/resources/food/user.parquet")
folder_df = pd.read_parquet("~/resources/food/folder.parquet")
bookmark_df = pd.read_parquet("~/resources/food/bookmark.parquet")


In [3]:
keywords_df = pd.DataFrame.sparse.from_spmatrix(
    sparse.csr_array(
        pd.get_dummies(recipes_df["Keywords"].explode())
        .reset_index()
        .groupby("index")
        .sum()
        .values
    ),
    columns=np.unique(recipes_df["Keywords"].explode().dropna(how="any")),
)
keywords_df


Unnamed: 0,< 15 Mins,< 30 Mins,< 4 Hours,< 60 Mins,African,Apple,Artichoke,Asian,Australian,Austrian,...,Welsh,White Rice,Whitefish,Whole Chicken,Whole Duck,Whole Turkey,Wild Game,Winter,Yam/Sweet Potato,Yeast Breads
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522512,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
522513,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
522514,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
522515,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
texts_df = recipes_df[["Name"]].fillna("").agg(" ".join, axis=1)
texts_df



0                        Low-Fat Berry Blue Frozen Dessert
1                                                  Biryani
2                                            Best Lemonade
3                           Carina's Tofu-Vegetable Kebabs
4                                             Cabbage Soup
                                ...                       
522512                      Meg's Fresh Ginger Gingerbread
522513    Roast Prime Rib au Poivre with Mixed Peppercorns
522514                               Kirshwasser Ice Cream
522515            Quick & Easy Asian Cucumber Salmon Rolls
522516                             Spicy Baked Scotch Eggs
Length: 522517, dtype: object

In [5]:
tfidf = TfidfVectorizer(stop_words="english")
X = tfidf.fit_transform(texts_df)
X


<522517x44777 sparse matrix of type '<class 'numpy.float64'>'
	with 1911044 stored elements in Compressed Sparse Row format>

In [6]:
tfidf.get_feature_names_out()


array(['00', '000', '001', ..., 'çº', 'œè', 'šéª'], dtype=object)

In [7]:
sparse.save_npz("/home/amogus/resources/food/tfidf.npz", X)
joblib.dump(tfidf.get_feature_names_out(), "/home/amogus/resources/food/tfidf_columns.joblib")


['/home/amogus/resources/food/tfidf_columns.joblib']

In [8]:
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    X,
    columns=tfidf.get_feature_names_out(),
)
tfidf_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Columns: 44777 entries, 00 to šéª
dtypes: Sparse[float64, 0](44777)
memory usage: 21.9 MB


In [None]:
sparse.save_npz(
    "/home/amogus/resources/food/keywords_df.npz", keywords_df.sparse.to_coo()
)


Unnamed: 0,RecipeId,AggregatedRating,ReviewCount,< 15 Mins,< 30 Mins,< 4 Hours,< 60 Mins,African,Apple,Artichoke,...,zzzingers,½fiï,½s,½ï,ä½,å¹,æœ,çº,œè,šéª
0,38.0,4.5,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,39.0,3.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,40.0,4.5,10.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,41.0,4.5,2.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,42.0,4.5,11.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522512,541379.0,0.0,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
522513,541380.0,0.0,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
522514,541381.0,0.0,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
522515,541382.0,0.0,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
recipes_numerical = pd.concat(
    [
        recipes_df[["RecipeId", "AggregatedRating", "ReviewCount"]].fillna(0.0),
        pd.concat([keywords_df, tfidf_df], axis=1),
    ],
    axis=1,
)
feature_columns = recipes_numerical.columns
feature_columns = feature_columns.delete(0)
recipes_numerical


Unnamed: 0,RecipeId,AggregatedRating,ReviewCount,< 15 Mins,< 30 Mins,< 4 Hours,< 60 Mins,African,Apple,Artichoke,...,zzzingers,½fiï,½s,½ï,ä½,å¹,æœ,çº,œè,šéª
0,38.0,4.5,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,39.0,3.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,40.0,4.5,10.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,41.0,4.5,2.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,42.0,4.5,11.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522512,541379.0,0.0,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
522513,541380.0,0.0,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
522514,541381.0,0.0,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
522515,541382.0,0.0,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
folders_recipes_df = pd.merge(bookmark_df.sort_values(["folder_index", "rating"]), recipes_numerical, how="inner", left_on="RecipeId", right_on="RecipeId",suffixes=["_bookmark", None])
folders_recipes_df[feature_columns]


Unnamed: 0,AggregatedRating,ReviewCount,< 15 Mins,< 30 Mins,< 4 Hours,< 60 Mins,African,Apple,Artichoke,Asian,...,zzzingers,½fiï,½s,½ï,ä½,å¹,æœ,çº,œè,šéª
0,4.0,3.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5.0,318.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4.5,252.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5.0,1692.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5.0,347.0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,5.0,1410.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,5.0,3.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,4.5,1657.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,5.0,8.0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9,5.0,3063.0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [26]:
target_column = ["rating"]


In [27]:
groups = folders_recipes_df["folder_index"].value_counts().sort_index().to_list()
groups


[5, 6, 3, 1, 2, 4, 1, 1, 1, 1, 4, 1, 3, 2]

In [28]:
ranker = lgbm.LGBMRanker(min_child_samples=1, num_leaves=63, learning_rate=0.01, n_estimators=1000, verbosity=-1)
ranker.fit(folders_recipes_df[feature_columns], folders_recipes_df[target_column], group=groups)


In [25]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 7))
df_plt = pd.DataFrame({'feature_name': feature_columns, 'feature_importance':
ranker.feature_importances_})
df_plt.sort_values('feature_importance', ascending=False, inplace=True)
sns.barplot(x="feature_importance", y="feature_name", data=df_plt)
plt.title('feature importance')


KeyboardInterrupt: 

In [29]:
def predict_from_folder_index(index: int):
    folder_keywords = np.unique(
        pd.merge(
            bookmark_df[bookmark_df["folder_index"] == index],
            recipes_df,
            how="inner",
            left_on="RecipeId",
            right_on="RecipeId",
        )["Keywords"].explode()
    )
    exclude_kw = list(set(keywords_df.columns).difference(folder_keywords))
    pred_df = recipes_numerical.copy()
    pred_df = pred_df.loc[pred_df[exclude_kw].values.sum(axis=1) == 0]
    preds = ranker.predict(pred_df[feature_columns])
    return recipes_df.iloc[
        pd.concat(
            [pred_df.reset_index(), pd.Series(preds).to_frame("rank_score")], axis=1
        )
        .sort_values("rank_score", ascending=False)
        .head(9)
        .sample(frac=1)
        .head(6)
        .set_index("index")
        .index
    ]


In [None]:
def predict_from_user(user: str):
    folder_keywords = np.unique(
        pd.merge(
            bookmark_df[
                bookmark_df["folder_index"].isin(
                    folder_df[folder_df["username"] == user].index
                )
            ],
            recipes_df,
            how="inner",
            left_on="RecipeId",
            right_on="RecipeId",
        )["Keywords"].explode()
    )
    exclude_kw = list(set(keywords_df.columns).difference(folder_keywords))
    pred_df = recipes_numerical.copy()
    pred_df = pred_df.loc[pred_df[exclude_kw].values.sum(axis=1) == 0]
    preds = ranker.predict(pred_df[feature_columns])
    return recipes_df.iloc[
        pd.concat(
            [pred_df.reset_index(), pd.Series(preds).to_frame("rank_score")], axis=1
        )
        .sort_values("rank_score", ascending=False)
        .head(9)
        .sample(frac=1)
        .head(6)
        .set_index("index")
        .index
    ]


In [None]:
def gen_recommendations(user: str):
    recommend_result = dict()
    if(not folder_df[folder_df["username"] == user].empty):
        recommend_result["recommend_from_summary"] = {
            "results": predict_from_user(user).fillna(np.nan).replace([np.nan], [None]).to_dict(orient="records")
        }
        for i, outt in enumerate(recommend_result["recommend_from_summary"]["results"]):
            for k, v in outt.items():
                if isinstance(v, np.ndarray):
                    recommend_result["recommend_from_summary"]["results"][i][k] = list(v)
        folder = folder_df[folder_df["username"] == user].sample(frac=1).head(1)
        recommend_result["recommend_from_folder"] = {
            "folder_name": folder.iloc[0]["folder_name"],
            "results": predict_from_folder_index(folder.index[0]).fillna(np.nan).replace([np.nan], [None]).to_dict(orient="records")
        }
        for i, outt in enumerate(recommend_result["recommend_from_folder"]["results"]):
            for k, v in outt.items():
                if isinstance(v, np.ndarray):
                    recommend_result["recommend_from_folder"]["results"][i][k] = list(v)
        recommend_result["recommend_from_random"] = {
            "results": recipes_df.sample(6).fillna(np.nan).replace([np.nan], [None]).to_dict(orient="records")
        }
        for i, outt in enumerate(recommend_result["recommend_from_random"]["results"]):
            for k, v in outt.items():
                if isinstance(v, np.ndarray):
                    recommend_result["recommend_from_random"]["results"][i][k] = list(v)
    return recommend_result


In [None]:
user = "test5"
recommend_result = dict()
if not folder_df[folder_df["username"] == user].empty:
    if pd.merge(
        folder_df[folder_df["username"] == user],
        bookmark_df,
        left_index=True,
        right_on="folder_index",
        how="inner",
    ).empty:
        recommend_result["recommend_from_summary"] = {"results": []}
    else:
        recommend_result["recommend_from_summary"] = {
            "results": predict_from_user(user)
            .fillna(np.nan)
            .replace([np.nan], [None])
            .to_dict(orient="records")
        }
        for i, outt in enumerate(recommend_result["recommend_from_summary"]["results"]):
            for k, v in outt.items():
                if isinstance(v, np.ndarray):
                    recommend_result["recommend_from_summary"]["results"][i][k] = list(
                        v
                    )
    folder = folder_df[folder_df["username"] == user].sample(frac=1).head(1)
    if pd.merge(
        folder, bookmark_df, left_index=True, right_on="folder_index", how="inner"
    ).empty:
        recommend_result["recommend_from_folder"] = {
            "folder_name": folder.iloc[0]["folder_name"],
            "results": [],
        }
    else:
        recommend_result["recommend_from_folder"] = {
            "folder_name": folder.iloc[0]["folder_name"],
            "results": predict_from_folder_index(folder.index[0])
            .fillna(np.nan)
            .replace([np.nan], [None])
            .to_dict(orient="records"),
        }
        for i, outt in enumerate(recommend_result["recommend_from_folder"]["results"]):
            for k, v in outt.items():
                if isinstance(v, np.ndarray):
                    recommend_result["recommend_from_folder"]["results"][i][k] = list(v)
    recommend_result["recommend_from_random"] = {
        "results": recipes_df.sample(6)
        .fillna(np.nan)
        .replace([np.nan], [None])
        .to_dict(orient="records")
    }
    for i, outt in enumerate(recommend_result["recommend_from_random"]["results"]):
        for k, v in outt.items():
            if isinstance(v, np.ndarray):
                recommend_result["recommend_from_random"]["results"][i][k] = list(v)


Empty DataFrame
Columns: [username, folder_name, folder_index, RecipeId, rating]
Index: []


In [30]:
out = predict_from_folder_index(5)
out
