In [None]:
import pandas as pd
import os

# Folder holding the Food.com CSV dumps (and the `images/` directory read further down).
data_path = '../../data/jeehoshin/foodcom_dataset/'

# User/recipe interaction log; later steps use its user_id, recipe_id and date columns.
raw_interaction = pd.read_csv(os.path.join(data_path, 'RAW_interactions.csv'))
# Recipe table whose `id` column is used as the allow-list for interactions.
pp_recipe = pd.read_csv(os.path.join(data_path, 'PP_recipes.csv'))
# Recipe table that is later subset by `id` to the recipes that survive filtering.
raw_recipe = pd.read_csv(os.path.join(data_path, 'RAW_recipes.csv'))

# Start from the interactions whose recipe is listed in PP_recipes, then prune
# repeatedly: dropping sparse users can push a recipe below the threshold (and
# vice versa), so the filter is re-applied until a pass removes no rows.
df = raw_interaction[raw_interaction['recipe_id'].isin(pp_recipe['id'].tolist())]
records_removed = True
while records_removed:
    # Interaction counts per user / per recipe in the current frame
    user_counts = df['user_id'].value_counts()
    item_counts = df['recipe_id'].value_counts()

    # Ids with at least 5 interactions
    valid_users = user_counts.loc[user_counts >= 5].index
    valid_items = item_counts.loc[item_counts >= 5].index

    # A row is kept only if both its user and its recipe are still valid
    keep_mask = df['user_id'].isin(valid_users) & df['recipe_id'].isin(valid_items)
    filtered_df = df.loc[keep_mask]

    # Another pass is needed only if this one actually dropped rows
    records_removed = len(filtered_df) != len(df)

    df = filtered_df

# Result of the converged filter
core_inter = df
# Order the filtered interactions by their `date` column and renumber the rows.
sort_filter_ui = core_inter.sort_values('date').reset_index(drop=True)

# Every entry of the image folder; file stems are interpreted as recipe ids below.
file_names = os.listdir(data_path + 'images/')

# Recipe ids that have an image file. Entries whose stem is not an integer
# (for example '.DS_Store' or '.ipynb_checkpoints') are skipped; previously a
# single such entry made int() raise ValueError and aborted the whole cell.
finish_image = []
for file_name in file_names:
    stem = os.path.splitext(file_name)[0]
    try:
        finish_image.append(int(stem))
    except ValueError:
        continue

# Keep only interactions whose recipe has an image.
sort_filter_ui = sort_filter_ui[sort_filter_ui['recipe_id'].isin(finish_image)].reset_index(drop=True)
print(len(finish_image))
sort_filter_ui

29943


Unnamed: 0,user_id,recipe_id,date,rating,review
0,2999,3567,2000-10-23,5,I have made this pie instead of plain ol' pump...
1,2178,3704,2000-10-30,3,Careful not to cook it too long... you want th...
2,2178,4366,2000-11-04,5,"if you like oysters, this is a great alternati..."
3,5523,7695,2001-02-01,1,I agree.
4,42189,4460,2001-02-12,5,I have had this before. It has really good fla...
...,...,...,...,...,...
428496,2001513060,367414,2018-12-17,1,"maybe I did something wrong , but I thought th..."
428497,2001513060,192495,2018-12-17,5,This is a keeper. Delicious Soup that both my ...
428498,454804,20713,2018-12-17,0,"Made this as gifts. Did 6, quart jars plus had..."
428499,1290903,131607,2018-12-18,5,This is a great recipe for a nice thin crispy ...


In [None]:
print(f"Number of recipes with food photos: {len(file_names)}")

# Distinct users / recipes before the user-level filtering below
users = sort_filter_ui['user_id'].unique().tolist()
items = sort_filter_ui['recipe_id'].unique().tolist()
print(f"number of users: {len(users)}, number of items: {len(items)}")

# Positional split of sort_filter_ui in its current row order:
# first 60% -> train, next 10% -> valid, remaining rows -> test.
n_rows = len(sort_filter_ui)
train_end = int(0.6*n_rows)
valid_end = int(0.7*n_rows)
train_ = sort_filter_ui.iloc[:train_end]
valid_ = sort_filter_ui.iloc[train_end:valid_end]
test_ = sort_filter_ui.iloc[valid_end:]

# Users seen in each raw split; only users present in BOTH train and test are kept
# (the validation user set is computed but does not take part in the intersection).
u_tr, u_va, u_te = (set(part['user_id'].tolist()) for part in (train_, valid_, test_))
u_total = u_tr & u_te

# Restrict every split to the retained users
filter_u_tr = train_.loc[train_['user_id'].isin(u_total)].reset_index(drop=True)
filter_u_te = test_.loc[test_['user_id'].isin(u_total)].reset_index(drop=True)
filter_u_va = valid_.loc[valid_['user_id'].isin(u_total)].reset_index(drop=True)
print(f"train interaction count: {len(filter_u_tr)}, valid interaction count: {len(filter_u_va)}, test interaction count: {len(filter_u_te)}")

# Users remaining in each filtered split
u_train = set(filter_u_tr['user_id'].tolist())
u_test = set(filter_u_te['user_id'].tolist())
u_valid = set(filter_u_va['user_id'].tolist())
print(f"train user count: {len(u_train)}, valid user count: {len(u_valid)}, test user count: {len(u_test)}")

# Recipes remaining in each filtered split
i_tr = set(filter_u_tr['recipe_id'].tolist())
i_te = set(filter_u_te['recipe_id'].tolist())
i_va = set(filter_u_va['recipe_id'].tolist())
print(f"train item count: {len(i_tr)}, valid item count: {len(i_va)}, test item count: {len(i_te)}")

# The item universe is every recipe that appears in any of the three filtered splits
i_total = i_tr | i_va | i_te
print(f"Number of users to use: {len(u_total)}")
print(f"Number of items to use: {len(i_total)}")

In [None]:
# Working names for the three user-filtered splits (plain aliases, no copy is made).
train_interaction, valid_interaction, test_interaction = filter_u_tr, filter_u_va, filter_u_te
print(f"train interaction count : {len(train_interaction)}, valid interaction count : {len(valid_interaction)}, test interaction count : {len(test_interaction)}")

# Show the column layout of each split (train, test, valid — in that order).
for split_frame in (train_interaction, test_interaction, valid_interaction):
    print(split_frame.columns)

# Recipe rows restricted to the recipes that occur in any split.
recipe_mask = raw_recipe['id'].isin(i_total)
recipe_processed = raw_recipe.loc[recipe_mask].reset_index(drop=True)
print(f"Number of recipes to use: {len(recipe_processed)}")
print(recipe_processed.columns)

In [4]:
from sklearn.preprocessing import LabelEncoder

# Users: fit an encoder on the retained user ids; an id's index is its position
# in the encoder's `classes_` array.
user_ids = list(u_total)
user_encoder = LabelEncoder()
user_encoder.fit(user_ids)

# Forward (raw id -> index) and reverse (index -> raw id) lookups
user_to_idx = dict(zip(user_encoder.classes_, range(len(user_encoder.classes_))))
idx_to_user = dict(enumerate(user_encoder.classes_))

# Items: same construction over the retained recipe ids
item_ids = list(i_total)
item_encoder = LabelEncoder()
item_encoder.fit(item_ids)

item_to_idx = dict(zip(item_encoder.classes_, range(len(item_encoder.classes_))))
idx_to_item = dict(enumerate(item_encoder.classes_))

In [None]:
def _encode_split(split_frame, split_label):
    """Convert one interaction split to the exported column layout.

    Parameters
    ----------
    split_frame : pandas.DataFrame
        Interactions with at least the columns user_id, recipe_id, rating and date.
    split_label : int
        Value written to the `x_label` column (this cell uses 0 for train,
        1 for valid and 2 for test).

    Returns
    -------
    pandas.DataFrame
        A new frame with columns userID, itemID, rating, timestamp, x_label;
        `split_frame` itself is not modified.

    Raises
    ------
    ValueError
        If some user_id or recipe_id has no entry in user_to_idx / item_to_idx.
    """
    encoded = split_frame.copy()
    encoded['userID'] = encoded['user_id'].map(user_to_idx)
    encoded['itemID'] = encoded['recipe_id'].map(item_to_idx)

    # Series.map yields NaN for keys missing from the dict, which would silently
    # turn the id columns into floats in the exported file; fail loudly instead.
    if encoded['userID'].isna().any() or encoded['itemID'].isna().any():
        raise ValueError("found user_id/recipe_id values without an index mapping")

    return (
        encoded
        .rename(columns={'date':'timestamp'})
        [['userID', 'itemID', 'rating', 'timestamp']]
        .assign(x_label=split_label)
    )


# One encoded frame per split; x_label marks the split each row belongs to.
interaction1 = _encode_split(train_interaction, 0)
interaction2 = _encode_split(valid_interaction, 1)
interaction3 = _encode_split(test_interaction, 2)

# Stack train, valid and test (in that order) into a single table.
final_interaction = pd.concat([interaction1, interaction2, interaction3], axis=0).reset_index(drop=True)

print(final_interaction)
print()

# Tab-separated interaction file.
final_interaction.to_csv(data_path + 'foodcom.inter', sep='\t', index=False)

# Raw user id -> encoded index lookup table, saved next to the interaction file.
user_mapping = pd.DataFrame(list(user_to_idx.items()), columns=['user_id', 'userID'])
print(user_mapping)
print()
user_mapping.to_csv(data_path + 'u_id_mapping.csv', sep='\t', index=False)

# Raw recipe id -> encoded index lookup table.
item_mapping = pd.DataFrame(list(item_to_idx.items()), columns=['recipe_id', 'itemID'])
print(item_mapping)
print()
item_mapping.to_csv(data_path + 'i_id_mapping.csv', sep='\t', index=False)

        userID  itemID  rating   timestamp  x_label
0            6     175       5  2000-10-23        0
1           25     445       4  2001-03-24        0
2           24      28       3  2001-04-02        0
3           21     353       4  2001-05-07        0
4           32     477       4  2001-05-16        0
...        ...     ...     ...         ...      ...
322541    2585   11188       5  2018-12-16        2
322542    1682   23413       5  2018-12-16        2
322543    3319   25836       5  2018-12-16        2
322544    4851    1834       0  2018-12-17        2
322545    2694   26926       5  2018-12-18        2

[322546 rows x 5 columns]

      user_id  userID
0        1535       0
1        1634       1
2        1676       2
3        1891       3
4        2586       4
...       ...     ...
7580  1177498    7580
7581  1182649    7581
7582  1185443    7582
7583  1185804    7583
7584  1186221    7584

[7585 rows x 2 columns]

       recipe_id  itemID
0             40       0
1       