In [None]:
import pandas as pd

# Directory holding the raw split files; every output below is written here too.
data_path = "../../data/jeehoshin/allrecipe_dataset/"

train = pd.read_csv(data_path + "core-data-train_rating.csv")
valid = pd.read_csv(data_path + "core-data-valid_rating.csv")
test = pd.read_csv(data_path + "core-data-test_rating.csv")

# Tag each split with its x_label: 0 = train, 1 = valid, 2 = test.
for split_label, split_frame in enumerate((train, valid, test)):
    split_frame["x_label"] = split_label

interactions = pd.concat([train, valid, test])

# Assign contiguous 0-based indices to users in order of first appearance.
user_ids = interactions['user_id'].unique()
uid_map = dict(zip(user_ids, range(len(user_ids))))

# Same for recipes; rid_map2 is the inverse lookup (index -> original recipe id).
recipe_ids = interactions['recipe_id'].unique()
rid_map = dict(zip(recipe_ids, range(len(recipe_ids))))
rid_map2 = dict(enumerate(recipe_ids))

interactions['user_id'] = interactions['user_id'].map(uid_map)
interactions['recipe_id'] = interactions['recipe_id'].map(rid_map)

# Rename the columns in a single pass.
interactions = interactions.rename(columns={
    'dateLastModified': 'timestamp',
    'user_id': 'userID',
    'recipe_id': 'itemID',
})

interactions.to_csv(data_path + 'allrecipe.inter', sep='\t', index=False)
print('allrecipe.inter created')

# Persist the original-id -> index table for users.
user_id_mapping = pd.DataFrame({
    'user_id': list(uid_map.keys()),
    'userID': list(uid_map.values()),
})
user_id_mapping.to_csv(data_path + 'u_id_mapping.csv', sep='\t', index=False)

print('u_id_mapping.csv created')

# Persist the original-id -> index table for recipes.
item_id_mapping = pd.DataFrame({
    'recipe_id': list(rid_map.keys()),
    'itemID': list(rid_map.values()),
})
item_id_mapping.to_csv(data_path + 'i_id_mapping.csv', sep='\t', index=False)

print('i_id_mapping.csv created')

In [None]:
import pandas as pd
import ast
from collections import defaultdict
from datetime import datetime

# Collect, for every user in the training split, the review texts of that
# user's most recent interactions and save them to 'user_reviews_o.csv'.

# user_reviews: {user_id: {recipe_id: review text}}
user_reviews = defaultdict(dict)
# dates: {user_id: {recipe_id: dateLastModified}}
dates = defaultdict(dict)

data_path = "../../data/jeehoshin/allrecipe_dataset/"

# Number of most recent interactions kept per user.
MAX_RECENT_PER_USER = 10

raw_recipes = pd.read_csv(data_path + "raw-data_recipe.csv")
train_interaction = pd.read_csv(data_path + "core-data-train_rating.csv")

interactions = train_interaction
user_ids = interactions['user_id'].unique().tolist()
recipe_ids = interactions['recipe_id'].unique().tolist()
# Set copy of user_ids: the membership test below runs once per reviewer of
# every recipe, so a list scan there would be needlessly quadratic.
user_id_set = set(user_ids)

print(f"Total users : {len(user_ids)}")
print(f"Total recipes : {len(recipe_ids)}")
print(f"Total interactions : {len(interactions)}")

recipes = raw_recipes[['recipe_id', 'reviews']]

# Walk the three columns in lockstep instead of calling .iloc[i] per field,
# which materialises a full row Series on every access. .tolist() yields plain
# Python scalars, so the dict keys serialise as bare integers rather than as
# NumPy scalar reprs such as "np.int64(123)".
for user_id, recipe_id, date in zip(
    interactions['user_id'].tolist(),
    interactions['recipe_id'].tolist(),
    interactions['dateLastModified'].tolist(),
):
    dates[user_id][recipe_id] = date

# Keep only each user's most recent interactions, newest first. Only existing
# keys are reassigned, so the dict size does not change while iterating.
for user_id in dates:
    sorted_dates = sorted(
        dates[user_id].items(),
        key=lambda x: datetime.fromisoformat(x[1].strip()),
        reverse=True,
    )
    dates[user_id] = dict(sorted_dates[:MAX_RECENT_PER_USER])

count = sum(len(kept) for kept in dates.values())

print(f"Processed interactions : {count}")

# The 'reviews' cell of each recipe is a dict literal keyed by user id; each
# value is read for its 'text' entry.
for recipe_id, raw_review in zip(recipes['recipe_id'].tolist(), recipes['reviews'].tolist()):
    review = ast.literal_eval(raw_review)
    for user_id, entry in review.items():
        # Check membership first so the defaultdict never grows entries for
        # users that are absent from the training split.
        if user_id in user_id_set and recipe_id in dates[user_id]:
            user_reviews[user_id][recipe_id] = entry['text']

df = pd.DataFrame({
    'user_id': list(user_reviews.keys()),
    'reviews': list(user_reviews.values())
})

df.to_csv(data_path + "user_reviews_o.csv", index=False)
print("Saved user_id and reviews to 'user_reviews_o.csv'")

# Read the file back as a sanity check on the number of users written.
user_review = pd.read_csv(data_path + "user_reviews_o.csv")
print(len(user_review))

Total users : 68768
Total recipes : 29093
Total interactions : 676946
Processed interactions : 303581
Saved user_id and reviews to 'user_reviews_o.csv'
68768


In [None]:
import re

# Reload the saved reviews and unwrap NumPy scalar reprs so that each dict
# literal uses bare integers (e.g. "np.int64(123)" becomes "123").
users = pd.read_csv(data_path + "user_reviews_o.csv")

np_int_wrapper = re.compile(r'np\.int64\((\d+)\)')
users['reviews'] = users['reviews'].map(
    lambda serialized: np_int_wrapper.sub(r'\1', serialized)
)

users.to_csv(data_path + "user_reviews_train_interactions.csv", index=False)
print("Cleaned user reviews saved to 'user_reviews_train_interactions.csv'")

Cleaned user reviews saved to 'user_reviews_train_interactions.csv'


In [None]:
import pandas as pd
# Report the mean number of stored reviews per user.
users = pd.read_csv(data_path + "user_reviews_train_interactions.csv")

print(users.head())
print(len(users))

# Each 'reviews' cell is a dict literal; parse it and add up the entry counts.
count = sum(len(ast.literal_eval(serialized)) for serialized in users["reviews"])
print(count / len(users))

   user_id                                            reviews
0  2783111  {218939: "i enjoyed this didn't discard the ti...
1  5404163  {87211: 'Mixed the pesto with mayo, used deli ...
2   702483  {87211: "I made this on my foreman grill and i...
3  3471401  {87211: "Really simple!   I used French bread ...
4  1605138  {87211: "Who needs Atlanta Bread with a recipe...
68768
4.414567822242904
