In [1]:
from neo4j import GraphDatabase
import pandas as pd
from openai import OpenAI
import numpy as np
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import json
from collections import Counter
import mysql.connector
import os
import re
from dotenv import load_dotenv
load_dotenv()
from bs4 import BeautifulSoup
import requests
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

In [2]:
# Create the OpenAI client. The API key is taken from the OPENAI_API_KEY
# environment variable (load_dotenv() is called in the import cell).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
gpt_client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
# Database connection: Neo4j.
# The settings are read from the environment (e.g. a .env file, since
# load_dotenv() is called in the import cell) so credentials do not have to be
# edited into the notebook. The previous literals are kept only as fallbacks
# so existing local setups keep working.
# TODO(review): remove the password fallback once NEO4J_PASSWORD is configured.
uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
user = os.getenv("NEO4J_USER", "neo4j")
password = os.getenv("NEO4J_PASSWORD", "abcd7890")

# Create the driver.
driver = GraphDatabase.driver(uri, auth=(user, password))

In [4]:
def get_gpt_result(prompt):
    """
    Send a prompt to the chat model and return its reply as stripped text.

    A ChatOpenAI client for the "gpt-4" model is created on every call.

    Args:
        prompt: The full prompt text to send.

    Returns:
        The model's reply with leading and trailing whitespace removed.
    """
    chat_model = ChatOpenAI(model="gpt-4")  # "gpt-3.5-turbo" is an alternative
    reply = chat_model.predict(prompt)
    return reply.strip()

# Normalise a working-time string to minutes
def extract_minutes(text):
    """
    Convert a duration string such as "1 hr 45 min" into total minutes.

    Strings that consist only of day/hour/minute components (optionally joined
    by spaces, commas or "and") are parsed locally, which is deterministic and
    needs no API call. Anything else is sent to the chat model with the
    original prompt.

    Args:
        text: The duration text, e.g. "1 hr 45 min", "2 hours" or "45 min".

    Returns:
        The total duration in minutes as an int.

    Raises:
        ValueError: If the model had to be consulted and its reply is not a
            plain integer (optionally followed by a minutes unit).
    """
    raw = "" if text is None else str(text)

    # One "<number> <unit>" component; the lookahead stops "m" from matching
    # the start of an unrelated word such as "month".
    component = re.compile(
        r"(\d+(?:\.\d+)?)\s*(days?|hours?|hrs?|h|minutes?|mins?|m|d)(?![a-z])\.?",
        re.IGNORECASE,
    )
    minutes_per_unit = {"d": 24 * 60, "h": 60, "m": 1}

    components = component.findall(raw)
    # Only trust the local parse when nothing but separators is left over.
    leftover = re.sub(r"\band\b|[\s,]", "", component.sub("", raw), flags=re.IGNORECASE)
    if components and not leftover:
        total = sum(
            float(amount) * minutes_per_unit[unit[0].lower()]
            for amount, unit in components
        )
        return int(round(total))

    prompt = f"""
Convert a given total_time string (e.g., "1 hr 45 min") into the total time in minutes. Ensure to handle cases with both "hr" and "min" or only "hr" or "min". Return the result as an integer representing the total minutes.

Input:
{text}

Output:
"""
    reply = get_gpt_result(prompt)
    # Accept "105" as well as "105 min" / "105 minutes."; reject anything else
    # instead of guessing a number out of free text.
    answer = re.fullmatch(r"\s*(\d+)\s*(?:min(?:ute)?s?)?\.?\s*", reply, re.IGNORECASE)
    if answer is None:
        raise ValueError(f"Expected an integer number of minutes, got: {reply!r}")
    return int(answer.group(1))


# Estimate the working time from the ingredients
def estimate_time_from_title_and_ingredients(title, ingredients):
    """
    Ask the chat model to estimate the total time from title and ingredients.

    Args:
        title: The recipe title.
        ingredients: Either an iterable of ingredient names (joined with
            ", " for the prompt) or an already formatted string.

    Returns:
        The estimated total time in minutes as an int.

    Raises:
        ValueError: If the model's reply is not a plain integer (optionally
            followed by a minutes unit).
    """
    # A plain string must not be joined character by character.
    if isinstance(ingredients, str):
        ingredient_text = ingredients
    else:
        ingredient_text = ', '.join(str(name) for name in ingredients)

    # The wording now matches the input that is actually supplied (ingredients).
    prompt = f"""
Estimate the total time required to complete a task based on its title and list of ingredients.
Consider the complexity, number of steps, and common preparation time required.
Return the total time as an integer value representing the number of minutes.

Input:
title: {title}
ingredients: {ingredient_text}

Output:
"""
    reply = get_gpt_result(prompt)
    # Accept "30" as well as "30 min" / "30 minutes."; reject anything else
    # instead of guessing a number out of free text.
    answer = re.fullmatch(r"\s*(\d+)\s*(?:min(?:ute)?s?)?\.?\s*", reply, re.IGNORECASE)
    if answer is None:
        raise ValueError(f"Expected an integer number of minutes, got: {reply!r}")
    return int(answer.group(1))

# Estimate the working time from the instructions
def estimate_time_from_title_and_instructions(title, instructions):
    """
    Ask the chat model to estimate the total time from title and instructions.

    Args:
        title: The recipe title.
        instructions: The instructions, either as one string or as a
            list/tuple of steps (written one step per line in the prompt).

    Returns:
        The estimated total time in minutes as an int.

    Raises:
        ValueError: If the model's reply is not a plain integer (optionally
            followed by a minutes unit).
    """
    # Render a list of steps as readable lines rather than a Python repr.
    if isinstance(instructions, (list, tuple)):
        instructions = "\n".join(str(step) for step in instructions)

    prompt = f"""
Estimate the total time required to complete a task based on its title and detailed instructions. 
Consider the complexity, number of steps, and common preparation time required.
Return the total time as an integer value representing the number of minutes.

Input:
title: {title}
instructions: {instructions}

Output:
"""
    reply = get_gpt_result(prompt)
    # Accept "30" as well as "30 min" / "30 minutes."; reject anything else
    # instead of guessing a number out of free text.
    answer = re.fullmatch(r"\s*(\d+)\s*(?:min(?:ute)?s?)?\.?\s*", reply, re.IGNORECASE)
    if answer is None:
        raise ValueError(f"Expected an integer number of minutes, got: {reply!r}")
    return int(answer.group(1))

def fetch_webpage_content(url, timeout=30):
    """
    Fetch the HTML at `url` and return its text content.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host can block the run indefinitely.

    Returns:
        The text extracted from the parsed HTML, with pieces separated by
        newlines.

    Raises:
        requests.exceptions.HTTPError: If the response status is an error.
        requests.exceptions.RequestException: For connection problems and
            timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.get_text(separator="\n")

# webページから情報を抽出する

# Extract ingredient names from a list of ingredient lines
def extract_ingredient_names_from_list(ingredients):
    """
    Extract normalised ingredient names from a list of raw ingredient lines.

    Args:
        ingredients: An iterable of ingredient lines, or a single string that
            already contains the ingredient text. None or an empty value
            yields an empty result without calling the model.

    Returns:
        Whatever extract_ingredient_names_from_text returns for the formatted
        text, or [] when there is nothing to extract.
    """
    if not ingredients:
        return []

    # A plain string must not be joined character by character.
    if isinstance(ingredients, str):
        formatted_string = ingredients
    else:
        formatted_string = ", ".join(str(entry) for entry in ingredients)

    return extract_ingredient_names_from_text(formatted_string)

# Extract ingredient names from free text
def extract_ingredient_names_from_text(content):
    """
    Ask the chat model for the ingredient names mentioned in `content`.

    Args:
        content: Text to analyse (for example the text of a recipe page or a
            comma-separated ingredient list).

    Returns:
        The parsed JSON value from the model's reply, or [] when that value
        is empty/null. The prompt asks for an array of ingredient names.

    Raises:
        json.JSONDecodeError: If no valid JSON can be read from the reply.
    """
    prompt_template = """
    I want you to act as an ingredient extractor. Given the text content of a webpage, 
    extract and return a clean, deduplicated array of ingredient names mentioned in the content. 
    Do not include unrelated text or additional information. 
    Extract the ingredient names from the text below. Follow these rules:
    1. Remove any quantities, units, or modifiers (e.g., "2 cups", "1/2 tsp").
    2. Simplify descriptive phrases by removing extra adjectives or specific modifiers. For example:
      - "Extra-virgin olive oil" → "olive oil"
      - "Freshly ground black pepper" → "pepper"
      - "Dark brown sugar" → "sugar"
    3. Convert multiple forms of the same ingredient into its singular, generalized form (e.g., "apples" → "apple").
    4. Replace specific brand names, product names, or unique ingredient names with simple, general terms (e.g., "Epsom salt" → "salt").
    5. Ensure that the extracted ingredients are clean, deduplicated, and in singular form.
    6. Standardize formatting for compound ingredients by adding proper spacing after commas. For example:
      - "Salt, Pepper" → "salt", "pepper"
    7. Convert subtypes of ingredients into their base types. For example:
      - "lemon juice" → "lemon"
      - "chicken broth" → "chicken"
      - "almond milk" → "almond"
    If no ingredients are found, return an empty array.

    Here is the text content: 
    {content}
    
    Output:
    """

    prompt = prompt_template.format(content=content)
    result = get_gpt_result(prompt)

    # Chat models often wrap the array in a code fence or a short sentence;
    # parse only the outermost [...] span when one is present.
    start, end = result.find("["), result.rfind("]")
    payload = result[start:end + 1] if 0 <= start < end else result

    ingredient = json.loads(payload) or []
    return ingredient


# Extract the cooking instructions
def extract_instructions_from_webpage(content):
    """
    Ask the chat model for the cooking instructions contained in `content`.

    Args:
        content: Text content of a recipe web page.

    Returns:
        The parsed JSON value from the model's reply (the prompt asks for an
        array in the format ["instruction", "instruction", ...]), or [] when
        that value is empty/null.

    Raises:
        json.JSONDecodeError: If no valid JSON can be read from the reply.
    """
    prompt = f"""
I want you to act as a cooking instruction extractor. Your task is to extract the detailed steps or instructions for a recipe from the provided text content of a webpage. The extracted instructions should be:

1. Returned as a JSON array, where each element is a string (e.g., "Preheat the oven to 350°F").
2. Exclude any ingredients, introductions, or unrelated information (e.g., advertisements or nutritional facts).
3. Focus only on clear, actionable cooking steps (e.g., "Preheat the oven to 350°F" or "Chop the onions and sauté them until golden brown").
4. Remove "Step 1:" or similar prefixes from the instructions.
5. If no cooking instructions are found, return an empty JSON array ([]).

Here is the content of the webpage:
{content}

Output:
"""
    result = get_gpt_result(prompt)

    # Chat models often wrap the array in a code fence or a short sentence;
    # parse only the outermost [...] span when one is present.
    start, end = result.find("["), result.rfind("]")
    payload = result[start:end + 1] if 0 <= start < end else result

    return json.loads(payload) or []

In [5]:
# Text normalisation tables used by format_text.
# Characters removed before the ", " -> "," collapse (removing them can bring a
# comma and a following space together, so they have to go first).
_FORMAT_TEXT_EARLY_DROP = dict.fromkeys(map(ord, '[]"<>'))
# Characters turned into "_" and characters removed after that collapse.
_FORMAT_TEXT_TABLE = {
    **dict.fromkeys(map(ord, "()/;:&,- "), "_"),
    **dict.fromkeys(map(ord, ".\n\r%'\\éç+`ãâƒ€”!|")),
}


def format_text(text):
    """
    Normalise free text into a lowercase, underscore-separated token.

    Punctuation and spaces become "_" (a comma followed by a space counts as a
    single separator), a fixed set of symbols and accented/mis-encoded
    characters is removed, any remaining whitespace is stripped, and the
    result is lowercased.

    Args:
        text: The text to normalise, or None.

    Returns:
        The normalised string; "" when `text` is None.
    """
    if text is None:
        return ""

    cleaned = text.translate(_FORMAT_TEXT_EARLY_DROP)
    cleaned = cleaned.replace(', ', ',')
    cleaned = cleaned.translate(_FORMAT_TEXT_TABLE)
    # Tabs and any other whitespace that is left are dropped entirely.
    cleaned = re.sub(r'\s+', '', cleaned)
    return cleaned.lower()

def escape_sql_string(text):
    """
    Prepare a string for use inside a single-quoted SQL literal.

    Trailing backslashes are removed and every single quote is doubled.
    """
    without_trailing_backslashes = text.rstrip("\\")
    return "''".join(without_trailing_backslashes.split("'"))

def clean_title(text):
    """
    Remove a fixed set of accented / mis-encoded characters from a title.

    The characters "ã", "â", "ƒ", "€", "é" and "ç" are deleted, then the
    result is round-tripped through UTF-8 (encode with 'replace', decode with
    'ignore') to get rid of anything that cannot be represented.
    """
    unwanted = dict.fromkeys(map(ord, "ãâƒ€éç"))
    stripped = text.translate(unwanted)
    as_bytes = stripped.encode('utf-8', 'replace')
    return as_bytes.decode('utf-8', 'ignore')

In [6]:
# Tokenise sentences and save them
def tokenize_save(key_name, sentences):
    """
    Tokenise `sentences` and store the tokens under ../datas/word2_vec/.

    Each sentence is lowercased and split into runs of letters, digits and
    underscores. A sentence given as a list is joined with spaces first;
    None entries are skipped. The token lists are written twice: as an object
    array to "<key_name>.npy" and as JSON to "<key_name>.txt".

    Args:
        key_name: Base name of the two output files.
        sentences: Iterable of strings, lists of strings, or None values.
    """
    word_pattern = r'\b[a-zA-Z0-9_]+\b'
    texts = (
        " ".join(entry) if isinstance(entry, list) else entry
        for entry in sentences
        if entry is not None
    )
    tokens = [re.findall(word_pattern, text.lower()) for text in texts]

    # Save the tokens as a NumPy object array ...
    np.save(f"../datas/word2_vec/{key_name}.npy", np.array(tokens, dtype=object))
    # ... and as JSON text.
    with open(f"../datas/word2_vec/{key_name}.txt", 'w') as f:
        json.dump(tokens, f)

In [7]:
def find_food(tx, search_term):
    """
    Look up the best match for `search_term` in the food full-text index.

    Queries the "food_sub_index_text_search" index, orders the hits by
    descending score and then by name length, and keeps the first one.

    Args:
        tx: Object with a `run(query, **params)` method (e.g. a transaction).
        search_term: The text to search for.

    Returns:
        The result of `.data()` on the query result; each record has the keys
        node_id, node_name, flavor_vector, word_vector and score.
    """
    cypher = """
    CALL db.index.fulltext.queryNodes("food_sub_index_text_search", $search_term)
    YIELD node, score
    ORDER BY score DESC, size(node.name)
    LIMIT 1
    RETURN node.id as node_id, node.name as node_name, node.flavor_vector as flavor_vector, node.word_vector as word_vector, score    
    """
    result = tx.run(cypher, search_term=search_term)
    return result.data()

In [8]:
JSON_OUTPUT_PATH = "../data/formatted_json_recipe.json"


def get_json_files(file):
    """
    List the data files directly inside a directory.

    Args:
        file: Path of the directory to list (the name is kept for
            compatibility with existing callers).

    Returns:
        The names of the regular, non-hidden files in the directory, sorted
        alphabetically. Sub-directories and dot-files (for example
        ".ipynb_checkpoints" or ".DS_Store") are left out because callers
        open every returned name as JSON; sorting makes the processing order
        reproducible.
    """
    return sorted(
        name
        for name in os.listdir(file)
        if not name.startswith(".") and os.path.isfile(os.path.join(file, name))
    )

In [9]:
# Collect the recipes from both sources, keyed by title so that duplicates
# collapse (a later entry with the same title replaces an earlier one).
unique_recipes = {}

# Source 1: JSON files that hold their recipes under "recipes_results".
for file in get_json_files("../data/json_recipes/"):
    with open(f"../data/json_recipes/{file}") as f:
        json_data = json.load(f)

    results = json_data.get('recipes_results') if isinstance(json_data, dict) else None
    for item in results or []:
        # Entries without a title cannot be keyed; skip them instead of failing.
        if 'title' not in item:
            continue
        unique_recipes[item['title']] = item

# Source 2: JSON files produced from HTML pages, each holding a list of recipes.
for file2 in get_json_files("../data/html_recipes/out_jsons/"):
    with open(f"../data/html_recipes/out_jsons/{file2}") as f:
        json_data = json.load(f)

    for item in json_data:
        ingredients = item.get('ingredients')
        # Skip entries that have no title or no usable ingredients
        # (missing, None, the literal string "None", or an empty list).
        if 'title' not in item or ingredients is None or ingredients == "None" or ingredients == []:
            continue
        json_input = {
            "title": item["title"],
            "ingredients": ingredients,
        }
        # Only carry the key over when the source has it, so that later
        # "'instructions' in item" checks stay meaningful.
        if 'instructions' in item:
            json_input["instructions"] = item['instructions']
        unique_recipes[item['title']] = json_input

print(len(unique_recipes))

1388


In [10]:
# Load the recipes written by previous runs so they are not processed again.
# On a first run the output file does not exist yet; start from an empty dict
# in that case instead of failing.
OUTPUT_FILE_NAME = "../data/formatted_json_recipe.json"
recipes = {}
if os.path.exists(OUTPUT_FILE_NAME):
    with open(OUTPUT_FILE_NAME, 'r') as f2:
        recipes = json.load(f2)

# Live view of the keys: titles added to `recipes` later also show up here.
already_processed = recipes.keys()

In [11]:
# Build a normalised record for every recipe that is not in the output yet.
for key, item in unique_recipes.items():
    raw_ingredients = item.get('ingredients')
    if raw_ingredients is None:
        continue

    title = clean_title(item['title'])
    if title in already_processed:
        continue

    # Best effort: enrich the recipe from its web page when a link is given.
    extracted_ingredient = []
    extracted_instructions = []
    if 'link' in item:
        try:
            content = fetch_webpage_content(item["link"])
            extracted_ingredient = extract_ingredient_names_from_text(content)
            extracted_instructions = extract_instructions_from_webpage(content)
        except Exception as e:
            # A page that cannot be fetched or parsed must not stop the run;
            # report it and continue with the data that is already available.
            print(f"[{title}] page extraction skipped: {e}")

    # Prefer the instructions shipped with the recipe and fall back to the
    # ones extracted from the page.
    instructions = item.get('instructions') or extracted_instructions

    try:
        ingredient = extract_ingredient_names_from_list(raw_ingredients)

        # De-duplicate while keeping first-seen order so the output is stable.
        sanitized_ingredients = list(
            dict.fromkeys(list(extracted_ingredient) + list(ingredient))
        )

        if item.get("total_time"):
            total_time = extract_minutes(item['total_time'])
        elif instructions:
            # Estimate from the instructions themselves, not the ingredients.
            total_time = estimate_time_from_title_and_instructions(item['title'], instructions)
        else:
            total_time = estimate_time_from_title_and_ingredients(item['title'], sanitized_ingredients)
    except ValueError as e:
        # Malformed model replies (invalid JSON, non-numeric time) affect only
        # this recipe. It stays unprocessed and is retried on the next run.
        print(f"[{title}] skipped, unusable model reply: {e}")
        continue

    recipes[title] = {
        "name": title,
        "rating": item.get('rating', ""),
        "reviews": item.get('reviews', 0),
        "total_time": total_time,
        "ingredients": sanitized_ingredients,
        "instructions": instructions
    }

  llm = ChatOpenAI(model="gpt-4")  # Or "gpt-3.5-turbo"
  return llm.predict(prompt).strip()


In [12]:
# Serialise first: if a value cannot be encoded the error is raised before the
# existing output file has been opened for writing (and therefore truncated).
serialized_recipes = json.dumps(recipes, indent=4)
with open(OUTPUT_FILE_NAME, 'w') as f2:
    f2.write(serialized_recipes)