In [28]:
from neo4j import GraphDatabase
import pandas as pd
from openai import OpenAI
import numpy as np
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import json
from collections import Counter
import mysql.connector
import os
import re
from dotenv import load_dotenv
load_dotenv()
from sklearn.preprocessing import MinMaxScaler

In [29]:
# ChatGPTを使うのでAPIキーを設定
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
gpt_client = OpenAI(
    api_key=OPENAI_API_KEY
)

In [30]:
# DB接続処理
# Neo4jに接続
uri = "bolt://localhost:7687"
user = "neo4j"
password = "abcd7890"

# ドライバを作成
driver = GraphDatabase.driver(uri, auth=(user, password))

In [31]:
# ChatGPTを使って意味のある文章を生成する
def get_gpt_result(prompt):
    response = gpt_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content.strip("\n")


# レシピから情報を抽出する
def extract_ingredient_name(text):
    prompt = f"""
Extract the ingredient name from the text below. Remove any quantities, units, or modifiers. 
Convert multiple forms of the same ingredient into its singular form. 
Replace specific brand names, product names, or unique ingredient names (e.g., Epsom salt) with simple, general names (e.g., salt).

Input:
{text}

Output:
"""
    return get_gpt_result(prompt)

# 作業時間をminの揃える
def extract_minutes(text):
    prompt = f"""
Convert a given total_time string (e.g., "1 hr 45 min") into the total time in minutes. Ensure to handle cases with both "hr" and "min" or only "hr" or "min". Return the result as an integer representing the total minutes.
And remove any other information, including the word "minutes".

Input:
{text}

Output:
"""
    return get_gpt_result(prompt)[0]

# 作業時間を推測する
def guess_total_minutes(text):
    prompt = f"""
Estimate the total time from the text below. The total time is the sum of the prep time and cook time.
Input:
{text}

Output:
"""
    return get_gpt_result(prompt)[0]


In [32]:
# テキストフォーマットの指定
def format_text(text):
    if text is None:
        return ""
    text = text.replace('(', '_')
    text = text.replace(')', '_')
    text = text.replace("/", '_')
    text = text.replace(";", '_')
    text = text.replace(":", '_')
    text = text.replace("&", '_')
    text = text.replace("[", '')
    text = text.replace("]", '')    
    text = text.replace('"', '')
    text = text.replace('<', '')
    text = text.replace('>', '')
    text = text.replace(', ', ',')
    text = text.replace('.', '')
    text = text.replace(',', '_')
    text = text.replace('-', ' ')
    text = text.replace(' ', '_')
    text = text.replace('\n', '')
    text = text.replace('%', '')
    text = text.replace("'", '')
    text = text.replace("/t", '')
    text = text.replace("\\", '')
    text = text.replace("é", '')
    text = text.replace("ç", '')
    text = text.replace("+", '')
    text = text.replace("`", '')
    text = text.replace("`", '')
    text = text.replace("`", '')
    text = text.replace("`", '')
    text = text.replace("ã", '')
    text = text.replace("â", '')
    text = text.replace("ƒ", '')
    text = text.replace("€", '')
    text = text.replace('”', '')
    text = text.replace('`', '')
    text = text.replace('!', '')
    text = text.replace('|', '')
    text = text.replace("\n", "")
    text = text.replace("\r", "")
    text = re.sub(r'\s+', '', text)
    text = text.lower()
    return text

def escape_sql_string(text):
    text = text.rstrip("\\")
    return text.replace("'", "''")

def clean_title(title):
    # UTF-8にエンコードし、デコード時にエラーを無視して文字化けを削除
    clean_title = title.encode('utf-8', 'replace').decode('utf-8', 'ignore')
    return clean_title

In [33]:
# トークン化して保存する
def tokenize_save(key_name, sentences):
    # NumPyの配列に変換
    tokens = []
    for sentence in sentences:
        if sentence is None:
            continue
        
        if isinstance(sentence, list):
            sentence = " ".join(sentence)
            
        # 小文字化し、特殊文字や句読点を削除してトークン化
        words = re.findall(r'\b[a-zA-Z0-9_]+\b', sentence.lower())
        tokens.append(words)
        
    np_sentences = np.array(tokens, dtype=object)

    # NumPyの配列をdumpして保存
    np.save(f"../datas/word2_vec/{key_name}.npy", np_sentences)
    with open(f"../datas/word2_vec/{key_name}.txt", 'w') as f:
        json.dump(tokens, f)

In [34]:
def find_food(tx, search_term):
    find_food_query = """
    CALL db.index.fulltext.queryNodes("food_sub_index_text_search", $search_term)
    YIELD node, score
    ORDER BY score DESC, size(node.name)
    LIMIT 1
    RETURN node.id as node_id, node.name as node_name, node.flavor_vector as flavor_vector, node.word_vector as word_vector, score    
    """
    return tx.run(find_food_query, search_term=search_term).data()

In [35]:
JSON_INPUT_PATH = "../data/html_recipes/out_jsons/"
JSON_OUTPUT_PATH = "../data/formatted_json2.json"
def get_json_files(file):
    json_files = os.listdir(file)
    return json_files

In [36]:
recipes = {}

for file in get_json_files(JSON_INPUT_PATH):    
    with open(f"{JSON_INPUT_PATH}{file}") as f:
        json_data = json.load(f)
             
        for item in json_data:
            if not 'ingredients' in item or item['ingredients'] is None:
                continue

            ingredients = []
            for ingri in item['ingredients']:
                if ingredients is None:
                    continue
                ingredients.append(extract_ingredient_name(ingri))
            
            total_time = ""
            if "total_time" in item:
                total_time = extract_minutes(item['total_time'])
            else:
                total_time = guess_total_minutes(item['instructions'])
            
            rating = ""
            if "rating" in item:
                rating = item['rating']
            
            reviews = ""
            if "reviews" in item:
                reviews = item['reviews']
            else:
                reviews = 0
                
            name = clean_title(item['title'])
            json_input = {
                "name": name,
                "rating": rating,
                "reviews": reviews,
                "total_time": total_time,
                "ingredients": ingredients
            }
            recipes[name] = json_input

with open(JSON_OUTPUT_PATH, 'w') as f:
    json.dump(recipes, f)

In [37]:
print(recipes)
with open(JSON_OUTPUT_PATH, 'w') as f:
    json.dump(recipes, f)

{'Arroz Verde with Hatch Chiles': {'name': 'Arroz Verde with Hatch Chiles', 'rating': '', 'reviews': 0, 'total_time': 'T', 'ingredients': ['green chile', 'chicken broth', 'cilantro', 'olive oil', 'rice', 'onion', 'garlic', 'chicken bouillon', 'salt', 'cumin']}, 'Churro Cheesecake Bars': {'name': 'Churro Cheesecake Bars', 'rating': '', 'reviews': 0, 'total_time': 'T', 'ingredients': ['sugar', 'cinnamon', 'brown sugar cinnamon spreadable cream cheese', 'egg', 'vanilla', 'orange zest', 'crescent dough', 'butter', 'caramel']}, 'Cornbread Taco Bake': {'name': 'Cornbread Taco Bake', 'rating': '', 'reviews': 0, 'total_time': 'T', 'ingredients': ['beef', 'taco seasoning', 'tomatoes, chilies', 'corn', 'chili beans', 'cheddar cheese', 'cornbread', 'egg', 'milk', 'cream, onion']}, 'Blackened Shrimp Tacos with Pineapple': {'name': 'Blackened Shrimp Tacos with Pineapple', 'rating': '', 'reviews': 0, 'total_time': 'T', 'ingredients': ['paprika', 'onion', 'oregano', 'cayenne pepper', 'garlic', 'peppe