In [23]:
from neo4j import GraphDatabase
import pandas as pd
from openai import OpenAI
import numpy as np
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import json
from collections import Counter
import mysql.connector
import os
import re
from dotenv import load_dotenv
load_dotenv()
from sklearn.preprocessing import MinMaxScaler

In [24]:
# This notebook calls ChatGPT, so read the API key from the environment
# (populated by load_dotenv() above) and build the shared client.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
gpt_client = OpenAI(api_key=OPENAI_API_KEY)

In [25]:
# Use ChatGPT to generate meaningful text from a prompt.
def get_gpt_result(prompt, model="gpt-3.5-turbo"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text sent as the only ``user`` message of the chat request.
        model: Chat model name forwarded to the API. Defaults to
            ``"gpt-3.5-turbo"``.

    Returns:
        The content of the first choice with surrounding whitespace stripped,
        or ``""`` when the reply carries no text content.
    """
    response = gpt_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    # The message content can be None (no text in the reply); calling
    # .strip() on it would raise AttributeError, so normalise to "".
    content = response.choices[0].message.content
    if content is None:
        return ""
    return content.strip()


def extract_ingredient_name(text):
    """Ask the chat model for the bare ingredient name contained in ``text``.

    Args:
        text: One raw ingredient line, e.g. ``"1 (14 ounce) can chicken broth"``.

    Returns:
        Whatever ``get_gpt_result`` returns for the extraction prompt, or
        ``""`` when ``text`` is ``None`` or blank (no request is made then).
    """
    # There is nothing to extract from None/blank input, and sending an empty
    # prompt would only make the model invent a reply.
    if text is None or not str(text).strip():
        return ""

    prompt = f"""
Extract the ingredient name from the text below. Remove any quantities, units, or modifiers.

Input:
{text}

Output:
"""
    return get_gpt_result(prompt)

In [26]:
# テキストフォーマットの指定
def format_text(text):
    if text is None:
        return ""
    text = text.replace('(', '_')
    text = text.replace(')', '_')
    text = text.replace("/", '_')
    text = text.replace(";", '_')
    text = text.replace(":", '_')
    text = text.replace("&", '_')
    text = text.replace("[", '')
    text = text.replace("]", '')    
    text = text.replace('"', '')
    text = text.replace('<', '')
    text = text.replace('>', '')
    text = text.replace(', ', ',')
    text = text.replace('.', '')
    text = text.replace(',', '_')
    text = text.replace('-', ' ')
    text = text.replace(' ', '_')
    text = text.replace('\n', '')
    text = text.replace('%', '')
    text = text.replace("'", '')
    text = text.replace("/t", '')
    text = text.replace("\\", '')
    text = text.replace("é", '')
    text = text.replace("ç", '')
    text = text.replace("+", '')
    text = text.replace("`", '')
    text = text.replace("`", '')
    text = text.replace("`", '')
    text = text.replace("`", '')
    text = text.replace("ã", '')
    text = text.replace("â", '')
    text = text.replace("ƒ", '')
    text = text.replace("€", '')
    text = text.replace('”', '')
    text = text.replace('`', '')
    text = text.replace('!', '')
    text = text.replace('|', '')
    text = text.replace("\n", "")
    text = text.replace("\r", "")
    text = re.sub(r'\s+', '', text)
    text = text.lower()
    return text

def escape_sql_string(text):
    """Return ``text`` with trailing backslashes dropped and single quotes doubled.

    Args:
        text: The string to embed inside a single-quoted SQL literal.

    Returns:
        ``text`` without any backslashes at its end and with every ``'``
        replaced by ``''``.
    """
    # NOTE(review): backslashes that are not at the end are left untouched;
    # if the target database treats "\" as an escape character this is not a
    # complete escape. Prefer parameterised queries where possible.
    return text.rstrip("\\").replace("'", "''")

In [27]:
# Tokenize the sentences and save them to disk.
def tokenize_save(key_name, sentences, out_dir="../datas/word2_vec"):
    """Tokenize ``sentences`` and write them as ``<key_name>.npy`` and ``<key_name>.txt``.

    Each sentence is lowercased and split into runs of ASCII letters, digits
    and underscores. A sentence given as a list is first joined with spaces;
    ``None`` sentences (and ``None`` items inside a list) are skipped.

    Args:
        key_name: Base file name (without extension) for both output files.
        sentences: Iterable of strings, lists of strings, or ``None`` entries.
        out_dir: Directory the files are written to; created if missing.

    Returns:
        None. The ``.npy`` file holds a 1-D object array with one token list
        per kept sentence; the ``.txt`` file holds the same data as JSON.
    """
    token_pattern = re.compile(r'\b[a-zA-Z0-9_]+\b')

    tokens = []
    for sentence in sentences:
        if sentence is None:
            continue

        if isinstance(sentence, list):
            sentence = " ".join(part for part in sentence if part is not None)

        # Lowercase, then keep only word-like runs (drops punctuation/symbols).
        tokens.append(token_pattern.findall(sentence.lower()))

    # Fill a pre-sized 1-D object array element by element.
    # np.array(tokens, dtype=object) would instead produce a 2-D array whenever
    # every token list happens to have the same length, so the saved shape
    # would depend on the data.
    np_sentences = np.empty(len(tokens), dtype=object)
    for index, words in enumerate(tokens):
        np_sentences[index] = words

    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, f"{key_name}.npy"), np_sentences)
    with open(os.path.join(out_dir, f"{key_name}.txt"), 'w', encoding="utf-8") as f:
        json.dump(tokens, f)

In [28]:
# Manual check on one sample ingredient line; running this cell sends a real
# request through gpt_client.
extract_ingredient_name('1 (14 ounce) can chicken broth')

'chicken broth'

In [31]:
JSON_PATH = "../datas/html_recipes/out_jsons"
def get_json_files(json_dir=None):
    """List the recipe files in ``json_dir`` in a stable order.

    Args:
        json_dir: Directory to scan. When omitted, the module-level
            ``JSON_PATH`` is read at call time.

    Returns:
        Sorted file names (not full paths) of the regular, non-hidden files
        directly inside the directory.
    """
    if json_dir is None:
        json_dir = JSON_PATH
    # os.listdir also returns sub-directories and hidden entries such as
    # ".ipynb_checkpoints" or ".DS_Store", which cannot be opened as recipe
    # JSON; its order is arbitrary, so sort for reproducible runs.
    return sorted(
        name
        for name in os.listdir(json_dir)
        if not name.startswith(".") and os.path.isfile(os.path.join(json_dir, name))
    )

# Build one "sentence" (list of extracted ingredient names) per recipe found
# in the files under JSON_PATH, then tokenize and save the corpus.
# NOTE(review): every ingredient line triggers its own chat-completion request.
sentences = []
for file in get_json_files():
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(f"{JSON_PATH}/{file}", encoding="utf-8") as f:
        data = json.load(f)

    for item in data:
        # Recipes without an ingredient list contribute nothing.
        if 'ingredients' not in item or item['ingredients'] is None:
            continue

        sentences.append(
            [extract_ingredient_name(ingri) for ingri in item['ingredients']]
        )

tokenize_save("out_json_receipe", sentences)

In [39]:
# NOTE(review): this rebinds JSON_PATH from the out_jsons directory to a single
# file. get_json_files() reads JSON_PATH when called, so calling it after this
# cell has run will fail unless the earlier cell is re-run first.
JSON_PATH = "../datas/json_recipes/db-recipes.json"
sentences = []
# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(JSON_PATH, encoding="utf-8") as f:
    data = json.load(f)

# The file maps recipe ids to recipe records; only the records are needed.
for item in data.values():
    # Skip records with a missing or null ingredient list instead of crashing.
    if 'ingredients' not in item or item['ingredients'] is None:
        continue

    sentences.append(
        [extract_ingredient_name(ingri) for ingri in item['ingredients']]
    )

tokenize_save("json_master_recipe", sentences)