In [1]:
import pandas as pd
import numpy as np
import mysql.connector
from openai import OpenAI
import os
import re
import json 
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Create the ChatGPT client; the API key is read from the OPENAI_API_KEY environment variable
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
gpt_client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
# Generate a natural-language sentence with ChatGPT
def get_gpt_result(prompt, model="gpt-3.5-turbo", client=None):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text sent as the content of the only (user) message.
        model: Chat model name; the default is the model this notebook has
            always used.
        client: Object exposing ``chat.completions.create``. When omitted,
            the notebook-level ``gpt_client`` is used.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or ``""`` when the response has no choices or no text.
    """
    if client is None:
        client = gpt_client

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    if not response.choices:
        return ""

    # The message content may be None (e.g. a refusal or a tool-call reply),
    # in which case calling .strip() directly would raise AttributeError.
    content = response.choices[0].message.content
    return content.strip() if content else ""


In [4]:
# Database connection
# Connect to MySQL. Settings are read from environment variables so that
# credentials do not have to live in the notebook; the previous hard-coded
# values remain as fallbacks for local development.
connection = mysql.connector.connect(
    host=os.getenv("MYSQL_HOST", "localhost"),
    user=os.getenv("MYSQL_USER", "root"),
    password=os.getenv("MYSQL_PASSWORD", "password"),
    database=os.getenv("MYSQL_DATABASE", "foodb"),
)

In [5]:
# Tokenize sentences and save them
def tokenize_save(key_name, sentences, output_dir="../datas/word2_vec"):
    """Tokenize ``sentences`` and write the result as ``.npy`` and ``.txt`` files.

    Each sentence is lower-cased and split into runs of ASCII letters, digits
    and underscores; every other character acts as a separator.

    Args:
        key_name: Base name (without extension) of the two output files.
        sentences: Iterable whose items are strings, lists/tuples of strings,
            or ``None``. ``None`` items are skipped; lists/tuples are joined
            with single spaces (ignoring ``None`` parts) before tokenizing.
        output_dir: Directory receiving the files; created when missing.

    Writes:
        ``{output_dir}/{key_name}.npy``: 1-D object array holding one token
        list per sentence.
        ``{output_dir}/{key_name}.txt``: the same token lists as JSON.
    """
    token_pattern = re.compile(r'\b[a-zA-Z0-9_]+\b')

    tokens = []
    for sentence in sentences:
        if sentence is None:
            continue

        if isinstance(sentence, (list, tuple)):
            sentence = " ".join(str(part) for part in sentence if part is not None)

        # Lower-case, then keep only word-like runs (drops punctuation and symbols)
        tokens.append(token_pattern.findall(sentence.lower()))

    # Fill a pre-sized object array element by element. np.array(tokens,
    # dtype=object) would build a 2-D array whenever every sentence happens
    # to have the same number of tokens, changing the saved layout.
    np_sentences = np.empty(len(tokens), dtype=object)
    for index, words in enumerate(tokens):
        np_sentences[index] = words

    os.makedirs(output_dir, exist_ok=True)

    # Persist both the NumPy dump and a JSON copy of the tokens
    np.save(os.path.join(output_dir, f"{key_name}.npy"), np_sentences)
    with open(os.path.join(output_dir, f"{key_name}.txt"), 'w', encoding='utf-8') as f:
        json.dump(tokens, f)

In [6]:
# Normalize free text into a lower-case, underscore-joined string
def format_text(text):
    """Collapse ``text`` into a lower-case, identifier-like string.

    ``None`` becomes an empty string. Otherwise the steps run in this order:

    1. ``( ) / ; : &`` become ``_``; square brackets, double quotes and
       angle brackets are dropped.
    2. Every comma followed by a space loses that space.
    3. Commas, hyphens and spaces become ``_``; periods, percent signs,
       apostrophes, backslashes, backticks, ``+ ! |``, line breaks and the
       characters é ç ã â ƒ € ” are dropped.
    4. Any whitespace that is still left is removed and the result is
       lower-cased.
    """
    if text is None:
        return ""

    bracket_table = str.maketrans('()/;:&', '______', '[]"<>')
    separator_table = str.maketrans(',- ', '___', ".\n\r%'\\+`!|éçãâƒ€”")

    cleaned = text.translate(bracket_table)
    # Runs after the bracket pass on purpose: dropping e.g. "[" can bring a
    # comma and a space next to each other.
    cleaned = cleaned.replace(', ', ',')
    cleaned = cleaned.translate(separator_table)
    return re.sub(r'\s+', '', cleaned).lower()

##### 食品グループ

In [7]:
# Fetch every distinct (food group, food subgroup) pair and turn it into sentences
cursor = connection.cursor(dictionary=True)
food_groups = []
query = "select food_group, food_subgroup from foods where food_group is not null group by food_group, food_subgroup"
try:
    cursor.execute(query)
    group_rows = cursor.fetchall()
finally:
    # Close the cursor even when the query fails
    cursor.close()

# Build two sentences per pair: a fixed template and a GPT-generated one.
# Note: this issues one GPT request per pair, so the cell can take a long time.
for row in group_rows:
    food_group = format_text(row["food_group"])
    food_subgroup = format_text(row["food_subgroup"])

    # format_text returns "" for NULL columns; such a pair would only yield a
    # dangling "<group> include " sentence and a meaningless GPT prompt.
    if not food_group or not food_subgroup:
        continue

    food_groups.append(f"{food_group} include {food_subgroup}")

    prompt = f"Generate a sentence that explains the relationship between `{food_group}` and `{food_subgroup}`. For example: '`{food_group}` include `{food_subgroup}` and are commonly used in various dishes for flavor.'"
    food_groups.append(get_gpt_result(prompt))

KeyboardInterrupt: 

In [None]:
# Tokenize the food-group sentences and save them under the "food_groups" key
tokenize_save("food_groups", food_groups)

#### 食品データを保存

In [None]:
# Build the training sentences for every food
cursor = connection.cursor(dictionary=True)
cursor2 = connection.cursor(dictionary=True)
foods = []
query = """
select id, name as name, name_scientific, food_group,
food_subgroup, category, description from foods
"""
# Sub-type lookup. The food id is bound as a query parameter instead of being
# formatted into the SQL text.
subtype_query = (
    "SELECT LOWER(orig_food_common_name) as orig_food_common_name FROM contents "
    "WHERE food_id = %s and orig_food_common_name !='None' GROUP BY orig_food_common_name"
)

try:
    cursor.execute(query)
    all_items = cursor.fetchall()

    # Process the foods one row at a time
    for item in all_items:
        name = format_text(item['name'])
        category = format_text(item['category'])
        food_group = format_text(item['food_group'])
        food_subgroup = format_text(item['food_subgroup'])
        name_scientific = item['name_scientific']

        # Nearly every sentence below (and the GPT prompt) is built around the
        # name, so a row without a usable one is skipped.
        if not name:
            continue

        sentence_text = []

        if category:
            if name_scientific:
                sentence_text.append(f"{name} or {name_scientific} are a type of {category}.")
            else:
                # Avoid emitting the literal word "None" for a missing scientific name
                sentence_text.append(f"{name} is a type of {category}.")

        if food_group and food_subgroup:
            # NOTE(review): the wording reads inverted (a subgroup normally sits
            # under a group) — confirm before changing the training text.
            sentence_text.append(f"{food_group} is under the subgroup of {food_subgroup}. ")

        # Pre-tokenized sentence "<name> means <word> <word> ...". Built with
        # unpacking because list.extend() returns None, which previously caused
        # None to be appended and the sentence to be dropped at save time.
        sentence_text.append([name, "means", *name.split('_')])

        # Fetch the sub-types of this food
        try:
            cursor2.execute(subtype_query, (item['id'],))
            subtype_rows = cursor2.fetchall()
        except mysql.connector.Error as e:
            # Same best-effort policy as before: report and skip this food
            print("Error:", e)
            continue

        for subtype_row in subtype_rows:
            subtype = format_text(subtype_row['orig_food_common_name'])
            if not subtype:
                continue
            sentence_text.append(f"{name} has subtypes like {subtype}.")
            sentence_text.append(subtype.split('_'))

        # Ask GPT-3.5 for an additional descriptive sentence
        prompt = f"Can you tell me how the ingredient '{name}' is generally used in cooking? If there are any interesting or unique uses, please provide examples."
        gpt_sentence = get_gpt_result(prompt)
        sentence_text.append(gpt_sentence)
        foods.extend(sentence_text)
finally:
    # Close both cursors whether or not the loop completed
    cursor2.close()
    cursor.close()

In [None]:
# Tokenize the food sentences and save them under the "foods" key
tokenize_save("foods", foods)

NameError: name 'foods' is not defined

In [None]:
# Build the sentences that link each food to its sub-types
cursor = connection.cursor(dictionary=True)
sub_foods = []
query = """
SELECT
    foods.name AS food_name,
    foods.name_scientific AS food_name_scientific,
    foods.food_group,
    foods.food_subgroup,
    contents.orig_food_common_name
FROM
    foods
INNER JOIN
    contents ON foods.id = contents.food_id
WHERE
    foods.name IS NOT NULL
GROUP BY
    foods.name,
    foods.name_scientific,
    foods.food_group,
    foods.food_subgroup,
    contents.orig_food_common_name;
"""

try:
    cursor.execute(query)
    all_items = cursor.fetchall()
finally:
    # Close the cursor even when the query fails
    cursor.close()

# Process the rows one at a time
for item in all_items:
    food_name = format_text(item["food_name"])
    orig_food_name = format_text(item["orig_food_common_name"])

    # Skip rows without a usable sub-type. A NULL column formats to "", and the
    # foods cell's sub-type query also filters out the literal string 'None',
    # which formats to "none". Keeping such rows would emit sentences such as
    # " is None.".
    if not food_name or not orig_food_name or orig_food_name == "none":
        continue

    # NOTE: unlike the foods and flavors cells, no GPT-generated sentence is
    # added for sub-types.
    sentences = [
        f"{food_name} has a sub_type of {orig_food_name}.",
        f"{food_name} is {item['food_name']}.",
        f"{orig_food_name} is {item['orig_food_common_name']}.",
    ]
    sub_foods.extend(sentences)

In [None]:
# Tokenize the food sub-type sentences and save them under the "food_sub_types" key
tokenize_save("food_sub_types", sub_foods)

#### 香りのデータを保存

In [None]:
 # 香りのデータを保存

cursor = connection.cursor(dictionary=True)
flavors = []
query = "select name, flavor_group, category from flavors where name is not null"
try:
    cursor.execute(query)
    all_items = cursor.fetchall()
finally:
    # Close the cursor even when the query fails
    cursor.close()

# Process the flavors one row at a time
for item in all_items:
    flavor_name = format_text(item["name"])
    if not flavor_name:
        # The name consisted only of characters that format_text strips
        continue

    # Start a fresh list for every flavor. The cell previously appended to a
    # `sentences` variable it never created, so it failed on a fresh kernel or
    # picked up sentences left over from an earlier cell.
    sentences = [f"{flavor_name} is {item['name']}"]
    if item["category"] is not None:
        sentences.append(f"{flavor_name} is a member of the {item['category']}")
    if item["flavor_group"] is not None:
        sentences.append(f"{flavor_name} is a member of the {item['flavor_group']} group")

    # Ask GPT-3.5 for an additional descriptive sentence
    prompt = f"""
    Please describe the '{flavor_name}' flavor profile, 
    including its key characteristics and how it is generally perceived. Provide examples of foods or ingredients that have a '{flavor_name}' flavor.
    """
    gpt_sentence = get_gpt_result(prompt)
    sentences.append(gpt_sentence)

    # Collect this flavor's sentences inside the loop, once per flavor
    flavors.extend(sentences)

In [None]:
# Tokenize the flavor sentences and save them under the "flavors" key
tokenize_save("flavors", flavors)

### 一般レシピのデータを保存

In [None]:
# Directory that holds the recipe JSON files
directory_path = '../datas/json_recipes'
ingredients = []
# Read the files one by one. sorted() makes the order (and therefore the saved
# output) reproducible, because os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(directory_path)):
    file_path = os.path.join(directory_path, filename)
    if not os.path.isfile(file_path):
        # Sub-directories cannot be opened as files
        continue

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # A stray non-JSON file should not abort the whole run
        print(f"Skipping {file_path}: {e}")
        continue

    if not isinstance(data, dict) or "recipes_results" not in data:
        continue

    for recipe in data["recipes_results"]:
        if not isinstance(recipe, dict) or "ingredients" not in recipe:
            continue

        recipe_ingredients = recipe['ingredients']
        if not recipe_ingredients:
            continue
        # One space-joined string per recipe; str() guards against non-string entries
        ingredients.append(" ".join(str(entry) for entry in recipe_ingredients))

In [None]:
# Tokenize the recipe ingredient strings and save them under the "ingredients1" key
tokenize_save("ingredients1", ingredients)