In [369]:
import pandas as pd
import numpy as np
import mysql.connector
from openai import OpenAI
import os
import re
import json 
from dotenv import load_dotenv
load_dotenv()

True

In [370]:
# Read the OpenAI API key from the environment and build the shared
# ChatGPT client used by get_gpt_result().
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
gpt_client = OpenAI(api_key=OPENAI_API_KEY)

In [371]:
# Generate meaningful text with ChatGPT.
def get_gpt_result(prompt, model="gpt-3.5-turbo", client=None):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text sent as the content of the user message.
        model: Name of the chat model to call; defaults to "gpt-3.5-turbo".
        client: Object exposing `chat.completions.create(...)`. When omitted,
            the module-level `gpt_client` is used.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or "" when the response carries no choices or no text.
    """
    if client is None:
        client = gpt_client
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    if not response.choices:
        return ""
    content = response.choices[0].message.content
    # The API declares `content` as optional, so guard before calling .strip().
    return content.strip() if content else ""


In [372]:
# Connect to the MySQL `foodb` database.
# Connection settings are read from environment variables (which load_dotenv()
# can populate from a .env file) so credentials need not live in the notebook.
# The defaults are the previously hard-coded local development values.
connection = mysql.connector.connect(
    host=os.getenv("MYSQL_HOST", "localhost"),
    user=os.getenv("MYSQL_USER", "root"),
    password=os.getenv("MYSQL_PASSWORD", "password"),
    database=os.getenv("MYSQL_DATABASE", "foodb"),
)

In [377]:
# Tokenize sentences and save the result to disk.
def tokenize_save(key_name, sentences, output_dir="../datas/word2_vec"):
    """Tokenize `sentences` and save the token lists as .npy and JSON text.

    Each sentence is lower-cased and split into runs of ASCII letters, digits
    and underscores; punctuation and other characters are dropped. `None`
    entries are skipped.

    Args:
        key_name: Base file name (without extension) of the output files.
        sentences: Iterable of strings (or None) to tokenize.
        output_dir: Directory that receives `<key_name>.npy` and
            `<key_name>.txt`; it is created when missing.

    Side effects:
        Writes `<output_dir>/<key_name>.npy` (a 1-D object array holding one
        token list per sentence) and `<output_dir>/<key_name>.txt` (the same
        token lists serialized as JSON).
    """
    tokens = []
    for sentence in sentences:
        if sentence is None:
            continue
        # Lower-case, then keep only word-like runs (drops punctuation/symbols).
        words = re.findall(r'\b[a-zA-Z0-9_]+\b', sentence.lower())
        tokens.append(words)

    # Fill a pre-allocated 1-D object array element by element.
    # np.array(tokens, dtype=object) would silently build a 2-D array whenever
    # every sentence happens to have the same number of tokens.
    np_sentences = np.empty(len(tokens), dtype=object)
    for index, words in enumerate(tokens):
        np_sentences[index] = words

    os.makedirs(output_dir, exist_ok=True)

    # Save the object array (loading it back requires allow_pickle=True).
    np.save(os.path.join(output_dir, f"{key_name}.npy"), np_sentences)
    with open(os.path.join(output_dir, f"{key_name}.txt"), 'w', encoding='utf-8') as f:
        json.dump(tokens, f)

##### 食品グループ

In [378]:
# Normalize a text value into a single lower-case, underscore-joined token.
def format_text(text):
    """Return `text` lower-cased with hyphens/spaces turned into underscores.

    Hyphens are first converted to spaces, every space becomes an underscore,
    and newline characters are removed.

    Args:
        text: String to normalize, or None (e.g. a NULL database column).

    Returns:
        The normalized string; "" when `text` is None.
    """
    if text is None:
        return ''
    text = text.replace('-', ' ')
    text = text.replace(' ', '_')
    text = text.replace('\n', '')
    return text.lower()

In [384]:
# Fetch the distinct (food_group, food_subgroup) pairs and build two training
# sentences per pair: a fixed "<group> include <subgroup>" template and a
# GPT-generated sentence describing the same relationship.
cursor = connection.cursor(dictionary=True)
food_groups = []
query = "select food_group, food_subgroup from foods where food_group is not null group by food_group, food_subgroup"
try:
    cursor.execute(query)
    group_rows = cursor.fetchall()
finally:
    # Release the cursor even when the query fails.
    cursor.close()

for row in group_rows:
    # The query only filters NULL food_group, so food_subgroup may still be NULL.
    if row["food_subgroup"] is None:
        continue

    group_name = format_text(row["food_group"])
    subgroup_name = format_text(row["food_subgroup"])
    food_groups.append(f"{group_name} include {subgroup_name}")

    prompt = f"Generate a sentence that explains the relationship between {group_name} and {subgroup_name}. For example: '{group_name} include {subgroup_name} and are commonly used in various dishes for flavor.'"
    gpt_sentence = get_gpt_result(prompt)
    food_groups.append(gpt_sentence)

tropical_fruits
['herbs_and_spices include herbs', 'vegetables include cabbages', 'fruits include tropical_fruits', 'vegetables include onion_family_vegetables', 'nuts include nuts', 'herbs_and_spices include spices', 'vegetables include root_vegetables', 'vegetables include shoot_vegetables', 'cereals_and_cereal_products include cereals', 'vegetables include leaf_vegetables', 'herbs_and_spices include oilseed_crops', 'pulses include peas', 'teas include teas', 'vegetables include fruit_vegetables', 'gourds include gourds', 'fruits include citrus', 'coffee_and_coffee_products include coffee', 'fruits include pomes', 'fruits include berries', 'fruits include other_fruits', 'soy include soy', 'vegetables include tubers', 'pulses include lentils', 'pulses include other_pulses', 'pulses include beans', 'fruits include drupes', 'vegetables include stalk_vegetables', 'cocoa_and_cocoa_products include cocoa', 'beverages include fermented_beverages', 'cereals_and_cereal_products include other_

In [365]:
# Tokenize the food-group sentences and save them under the "food_groups" key.
tokenize_save("food_groups", food_groups)

#### 食品データを保存

In [366]:
# Build training sentences for every food: templated description sentences,
# one sentence per known subtype name from `contents`, and a GPT-generated
# note on how the ingredient is used in cooking.
# NOTE: one GPT request is issued per food, so this cell is slow; `foods` is
# extended after each food so an interrupted run keeps what it has collected.
cursor = connection.cursor(dictionary=True)
cursor2 = connection.cursor(dictionary=True)
foods = []
query = """
select id, name as name, name_scientific, food_group, 
food_subgroup, food_type, category, description from foods
"""
# Subtype lookup; the food id is bound as a parameter instead of being
# formatted into the SQL text.
query2 = (
    "SELECT LOWER(orig_food_common_name) as orig_food_common_name FROM contents "
    "WHERE food_id = %s and orig_food_common_name !='None' GROUP BY orig_food_common_name"
)
# Columns that are normalized with format_text() before use in sentences.
text_columns = ('name', 'name_scientific', 'category', 'food_group',
                'food_subgroup', 'food_type', 'description')

try:
    cursor.execute(query)
    all_items = cursor.fetchall()
    for item in all_items:
        for column in text_columns:
            # NULL columns arrive as None; map them to '' before formatting.
            item[column] = '' if item[column] is None else format_text(item[column])

        # NOTE(review): format_text() joins every word of `description` with
        # underscores, so the description ends up as long underscore-joined
        # tokens once tokenized - confirm this is intended.
        sentence_text = [
            f"{item['name']} or {item['name_scientific']} are a type of {item['category']}.",
            f"{item['food_group']} is under the subgroup of {item['food_subgroup']}. ",
            f"It is commonly used in {item['food_type']} applications. ",
            f"Description: {item['description']}",
        ]

        # Fetch the subtype names; a failed lookup skips this food entirely.
        try:
            cursor2.execute(query2, (item['id'],))
        except mysql.connector.Error as e:
            print("Error:", e)
            continue

        for item2 in cursor2.fetchall():
            # Drop parenthesized remarks, then any leftover quotes/parentheses.
            name = re.sub(r'\s*\(.*?\)', '', item2['orig_food_common_name'])
            name = name.replace('"', '').replace("'", '').replace('’', '').replace('(','').replace(')','')
            sentence_text.append(f"{item['name']} has subtypes like {name}.")

        # Ask GPT-3.5 for an additional explanatory sentence.
        prompt = f"Can you tell me how the ingredient '{item['name']}' is generally used in cooking? If there are any interesting or unique uses, please provide examples."
        gpt_sentence = get_gpt_result(prompt)
        sentence_text.append(gpt_sentence)
        foods.extend(sentence_text)
finally:
    # Release both cursors even when the loop is interrupted or fails.
    cursor2.close()
    cursor.close()

KeyboardInterrupt: 

In [312]:
# Tokenize the food sentences and save them under the "foods" key.
tokenize_save("foods", foods)

#### 香りのデータを保存

In [313]:
# Build "<food name> has a flavor of <flavor>." sentences for every distinct
# (food_id, orig_food_common_name) pair found in `contents`.
cursor = connection.cursor(dictionary=True)
cursor2 = connection.cursor(dictionary=True)
flavors = []
query = "select food_id, orig_food_common_name from contents group by food_id, orig_food_common_name"
# The food id and name are bound as parameters, so quotes or backslashes in a
# name cannot break the statement and no manual escaping is needed.
# NOTE(review): ORDER BY is ascending, so LIMIT 15 keeps the flavors with the
# lowest MAX(standard_content) - confirm whether DESC was intended.
query2 = """
    SELECT flavors.name
    FROM
    foods INNER JOIN contents ON foods.id = contents.food_id
    INNER JOIN compounds ON contents.source_id = compounds.id
    LEFT JOIN compounds_flavors on compounds.id = compounds_flavors.compound_id
    LEFT JOIN flavors on compounds_flavors.flavor_id = flavors.id
    WHERE contents.citation_type ='DATABASE' AND contents.source_type = 'Compound' AND contents.standard_content > 0.0 AND contents.food_id=%s
    AND orig_food_common_name =%s and flavors.name is not null
    GROUP BY flavors.name
    ORDER by MAX(contents.standard_content) limit 15
    """

try:
    cursor.execute(query)
    all_items = cursor.fetchall()
    for item in all_items:
        common_name = item["orig_food_common_name"]
        if common_name is None:
            continue

        # Match on the stored name; the trimmed form is only for the sentence.
        text = common_name.strip(" \\")
        try:
            cursor2.execute(query2, (item["food_id"], common_name))
        except mysql.connector.Error as e:
            # Best effort: report the failure and move on to the next food.
            print("Error:", e)
            print(query2)
            continue

        for item2 in cursor2.fetchall():
            flavors.append(f"{text} has a flavor of {item2['name']}.")
finally:
    # A single reusable lookup cursor replaces the per-iteration cursors that
    # were never closed; release both cursors here.
    cursor2.close()
    cursor.close()

In [314]:
# Tokenize the flavor sentences and save them under the "flavors" key.
tokenize_save("flavors", flavors)

### 一般レシピのデータを保存

In [315]:
# Collect one space-joined ingredient string per recipe from the JSON files
# stored in the recipe directory.
directory_path = '../datas/json_recipes'
ingredients = []
# os.listdir() returns entries in arbitrary order; sorting keeps the resulting
# corpus order reproducible between runs.
for filename in sorted(os.listdir(directory_path)):
    file_path = os.path.join(directory_path, filename)
    # Skip sub-directories and hidden entries (e.g. .DS_Store,
    # .ipynb_checkpoints) that would make open()/json.load() fail.
    if filename.startswith('.') or not os.path.isfile(file_path):
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if "recipes_results" not in data:
        continue

    for recipe in data["recipes_results"]:
        # Ignore recipes without an ingredient list or with an empty one.
        if "ingredients" not in recipe or not recipe["ingredients"]:
            continue
        ingredients.append(" ".join(recipe["ingredients"]))

In [316]:
# Tokenize the recipe ingredient strings and save them under the "ingredients1" key.
tokenize_save("ingredients1", ingredients)