In [291]:
import pandas as pd
import numpy as np
import mysql.connector
from openai import OpenAI
import os
import re
import json 
from dotenv import load_dotenv
load_dotenv()

True

In [292]:
# Create the OpenAI client used for ChatGPT calls; the API key is read from
# the OPENAI_API_KEY environment variable (None when it is not set).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
gpt_client = OpenAI(api_key=OPENAI_API_KEY)

In [293]:
# Use ChatGPT to generate a meaningful sentence for a prompt.
def get_gpt_result(prompt):
    """Send ``prompt`` to the chat model and return the reply text.

    Args:
        prompt: Text sent as the content of a single ``user`` message.

    Returns:
        The content of the first choice with surrounding whitespace removed,
        or an empty string when the response has no choices or the first
        choice carries no text content.
    """
    response = gpt_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}]
    )
    if not response.choices:
        return ""
    # The message content is optional in the response; calling .strip() on
    # None would raise AttributeError, so fall back to an empty string.
    content = response.choices[0].message.content
    return content.strip() if content else ""


In [294]:
# Database connection: connect to the MySQL server.
# Each setting can be overridden through an environment variable (for example
# from the .env file loaded in the import cell) so that real credentials do
# not have to be written into the notebook. The defaults are the values this
# cell used before, so an unconfigured environment behaves as it did.
connection = mysql.connector.connect(
    host=os.getenv("MYSQL_HOST", "localhost"),
    user=os.getenv("MYSQL_USER", "root"),
    password=os.getenv("MYSQL_PASSWORD", "password"),
    database=os.getenv("MYSQL_DATABASE", "foodb"),
)

In [295]:
# Tokenize sentences and save them to disk.
def tokenize_save(key_name, sentences, output_dir="../datas/word2_vec"):
    """Tokenize ``sentences`` and write the result as ``.npy`` and JSON files.

    Each sentence is lower-cased and split into runs of word characters, which
    drops punctuation and other special characters. ``None`` entries are
    skipped.

    Two files are written into ``output_dir`` (created if it is missing):

    * ``{key_name}.npy`` - a 1-D NumPy object array with one token list per
      sentence. Object arrays are pickled by ``np.save``, so reading the file
      back requires ``np.load(..., allow_pickle=True)``.
    * ``{key_name}.txt`` - the same token lists serialized as JSON.

    Args:
        key_name: Base file name (without extension) for both output files.
        sentences: Iterable of strings; ``None`` items are ignored.
        output_dir: Directory that receives the files. Defaults to the
            location this function has always written to.
    """
    tokens = [
        re.findall(r'\b\w+\b', sentence.lower())
        for sentence in sentences
        if sentence is not None
    ]

    # Fill a pre-sized object array element by element. np.array(tokens,
    # dtype=object) would silently produce a 2-D array whenever every sentence
    # happens to have the same number of tokens, so the saved shape would
    # depend on the data.
    np_sentences = np.empty(len(tokens), dtype=object)
    for index, words in enumerate(tokens):
        np_sentences[index] = words

    os.makedirs(output_dir, exist_ok=True)
    np.save(os.path.join(output_dir, f"{key_name}.npy"), np_sentences)
    with open(os.path.join(output_dir, f"{key_name}.txt"), 'w', encoding='utf-8') as f:
        json.dump(tokens, f)

#### 食品グループ

In [137]:
# Fetch the food groups and their subgroups, then build two sentences per
# pair: a fixed "<group> include <subgroup>" template and a GPT-written one.
cursor = connection.cursor(dictionary=True)
food_groups = []
query = "select food_group, food_subgroup from foods where food_group is not null group by food_group, food_subgroup"
try:
    cursor.execute(query)
    rows = cursor.fetchall()
finally:
    # Release the cursor even when the query fails.
    cursor.close()

# Process the result one row at a time.
for row in rows:
    group = row['food_group']
    subgroup = row['food_subgroup']
    if subgroup is None:
        # The query only filters NULL food_group values; a NULL subgroup
        # would fail on .lower() and has nothing to relate the group to.
        continue

    food_groups.append(f"{group.lower()} include {subgroup.lower()}")

    prompt = f"Generate a sentence that explains the relationship between {group} and {subgroup}. For example: '{group} include {subgroup} and are commonly used in various dishes for flavor.'"
    gpt_sentence = get_gpt_result(prompt)
    food_groups.append(gpt_sentence)


In [138]:
# Tokenize the food-group sentences and write them to disk.
tokenize_save(key_name="food_groups", sentences=food_groups)

#### 食品データを保存

In [236]:
# Build the food corpus: template sentences from the foods table, one
# sentence per subtype found in contents, and a GPT-written usage note.
cursor = connection.cursor(dictionary=True)
cursor2 = connection.cursor(dictionary=True)
foods = []
query = """
select id, LOWER(name) as name, LOWER(name_scientific) as name_scientific, LOWER(food_group) as food_group,
LOWER(food_subgroup) as food_subgroup, LOWER(food_type) as food_type, LOWER(category) as category, LOWER(description) as description from foods
"""
# Subtype lookup. The food id is bound as a query parameter (%s) instead of
# being formatted into the SQL text, and the statement is built once here
# rather than on every loop iteration.
query2 = "SELECT LOWER(orig_food_common_name) as orig_food_common_name FROM contents WHERE food_id = %s and orig_food_common_name !='None' GROUP BY orig_food_common_name"

try:
    cursor.execute(query)

    # Process the result one row at a time.
    all_items = cursor.fetchall()
    for item in all_items:
        if not item['name']:
            # The subtype sentences and the GPT prompt are all phrased around
            # the food name, so a row without one cannot be described.
            continue

        # NULL columns come back as None. Skip a template whose values are
        # missing instead of writing the literal word "None" into the corpus.
        sentence_text = []
        names = [value for value in (item['name'], item['name_scientific']) if value]
        if item['category']:
            subject = " or ".join(names)
            sentence_text.append(f"{subject} are a type of {item['category']}.")
        if item['food_group'] and item['food_subgroup']:
            sentence_text.append(f"{item['food_group']} is under the subgroup of {item['food_subgroup']}. ")
        if item['food_type']:
            sentence_text.append(f"It is commonly used in {item['food_type']} applications. ")
        if item['description']:
            sentence_text.append(f"Description: {item['description']}")

        # Fetch the subtypes; a food whose lookup fails is reported and skipped.
        try:
            cursor2.execute(query2, (item['id'],))
            subtype_rows = cursor2.fetchall()
        except mysql.connector.Error as e:
            print("Error:", e)
            continue

        for item2 in subtype_rows:
            # Drop parenthesised remarks, then any leftover quotes/parentheses.
            name = re.sub(r'\s*\(.*?\)', '', item2['orig_food_common_name'])
            name = name.replace('"', '').replace("'", '').replace('’', '').replace('(','').replace(')','')
            if not name.strip():
                # Nothing left after cleaning; avoid "... has subtypes like ."
                continue
            sentence_text.append(f"{item['name']} has subtypes like {name}.")

        # Generate an additional description with GPT-3.5.
        prompt = f"Can you tell me how the ingredient '{item['name']}' is generally used in cooking? If there are any interesting or unique uses, please provide examples."
        gpt_sentence = get_gpt_result(prompt)
        sentence_text.append(gpt_sentence)
        foods.extend(sentence_text)
finally:
    # Release both cursors even when the loop is interrupted or fails.
    cursor.close()
    cursor2.close()

KeyboardInterrupt: 

In [181]:
# Tokenize the food sentences and write them to disk.
tokenize_save(key_name="foods", sentences=foods)

#### 香りのデータを保存

In [257]:
# Build the flavor corpus: for every (food_id, common name) pair in contents,
# emit one "<name> has a flavor of <flavor>." sentence per flavor returned.
cursor = connection.cursor(dictionary=True)
# One reusable cursor for the per-item lookups instead of a new, never-closed
# cursor on every iteration.
cursor2 = connection.cursor(dictionary=True)
flavors = []
query = "select food_id, orig_food_common_name from contents group by food_id, orig_food_common_name"
# food_id and the common name are bound as query parameters (%s). This removes
# the manual quote escaping, which also leaked backslashes into the generated
# sentences, and lets the name be matched exactly as it is stored.
# NOTE(review): ORDER BY is ascending, so LIMIT 15 keeps the flavors with the
# smallest MAX(standard_content) - confirm whether DESC was intended.
query2 = """
    SELECT flavors.name
    FROM
    foods INNER JOIN contents ON foods.id = contents.food_id
    INNER JOIN compounds ON contents.source_id = compounds.id
    LEFT JOIN compounds_flavors on compounds.id = compounds_flavors.compound_id
    LEFT JOIN flavors on compounds_flavors.flavor_id = flavors.id
    WHERE contents.citation_type ='DATABASE' AND contents.source_type = 'Compound' AND contents.standard_content > 0.0 AND contents.food_id=%s
    AND orig_food_common_name =%s and flavors.name is not null
    GROUP BY flavors.name
    ORDER by MAX(contents.standard_content) limit 15
    """

try:
    cursor.execute(query)

    # Process the result one row at a time.
    all_items = cursor.fetchall()
    for item in all_items:
        common_name = item["orig_food_common_name"]
        if common_name is None:
            continue

        # Text used in the sentences: surrounding spaces/backslashes removed.
        text = common_name.strip(" \\")
        params = (item["food_id"], common_name)
        try:
            cursor2.execute(query2, params)
            flavor_rows = cursor2.fetchall()
        except mysql.connector.Error as e:
            # Report the failing lookup and move on to the next item.
            print("Error:", e)
            print(query2, params)
            continue

        flavors.extend(
            f"{text} has a flavor of {flavor_row['name']}." for flavor_row in flavor_rows
        )
finally:
    # Release both cursors even when the loop is interrupted or fails.
    cursor.close()
    cursor2.close()

KeyboardInterrupt: 

In [258]:
# Tokenize the flavor sentences and write them to disk.
tokenize_save(key_name="flavors", sentences=flavors)

### 一般レシピのデータを保存

In [277]:
# Directory that holds the recipe JSON files.
directory_path = '../datas/json_recipes'
ingredients = []
# Read the files in the directory one by one. os.listdir returns entries in
# arbitrary order, so sort them to keep the corpus order reproducible.
for filename in sorted(os.listdir(directory_path)):
    file_path = os.path.join(directory_path, filename)
    if not os.path.isfile(file_path):
        # Sub-directories cannot be opened as files; skip them.
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if "recipes_results" not in data:
        continue

    for recipe in data["recipes_results"]:
        # Skip recipes whose ingredient list is missing, None or empty.
        if not recipe.get("ingredients"):
            continue
        # One sentence per recipe: its ingredients joined by single spaces.
        ingredients.append(" ".join(recipe['ingredients']))

In [289]:
# Tokenize the recipe ingredient sentences and write them to disk.
tokenize_save(key_name="ingredients1", sentences=ingredients)