In [495]:
import itertools
import json
import os
from collections import Counter

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from neo4j import GraphDatabase
from sklearn.decomposition import PCA

In [496]:
# Neo4j connection settings.
# Each value can be overridden through an environment variable so credentials
# do not have to live in the notebook; the fallbacks keep the previous values
# so the notebook still runs unchanged against a local database.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
# NOTE(review): this fallback password is stored in plain text; rotate it and
# remove the default once NEO4J_PASSWORD is set wherever this notebook runs.
password = os.environ.get("NEO4J_PASSWORD", "abcd7890")

# Create the driver shared by the cells below (closed in the last cell)
driver = GraphDatabase.driver(uri, auth=(user, password))

In [497]:
# Load the molecule and ingredient tables. header=0 replaces each file's own
# header row with the column labels given in `names`.
molecule_db = pd.read_csv(
    '../datas/molecules.csv',
    header=0,
    names=['id', 'none', 'pubchem_id', 'common_name', 'flavor_profile'],
)
# The `id` column is dropped on load; entries are keyed by `entry_id` below.
flavor_db = pd.read_csv(
    '../datas/flavor_db.csv',
    header=0,
    names=['id', 'entry_id', 'alias', 'synonyms', 'scientific_name', 'category', 'molecules'],
).drop(columns=['id'])

flavor_db.tail(3)

Unnamed: 0,Unnamed: 1,entry_id,alias,synonyms,scientific_name,category,molecules
937,,981,Baking powder,,,,"{11265, 62465, 644104, 12297, 31242, 527, 4114..."
938,,982,Baking soda,,,,{10340}
939,,983,alum,,,,{24856}


In [498]:
# Show the last rows of the molecule table
molecule_db.tail(3)

Unnamed: 0,id,none,pubchem_id,common_name,flavor_profile
1788,1788,,10340,Sodium Carbonate,odorless
1789,1789,,24856,Potassium alum,odorless
1790,1790,,24403,Tetrasodium Pyrophosphate,odorless


In [499]:
# Show the last rows of the ingredient table
flavor_db.tail(2)

Unnamed: 0,Unnamed: 1,entry_id,alias,synonyms,scientific_name,category,molecules
938,,982,Baking soda,,,,{10340}
939,,983,alum,,,,{24856}


In [500]:
# Build the flavor vocabulary and load the embedding model used to vectorise
# flavor profiles (the vectors are later stored on the graph for clustering).

# Parse every molecule's flavor_profile text into a list of flavor names:
# single quotes are removed, surrounding braces stripped, then split on ', '.
all_flavors = molecule_db['flavor_profile'].apply(lambda x: x.replace("'", "").strip('{}').split(', ')).tolist()

# De-duplicate while keeping first-seen order. dict.fromkeys does this in a
# single pass instead of scanning a growing list once per flavor.
unique_flavors = list(dict.fromkeys(flavor for flavors in all_flavors for flavor in flavors))

# The Word2Vec model is loaded from disk rather than trained here. An earlier
# revision trained it in place with:
#   Word2Vec(sentences=all_flavors, vector_size=100, window=5, min_count=1, sg=1)
model = Word2Vec.load("../datas/fine_tuned_word2vec.model")
print(model.vector_size)  # expected to be 100

# Look up one flavor vector and check its length
green_vector = model.wv['green']
print(len(green_vector))

100
100


In [501]:
def similar_to_json(word):
    """Return the model's nearest neighbours of `word` as a JSON object string.

    Keys are the neighbouring words and values their similarity scores, in the
    order returned by ``model.wv.most_similar``. The JSON text is indented by
    four spaces.
    """
    neighbours = dict(model.wv.most_similar(positive=[word]))
    return json.dumps(neighbours, indent=4)

In [502]:
# Build the Flavor nodes

## Remove the Flavor nodes left over from a previous run
def initialize_flavors(tx):
    """Delete every existing Flavor node together with its relationships.

    Two statements run in order: the first targets Flavor nodes reached from a
    Molecule through HAS_FLAVOR, the second targets all Flavor nodes.
    """
    cleanup_statements = (
        "MATCH (m:Molecule)-[:HAS_FLAVOR]->(f:Flavor) DETACH DELETE f;",
        "MATCH (f:Flavor) DETACH DELETE f;",
    )
    for statement in cleanup_statements:
        tx.run(statement)

# Create one Flavor node
def create_flavors(tx, flavor_name):
    """Create a Flavor node for `flavor_name`.

    The node stores the flavor name as `id`, the JSON text returned by
    similar_to_json as `most_similar`, and the model's vector for the flavor
    (converted from a NumPy array to a plain list) as `flavor_vector`.
    """
    properties = {
        "flavor_vector": model.wv[flavor_name].tolist(),
        "most_similar": similar_to_json(flavor_name),
        "id": flavor_name,
    }
    tx.run("""
        CREATE (m:Flavor {
            id: $id,
            most_similar: $most_similar,
            flavor_vector: $flavor_vector
        })
        """,
        **properties)

# Add the uniqueness constraint for Flavor nodes
def append_molecules_index(tx):
    """Create the uniqueness constraint on Flavor.id if it does not exist yet.

    NOTE(review): despite its name this function targets the Flavor label. It
    is not called in this cell, and a later cell defines another function with
    the same name (for Molecule) that replaces it.
    """
    # The constrained property must use the variable bound in FOR (`f`); the
    # previous text referenced an unbound `m`.
    tx.run("CREATE CONSTRAINT IF NOT EXISTS FOR (f:Flavor) REQUIRE f.id IS UNIQUE")

# Rebuild the Flavor nodes: clear the old ones, then create one node per
# flavor name, skipping the placeholder values 'None' and ''.
with driver.session() as session:
    session.execute_write(initialize_flavors)
    for flavor_name in unique_flavors:
        if flavor_name not in ('None', ''):
            session.execute_write(create_flavors, flavor_name)

In [509]:
# Collect the distinct category names found in the ingredient table
all_categories = [
    str(raw_category).replace("'", "").strip('{}').split(', ')
    for raw_category in flavor_db['category']
]
unique_categories = {cate for catelist in all_categories for cate in catelist}

# Make Category Group
# Mapping from category-group name to the category names it contains.
# NOTE: 'plant' and 'vegetable stem' are listed under both "earth" and "green",
# and 'essential oil' under both "tropical" and "oil". find_category_for_item
# returns the first group (in the order written here) that lists an item, so
# those items resolve to "earth" and "tropical" respectively.
category_data = {
    "earth": ['fungus', 'vegetable root', 'vegetable tuber', 'nut', 'plant', 'vegetable stem'],
    "green": ['plant', 'vegetable stem', 'cabbage', 'legume', 'gourd', 'vegetable', 'vegetable fruit', 'plant derivative'],
    "tropical": ['fruit', 'fruit essence', 'fruit citrus', 'fruit-berry', 'essential oil', 'flower', 'berry'],
    "oceans": ['seafood', 'fish'],
    "mountain": ['meat', 'dairy'],
    "field": ['cereal', 'bakery', 'maize'],
    "spice": ['seed', 'additive', 'spice', 'herb'],
    "oil": ['essential oil'],
    "drink": ['beverage', 'beverage caffeinated', 'beverage alcoholic'],
    "other": ['dish']
}

def find_category_for_item(item):
    """Return the name of the first group in category_data that lists `item`.

    Falls back to "other" when no group contains the item.
    """
    matching_groups = (
        group for group, members in category_data.items() if item in members
    )
    return next(matching_groups, "other")

def initialize_categories(tx):
    """Delete all Category nodes, then all CategoryGroup nodes, with their relationships."""
    for statement in (
        "MATCH (c:Category) DETACH DELETE c;",
        "MATCH (c:CategoryGroup) DETACH DELETE c;",
    ):
        tx.run(statement)

def create_category_group(tx, cate_group):
    """Create a CategoryGroup node whose `id` is `cate_group`."""
    create_group_query = """
        CREATE (c:CategoryGroup {
            id: $id
        })
        """
    tx.run(create_group_query, id=cate_group)
 
def create_categories(tx, category, cate_group):
    """Create a Category node and attach it to its CategoryGroup.

    Args:
        tx: Transaction object exposing ``run``.
        category: Category name; stored as both `id` and `name` of the node and
            looked up in the Word2Vec model for its vector.
        cate_group: Id of the CategoryGroup the category is grouped under.
    """
    # Convert the NumPy vector to a plain list of floats, as create_flavors
    # does, so the driver receives a list rather than an ndarray.
    category_vector = model.wv[category].tolist()
    most_similar = similar_to_json(category)

    tx.run("""
        CREATE (c:Category {
            id: $name,
            name: $name,
            most_similar: $most_similar,
            category_vector: $category_vector
        })
        """,
        name=category,
        most_similar=most_similar,
        category_vector=category_vector)

    # Link the group to the category. MERGE reuses the nodes that already
    # exist and would create the group node if it were missing.
    tx.run("""
        MERGE (e:CategoryGroup {id: $cate_group})
        MERGE (c:Category {id: $category})
        MERGE (e)-[:GROUPED]->(c)
    """,
    cate_group=cate_group, category=category)


# Rebuild the category graph: clear the old nodes, create every group, then
# create each category under the group chosen by find_category_for_item.
# The placeholder names 'None', 'nan' and '' are skipped.
with driver.session() as session:
    session.execute_write(initialize_categories)

    for cate_group in category_data:
        session.execute_write(create_category_group, cate_group)

    for category in unique_categories:
        if category not in ('None', 'nan', ''):
            session.execute_write(create_categories, category, find_category_for_item(category))

  with driver.session() as session:


In [504]:

# Build the Molecule nodes
# Zero vector with 100 components, written to molecules whose vector contains NaN
DEFAULT_VECTOR = [0.0] * 100

## Remove the Molecule nodes left over from a previous run
def initialize_molecules(tx):
    """Delete HAS_FLAVOR relationships from Molecule to Flavor, then all Molecule nodes."""
    for statement in (
        "MATCH (m:Molecule)-[r:HAS_FLAVOR]->(f:Flavor) DELETE r",
        "MATCH (m:Molecule) DETACH DELETE m;",
    ):
        tx.run(statement)

## Insert one molecule
def insert_molecules(tx, molecule_data):
    """Create a Molecule node and link it to its Flavor nodes.

    Args:
        tx: Transaction object exposing ``run``.
        molecule_data: Mapping (e.g. a DataFrame row) with the keys `id`,
            `pubchem_id`, `common_name` and `flavor_profile`. The profile is a
            string of flavor names separated by ', ', optionally quoted and
            wrapped in braces.

    The node's `flavor_vector` is the element-wise sum of the Word2Vec vectors
    of the flavors known to the model, or a zero vector when none is known.
    """
    molecule_id = int(molecule_data['id'])
    flavor_str = molecule_data['flavor_profile']
    flavors = flavor_str.replace("'", "").strip('{}').split(', ')

    # Placeholder names carry no flavor information. The previous check used
    # ['' 'None'], which Python concatenates into the single string 'None', so
    # the empty name was never excluded.
    flavor_vectors = [
        model.wv[flavor_name]
        for flavor_name in flavors
        if flavor_name not in ('', 'None') and flavor_name in model.wv
    ]

    vector_size = model.vector_size
    if flavor_vectors:
        # Sum the per-flavor vectors component by component
        flavor_vector = np.sum(flavor_vectors, axis=0)
    else:
        flavor_vector = np.zeros(vector_size)

    # Print the ids of molecules whose vector does not have the model's dimension
    if len(flavor_vector.shape) > 0 and len(flavor_vector) != vector_size:
        print(molecule_id)

    # The vector is sent as a plain list of floats (as create_flavors does)
    # rather than as a NumPy array.
    tx.run("""
    CREATE (m:Molecule {
        id: $id,
        pubchem_id: $pubchem_id,
        common_name: $common_name,
        flavor_profile: $flavor_profile,
        flavor_vector: $flavor_vector
    })
    """,
    id=molecule_id,
    pubchem_id=int(molecule_data['pubchem_id']),
    common_name=molecule_data['common_name'],
    flavor_vector=flavor_vector.tolist(),
    flavor_profile=flavors)

    # Link the molecule to each of its flavors. The MATCH on Flavor finds
    # nothing for names without a Flavor node, so those add no relationship.
    for flavor in flavors:
        tx.run("""
            MATCH (m:Molecule {id: $molecule_id})
            MATCH (f:Flavor {id: $flavor_id})
            OPTIONAL MATCH (m)-[r:HAS_FLAVOR]->(f)
            WITH m, f, r
            WHERE r IS NULL
            MERGE (m)-[:HAS_FLAVOR]->(f)
        """, flavor_id=flavor, molecule_id=molecule_id)
    
def append_molecules_index(tx):
    """Create the Molecule constraint and indexes if they do not exist yet.

    Adds, in order, a uniqueness constraint on `id`, an index on `pubchem_id`
    and an index on `flavor_profile`.
    """
    schema_statements = (
        "CREATE CONSTRAINT IF NOT EXISTS FOR (m:Molecule) REQUIRE m.id IS UNIQUE",
        "CREATE INDEX IF NOT EXISTS FOR (m:Molecule) ON (m.pubchem_id)",
        "CREATE INDEX IF NOT EXISTS FOR (m:Molecule) ON (m.flavor_profile)",
    )
    for statement in schema_statements:
        tx.run(statement)
         
def append_default_molecules(tx):
    """Replace every Molecule flavor_vector that contains NaN with DEFAULT_VECTOR."""
    nan_reset_query = """
        MATCH (m:Molecule)
        WHERE any(x IN m.flavor_vector WHERE toString(x) = 'NaN')
        SET m.flavor_vector = $default_vector
        """
    tx.run(nan_reset_query, default_vector=DEFAULT_VECTOR)

# Rebuild the Molecule nodes: clear the old ones, insert one node per table
# row, then reset NaN vectors and create the constraint and indexes.
with driver.session() as session:
    session.execute_write(initialize_molecules)
    for _row_index, molecule_row in molecule_db.iterrows():
        session.execute_write(insert_molecules, molecule_row)
    for finishing_step in (append_default_molecules, append_molecules_index):
        session.execute_write(finishing_step)


  with driver.session() as session:


In [505]:
# Build the Entry nodes

## Remove the Entry nodes left over from a previous run
def initialize_entry(tx):
    """Delete Entry relationships (HAS_CATEGORY, CONTAINS) and then the Entry nodes."""
    cleanup_statements = (
        # relationships to Category nodes
        "MATCH (m:Entry)-[r:HAS_CATEGORY]->(c:Category) DELETE r",
        # relationships to Molecule nodes
        "MATCH (m:Entry)-[r:CONTAINS]->(molecule:Molecule) DELETE r",
        # the Entry nodes themselves
        "MATCH (m:Entry) DETACH DELETE m",
    )
    for statement in cleanup_statements:
        tx.run(statement)
    
## Insert one entry
def _is_missing(value):
    """Return True for None or a pandas/NumPy missing-value marker such as NaN."""
    return value is None or bool(pd.isna(value))


def insert_entry(tx, entry_data):
    """Create an Entry node and link it to its Category and Molecule nodes.

    Args:
        tx: Transaction object exposing ``run``.
        entry_data: Mapping (e.g. a DataFrame row) with the keys `entry_id`,
            `alias`, `synonyms`, `scientific_name`, `category` and `molecules`.
            `molecules` is a brace-wrapped list of ids separated by ', '.

    Missing values (empty CSV cells, read by pandas as NaN) are stored as null
    properties and are left out of `search_query`; a missing category creates
    no HAS_CATEGORY relationship and a missing `molecules` value creates no
    CONTAINS relationships.
    """
    def present(key):
        # Normalise missing cells to None so they are never rendered as 'nan'
        value = entry_data[key]
        return None if _is_missing(value) else value

    entry_id = int(entry_data['entry_id'])
    synonyms = present('synonyms')
    scientific_name = present('scientific_name')
    category = present('category')

    raw_molecules = present('molecules')
    if raw_molecules is None:
        molecule_ids = []
    else:
        tokens = str(raw_molecules).replace("'", "").strip('{}').split(', ')
        molecule_ids = [int(token) for token in tokens if token.strip()]

    # Full-text search text: synonyms, scientific name and category, separated
    # by single spaces, using only the values that are actually present.
    search_terms = []
    if synonyms is not None:
        search_terms.extend(str(synonyms).replace("'", "").strip('{}').split(', '))
    search_terms.extend(str(value) for value in (scientific_name, category) if value is not None)
    search_query = ' '.join(search_terms)

    most_similar = similar_to_json(entry_data['alias'])

    tx.run("""
    CREATE (e:Entry {
        id: $entry_id,
        name: $alias,
        synonyms: $synonyms,
        scientific_name: $scientific_name,
        category: $category,
        search_query: $search_query,
        most_similar: $most_similar
    })
    """,
    entry_id=entry_id,
    alias=entry_data['alias'],
    synonyms=synonyms,
    scientific_name=scientific_name,
    category=category,
    search_query=search_query,
    most_similar=most_similar)

    # Only link a real category; a missing one used to produce a Category node
    # with the id 'nan'.
    if category is not None:
        tx.run("""
        MERGE (e:Entry {id: $entry_id})
        MERGE (c:Category {id: $category_name})
        MERGE (e)-[:HAS_CATEGORY]->(c)
     """, entry_id=entry_id, category_name=str(category))

    # The ids listed for an entry correspond to the molecule table's
    # `pubchem_id` column (Molecule.id holds that table's own `id` column), so
    # molecules are matched on pubchem_id.
    for pubchem_id in molecule_ids:
        tx.run("""
            MATCH (e:Entry {id: $entry_id})
            MATCH (m:Molecule {pubchem_id: $pubchem_id})
            MERGE (e)-[:CONTAINS]->(m)
        """,
        entry_id=entry_id,
        pubchem_id=pubchem_id)
     
def append_entry_index(tx):
    """Create the Entry constraint and indexes.

    Issues, in order: a uniqueness constraint on `id`, an index on `molecules`,
    a drop of the full-text index `my_text_index` if it exists, and the
    creation of that full-text index over `search_query`.

    NOTE(review): insert_entry does not set a `molecules` property on Entry
    nodes, so the index on it looks unused - confirm before relying on it.
    """
    schema_statements = (
        # uniqueness constraint on the entry id
        "CREATE CONSTRAINT IF NOT EXISTS FOR (m:Entry) REQUIRE m.id IS UNIQUE",
        # index on the molecules property
        "CREATE INDEX IF NOT EXISTS FOR (m:Entry) ON (m.molecules)",
        # recreate the full-text index from scratch
        "DROP INDEX my_text_index IF EXISTS",
        "CREATE FULLTEXT INDEX my_text_index FOR (n:Entry) ON EACH [n.search_query]",
    )
    for statement in schema_statements:
        tx.run(statement)
    
# Rebuild the Entry nodes: clear the old ones, insert one node per row of the
# ingredient table, then create the constraint and indexes.
with driver.session() as session:
    session.execute_write(initialize_entry)
    for _row_index, entry_row in flavor_db.iterrows():
        session.execute_write(insert_entry, entry_row)
    session.execute_write(append_entry_index)

In [506]:
def agregate_vector(tx):
    """Store on each Entry the element-wise sum of its molecules' flavor vectors.

    Only entries with at least one CONTAINS-linked Molecule that has a
    `flavor_vector` are updated; the sum is written to `e.flavor_vector`.
    """
    # The accumulator starts as zeros with the same length as the first
    # collected vector, so the query does not hard-code the embedding
    # dimension (it used to embed a literal list of 100 zeros). COLLECT never
    # yields an empty list here because every row comes from a matched molecule.
    tx.run("""
        MATCH (e:Entry)-[:CONTAINS]->(m:Molecule)
        WHERE m.flavor_vector IS NOT NULL
        WITH e, COLLECT(m.flavor_vector) AS flavor_vectors
        SET e.flavor_vector = REDUCE(
            sum = [component IN head(flavor_vectors) | 0.0],
            flavor_vector IN flavor_vectors |
            [x IN RANGE(0, SIZE(sum)-1) | sum[x] + flavor_vector[x]]
        )
    """)
    

# Store the aggregated vector data on the Entry nodes
with driver.session() as session:
    session.execute_write(agregate_vector)

In [507]:
# Compute per-entry flavor statistics (counts, PCA-based scores, principal flavor)
def _flavors_from_profile(profile):
    """Return the flavor names held in a Molecule's `flavor_profile` value."""
    if isinstance(profile, (list, tuple)):
        return [str(flavor) for flavor in profile]
    # Fallback for values that arrive as the text form of a list
    return str(profile).strip("[]' ").split("', '")


def append_pca_entries(session):
    """Write flavor statistics onto every Entry node.

    For each Entry the flavor names of all CONTAINS-linked molecules are
    counted. Names that are placeholders ('', 'None', 'nan') or unknown to the
    Word2Vec model are counted but not scored. Three properties are then set:

    * `flavor_count`: JSON object of flavor name -> occurrence count, most
      frequent first.
    * `paring_scores`: JSON object of flavor name -> score. With two or more
      scored flavors a PCA is fitted on their vectors and the i-th explained
      variance ratio (as a percentage, 2 decimals) is assigned to the i-th
      scored flavor; with exactly one scored flavor its score is 1; otherwise
      the object is empty.
    * `flavor_principal`: the scored flavor with the highest count ('' when
      nothing was scored).

    NOTE(review): explained-variance ratios describe principal components, not
    individual flavors; attributing the i-th ratio to the i-th flavor follows
    the existing scoring scheme - confirm this is the intended score.
    """
    placeholder_names = ("", "None", "nan")

    # Read all entry ids up front so the per-entry queries below never run
    # while the entry result is still being iterated.
    entry_ids = [record["e"]["id"] for record in session.run("MATCH (e:Entry) RETURN e")]

    for entry_id in entry_ids:
        molecules = session.run("""
            MATCH (e:Entry {id: $entry_id})-[:CONTAINS]->(m:Molecule) RETURN m
        """, {"entry_id": entry_id})

        # Count how often each flavor name occurs across the entry's molecules
        flavor_counter = Counter()
        for molecule_record in molecules:
            flavor_counter.update(_flavors_from_profile(molecule_record["m"]["flavor_profile"]))

        # Keep names and vectors in two aligned lists. Scores used to be
        # assigned by position in the unfiltered key list, which shifted them
        # onto the wrong flavor whenever a name had been skipped.
        scored_flavors = [
            flavor for flavor in flavor_counter
            if flavor not in placeholder_names and flavor in model.wv
        ]
        vectors = [model.wv[flavor] for flavor in scored_flavors]

        if not vectors:
            flavor_scores = {}
            principal_flavor = ""
        elif len(vectors) > 1 and len(vectors[0]) > 1:
            # PCA allows at most min(n_samples, n_features) components
            pca = PCA(n_components=min(len(vectors), len(vectors[0])))
            pca.fit(vectors)

            flavor_scores = {
                scored_flavors[i]: round(float(ratio) * 100, 2)
                for i, ratio in enumerate(pca.explained_variance_ratio_)
            }
            flavor_scores = dict(sorted(flavor_scores.items(), key=lambda item: item[1], reverse=True))
            principal_flavor = max(flavor_scores, key=flavor_counter.get)
        else:
            # A single vector cannot be decomposed; give its flavor a score of 1
            flavor_scores = {flavor: 1 for flavor in scored_flavors}
            principal_flavor = scored_flavors[0]

        flavor_count = dict(sorted(flavor_counter.items(), key=lambda item: item[1], reverse=True))

        json_dump_scores = json.dumps(flavor_scores)
        json_dump_count = json.dumps(flavor_count)

        # Save the statistics on the Entry node
        session.run(
            "MATCH (e:Entry {id: $entry_id}) SET e.flavor_principal=$principal, e.flavor_count=$flavor_count, e.paring_scores=$flavor_scores",
            {"entry_id": entry_id, "flavor_count": json_dump_count, "flavor_scores": json_dump_scores, "principal": principal_flavor}
        )

# Run the main processing
with driver.session() as session:
    append_pca_entries(session)


In [508]:
# Close the driver
driver.close()