In [64]:
from neo4j import GraphDatabase
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import itertools
import numpy as np
from gensim.models import Word2Vec

In [65]:
# DB接続処理
# Neo4jに接続
uri = "bolt://localhost:7687"
user = "neo4j"
password = "abcd7890"

# ドライバを作成
driver = GraphDatabase.driver(uri, auth=(user, password))

In [66]:
# データ読み込み
molecule_db = pd.read_csv('../datas/molecules.csv', names=['id', 'none', 'pubchem_id', 'common_name', 'flavor_profile'], header=0)
flavor_db = pd.read_csv('../datas/flavor_db.csv', names=['id', 'entry_id', 'alias', 'synonyms', 'scientific_name', 'category', 'molecules'], header=0)

flavor_db.drop(['id'], axis=1, inplace=True)
flavor_db.tail()

Unnamed: 0,Unnamed: 1,entry_id,alias,synonyms,scientific_name,category,molecules
935,935.0,0,egg,{'Egg'},Chicken,Animal Product,"{6274, 5311110, 644104, 9609, 18827, 527, 1960..."
936,936.0,979,olive oil,{''},Olea europaea L.,additive,"{6184, 31260, 5281168, 8103}"
937,,980,Baking powder,,,,"{11265, 62465, 644104, 12297, 31242, 527, 4114..."
938,,981,Baking soda,,,,{10340}
939,,982,alum,,,,{24856}


In [67]:
molecule_db.tail(20)

Unnamed: 0,id,none,pubchem_id,common_name,flavor_profile
1771,1771,1771.0,23676745,Potassium Sorbate,{'odorless'}
1772,1772,1772.0,24832101,"Santalol, alpha- and beta-","{'sandalwood', 'sweet', 'woody', 'deep'}"
1773,1773,1773.0,25021769,"2-Propen-1-one, 3-(4-hydroxyphenyl)-1-phenyl-",{'bitter'}
1774,1774,1774.0,44229138,(RS)-norcoclaurinium,"{'milky', 'sweet', 'fruity'}"
1775,1775,1775.0,46779070,S-Methyl 4-methylpentanethioate,{''}
1776,1776,1776.0,53425122,1-(Ethyltrisulfanyl)propane,"{'onion', 'alliaceous', 'green', 'garlic'}"
1777,1777,1777.0,53472027,D-Isoleucine Methyl Ester Hydrochloride,{''}
1778,1778,1778.0,54670067,l-ascorbic acid,{''}
1779,1779,1779.0,57346909,"4H-Pyrrolo[2,1-d]-1,3,5-dithiazine,tetrahydro-...",{''}
1780,1780,1780.0,57357963,33368-82-0,"{'sulfurous', 'alliaceous'}"


In [68]:
flavor_db.tail(2)

Unnamed: 0,Unnamed: 1,entry_id,alias,synonyms,scientific_name,category,molecules
938,,981,Baking soda,,,,{10340}
939,,982,alum,,,,{24856}


In [69]:
# kmeans分析のためにベクトル化したflavor_profileを作成してEntryに保存する
# Make all list of all flavors
all_flavors = molecule_db['flavor_profile'].apply(lambda x: x.replace("'", "").strip('{}').split(', ')).tolist()

# Word2Vecモデルを学習
model = Word2Vec(sentences=all_flavors, vector_size=100, window=5, min_count=1, sg=1)

# フレーバーのベクトルを確認
green_vector = model.wv['green']
print(green_vector)

[-0.11024731  0.08865121  0.00104234  0.05108663  0.02458184 -0.19736837
  0.12629808  0.31930017 -0.17324746  0.08396756 -0.03321015 -0.20425396
  0.00152838  0.03489222 -0.00610784 -0.09907925 -0.01383009 -0.19631614
 -0.11185078 -0.3518713   0.03383446  0.06877746  0.05494044 -0.08592452
 -0.11032649  0.02722137 -0.06767049 -0.07568916 -0.20336595 -0.00994436
  0.16688164 -0.05757504  0.08610387 -0.08225907 -0.15717286  0.14035916
  0.06974806 -0.12797907  0.03121371 -0.22622727  0.01563809 -0.20458741
 -0.07907134  0.06918976  0.1550309  -0.00907886 -0.16905282 -0.05399366
  0.2152504   0.08323916  0.05968301 -0.1051513   0.06531842  0.00069853
 -0.06990708  0.04625301  0.10835875 -0.05259502 -0.20486398  0.05590883
  0.04419304  0.02348708 -0.00402463 -0.11754941 -0.14710538  0.1499636
  0.05159127  0.25151354 -0.2215172   0.27388817 -0.0687238   0.00813129
  0.13377978 -0.04101782  0.1346872   0.06986355  0.05916374 -0.05064672
 -0.12126765 -0.00447821 -0.1270708  -0.05232897 -0.

In [70]:
# Make all list of all molecules

## Molecule ノードが存在するか確認し、存在すれば削除する関数
def initialize_molecules(tx):
    # Molecule ノードの存在を確認
    tx.run("MATCH (e:Entry)-[r:CONTAINS]->(m:Molecule) DELETE r;")
    tx.run("""
        MATCH (m:Molecule)
        DETACH DELETE m;
    """)        

## Moleculeに値を投入する関数
def insert_molecules(tx, molecule_data):
    # flavor_profileがリスト形式の場合、直接使用
    flavor_str = molecule_data['flavor_profile']
    flavors = flavor_str.replace("'", "").strip('{}').split(', ')
      
    if flavors != ['']:
        flavor_vector = np.mean([model.wv[flavor] for flavor in flavors if flavor in model.wv], axis=0)
    else:
        flavor_vector = []
        
    tx.run("""
    CREATE (m:Molecule {
        id: $id,
        pubchem_id: $pubchem_id,
        common_name: $common_name,
        flavor_profile: $flavor_profile,
        flavor_vector: $flavor_vector
    })
    """, 
    id=int(molecule_data['id']), 
    pubchem_id=int(molecule_data['pubchem_id']),
    common_name=molecule_data['common_name'],
    flavor_vector=flavor_vector,
    flavor_profile=flavors)

def append_molecules_index(tx):
    # flavor_vectorのNULLを空リストに変換
    tx.run("""MATCH (m:Molecule)
        WHERE m.flavor_vector IS NULL
        SET m.flavor_vector = []
        RETURN m;
        """)
    # idに対して一意制約を追加（構文修正済み）
    tx.run("CREATE CONSTRAINT IF NOT EXISTS FOR (m:Molecule) REQUIRE m.id IS UNIQUE")
    
    # pubchem_idにインデックスを作成（構文修正済み）
    tx.run("CREATE INDEX IF NOT EXISTS FOR (m:Molecule) ON (m.pubchem_id)")
    
    # flavor_profileにインデックスを作成（構文修正済み）
    tx.run("CREATE INDEX IF NOT EXISTS FOR (m:Molecule) ON (m.flavor_profile)")

# Moleculesのデータを挿入
with driver.session() as session:
    session.execute_write(initialize_molecules)
    for i, molecule in molecule_db.iterrows():
        session.execute_write(insert_molecules, molecule)
    session.execute_write(append_molecules_index)


Forbidden: {code: Neo.ClientError.Transaction.ForbiddenDueToTransactionType} {message: Tried to execute Schema modification after executing Write query}

In [54]:
# Make all list of all entries

## Molecule ノードが存在するか確認し、存在すれば削除する関数
def initialize_entry(tx):
    # Molecule ノードの存在を確認
    result = tx.run("MATCH (m:Entry) RETURN m LIMIT 1")
    if result.single():
        tx.run("MATCH (m:Entry) DETACH DELETE m")
    
## Moleculeに値を投入する関数
def insert_entry(tx, entry_data):
    molecules_str = str(entry_data['molecules'])
    molecules = molecules_str.replace("'", "").strip('{}').split(', ')
    molecules = [s for s in molecules if s.strip()]
	
    synonyms_str = str(entry_data['synonyms'])
    synonyms = synonyms_str.replace("'", "").strip('{}').split(', ')
    search_query =  ' '.join(synonyms) + ' ' + str(entry_data['scientific_name']) + ' ' + str(entry_data['category'])
 
    tx.run("""
    CREATE (e:Entry {
        id: $entry_id,
        name: $alias,
        synonyms: $synonyms,
        scientific_name: $scientific_name,
        category: $category,
        search_query: $search_query
    })
    """, 
    entry_id=int(entry_data['entry_id']),
    alias=entry_data['alias'],
    synonyms=entry_data['synonyms'],
    scientific_name=entry_data['scientific_name'],
    category=entry_data['category'],
    search_query=search_query)
    
    for molecule_id in molecules:
        tx.run("""
        MATCH (m:Molecule {id: $molecule_id})
        MATCH (e:Entry {id: $entry_id})
        MERGE (e)-[:CONTAINS]->(m)
        """, 
        entry_id=int(entry_data['entry_id']),
        molecule_id=int(molecule_id))

def append_entry_index(tx):
    # idに対して一意制約を追加（構文修正済み）
    tx.run("CREATE CONSTRAINT IF NOT EXISTS FOR (m:Entry) REQUIRE m.id IS UNIQUE")
    
    # moleculesにインデックスを作成（構文修正済み）
    tx.run("CREATE INDEX IF NOT EXISTS FOR (m:Entry) ON (m.molecules)")

    # moleculesにインデックスを作成（構文修正済み）
    tx.run("DROP INDEX my_text_index IF EXISTS")
    # 新しいインデックスを作成
    tx.run("CREATE FULLTEXT INDEX my_text_index FOR (n:Entry) ON EACH [n.search_query]")
    
# Moleculesのデータを挿入
with driver.session() as session:
    session.execute_write(initialize_entry)
    for i, molecule in flavor_db.iterrows():
        session.execute_write(insert_entry, molecule)
    session.execute_write(append_entry_index)


In [55]:
# ドライバをクローズ
driver.close()