In [17]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import itertools

In [18]:
molecules = pd.read_csv('../datas/molecules.csv', names=['id', 'none', 'pubchem_id', 'common_name', 'flavor_profile'], header=0)
flavor_db = pd.read_csv('../datas/flavordb.csv', names=['id', 'entry_id', 'alias', 'synonyms', 'scientific_name', 'category', 'molecules'], header=0)
flavor_db.drop(['id'], axis=1, inplace=True)

In [19]:
# Make all list of all flavors
all_flavors = molecules['flavor_profile'].apply(lambda x: x.replace("'", "").strip('{}').split(', ')).tolist()
unique_flavors = set([flavor for flavors in all_flavors for flavor in flavors])
print(f"{len(unique_flavors)}種類")
print(unique_flavors)

599種類
{'', 'gardenia', 'ambergris', 'alkane', 'cedarleaf', 'gassy', 'rose dried', 'cereal', 'galbanum', 'wax', 'shrimp', 'lemongrass', 'fat', 'cognac', 'coumarin', 'black currant', 'nutty', 'sassafrass', 'chocolate', 'lily', 'thyme', 'slightly waxy', 'celery', 'cotton', 'cassia', 'lavender', 'camomile', 'banana peel', 'green tea', 'cantaloupe', 'fermented', 'civet', 'floral', 'anisic', 'apricot', 'coriander', 'curry', 'acidic', 'jasmine', 'paper', 'cooked', 'bean', 'wet', 'wood', 'terpineol', 'naphthelene', 'red rose', 'mango', 'weak spice', 'chamomile', 'jam', 'kiwi', 'ether', 'plastic', 'vanillin', 'sappy', 'pepper', 'creamy', 'chicken', 'fresh air', 'damascone', 'repulsive', 'hazelnut', 'watercress', 'buttery', 'nut skin', 'alcoholic', 'pulpy', 'wasabi', 'amine', 'paint', 'nutmeg', 'mossy', 'roasted', 'cat-urine', 'acetic', 'fennel', 'rose flower', 'sulfur', 'cadaverous', 'mothball', 'pungent', 'dirty', 'sausage', 'leafy', 'peppery', 'indole', 'saffron', 'smoked', 'powerful', 'winey

In [20]:
molecules.head(3)

Unnamed: 0,id,none,pubchem_id,common_name,flavor_profile
0,0,0.0,4,1-Aminopropan-2-ol,{'fishy'}
1,1,1.0,49,3-Methyl-2-oxobutanoic acid,{'fruity'}
2,2,2.0,58,2-oxobutanoic acid,"{'sweet', 'creamy', 'caramel', 'lactonic', 'br..."


In [21]:
# Make all list of all categories 
unique_categories = set([cate for cate in flavor_db['category']])
print(f"{len(unique_categories)}種類")
print(unique_categories)

34種類
{'dish', 'fish', 'beverage caffeinated', 'vegetable root', 'nut', 'plant derivative', 'bakery', 'herb', 'cereal', 'fruit-berry', 'fungus', 'spice', 'beverage', 'flower', 'meat', 'additive', 'fruit essence', 'vegetable', 'plant', 'beverage alcoholic', 'seafood', 'vegetable fruit', 'vegetable tuber', 'cabbage', 'legume', 'seed', 'berry', 'dairy', 'vegetable stem', 'fruit citrus', 'maize', 'essential oil', 'fruit', 'gourd'}


In [22]:
flavor_db.head(3)

Unnamed: 0,entry_id,alias,synonyms,scientific_name,category,molecules
0,1,bakery products,{'bakery products'},poacceae,bakery,"{27457, 7976, 31252, 26808, 22201, 26331}"
1,2,bread,{'bread'},poacceae,bakery,"{1031, 1032, 644104, 527, 8723, 31260, 15394, ..."
2,3,rye bread,{'rye bread'},rye,bakery,"{644104, 7824, 643731, 8468, 1049, 5372954, 80..."


In [23]:
# Create a matrix of ingredients and molecules
matrix_df = pd.DataFrame(index=flavor_db.index)

# 各行の pubchem_id を保存するリスト
columns_to_add = {}

for index, row in flavor_db.iterrows():
    pubchem_ids = row['molecules'].replace("{", "").replace("}", "").split(", ")
    for pubchem_id in pubchem_ids:
        if pubchem_id not in columns_to_add:
            columns_to_add[pubchem_id] = [0] * len(flavor_db)
        columns_to_add[pubchem_id][index] = 1

# 収集したデータを DataFrame に変換して concat で結合
new_columns_df = pd.DataFrame(columns_to_add)
matrix_df = pd.concat([matrix_df, new_columns_df], axis=1)

matrix_df.fillna(0, inplace=True)  # 空欄を0で埋める
matrix_df

Unnamed: 0,27457,7976,31252,26808,22201,26331,1031,1032,644104,527,...,5281611,7768,16296,7601,21606187,73467,115221,65243,161120,5318557
0,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
930,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
931,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
932,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
933,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# 共起関係を計算
co_occurrence = matrix_df.T.dot(matrix_df)

# ネットワークの作成
G = nx.Graph()

# ノードとエッジを追加
for ingredient in co_occurrence.columns:
    G.add_node(ingredient)

for i, j in itertools.combinations(co_occurrence.columns, 2):
    weight = co_occurrence.at[i, j]
    if weight > 0:  # 共起頻度が0より大きい場合のみエッジを追加
        G.add_edge(i, j, weight=weight)

In [25]:
# ネットワークの描画
pos = nx.spring_layout(G)
nx.draw(G, pos, with_labels=True, font_weight='bold')

# エッジのラベル（重み）を表示
labels = nx.get_edge_attributes(G, 'weight')
nx.draw_networkx_edge_labels(G, pos, edge_labels=labels)

#plt.show()

KeyboardInterrupt: 