In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the character table (Name / Element / Weapon / Region / Image columns,
# as shown in the preview output of this cell).
# NOTE(review): the 'Unnamed: 0' column in the preview suggests the frame was
# round-tripped through a CSV that included its index - confirm upstream.
# NOTE(review): read_pickle can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
df_chars = pd.read_pickle('./data/characters.pickle')
df_chars.head()

Unnamed: 0,Name,Element,Weapon,Region,Image
0,Albedo,Geo,Sword,Mondstadt,https://static.wikia.nocookie.net/gensin-impac...
1,Aloy,Cryo,Bow,Others,https://static.wikia.nocookie.net/gensin-impac...
2,Amber,Pyro,Bow,Mondstadt,https://static.wikia.nocookie.net/gensin-impac...
3,Arataki Itto,Geo,Claymore,Inazuma,https://static.wikia.nocookie.net/gensin-impac...
4,Barbara,Hydro,Catalyst,Mondstadt,https://static.wikia.nocookie.net/gensin-impac...


In [3]:
# Download the "Spices From the West" wiki page and parse it into `soup`.
url = 'https://genshin-impact.fandom.com/wiki/Spices_From_the_West'
# Without a timeout requests waits indefinitely, so a stalled connection would
# hang a Restart & Run All forever.
response = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx answer instead of silently parsing an error page and
# failing later with a confusing selector miss.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

In [4]:
# Locate the column-layout container whose <li> items are iterated in the next cell.
dishes = soup.select_one('#mw-content-text > div.mw-parser-output > div.columntemplate')
# select_one returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError on None further down.
if dishes is None:
    raise ValueError('dish list (div.columntemplate) not found - the page layout may have changed')

In [5]:
# Build one record (name, absolute wiki link, image URL) per dish list item.
data_dishes = []
for dish in dishes.select('li'):
    el_anchor = dish.select_one('span > a')
    el_img = el_anchor.select_one('img') if el_anchor is not None else None
    if el_img is None:
        # List item without a linked thumbnail: not a dish card, nothing to record.
        continue

    # `.get` returns None for a missing attribute, which used to crash on
    # `.split`. Images that are not lazy-loaded may carry only `src`, so fall
    # back to it (and to '' as a last resort).
    image_url = el_img.get('data-src') or el_img.get('src') or ''

    data_dish = {
        'name': el_anchor.get('title'),
        'link': urljoin('https://genshin-impact.fandom.com/', el_anchor.get('href')),
        # Keep only the part of the URL before the '/revision' suffix.
        'image': image_url.split('/revision')[0]
    }
    data_dishes.append(data_dish)

In [6]:
# Tabulate the scraped dish records and persist them as both CSV and pickle.
df_dishes = pd.DataFrame(data_dishes)

# to_csv writes the index as an unnamed first column by default (index=True).
df_dishes.to_csv('./data/spices_dishes.csv')
df_dishes.to_pickle('./data/spices_dishes.pickle')

In [7]:
# Locate the table whose rows are parsed into character preferences in the next cell.
pref = soup.select_one('#mw-content-text > div.mw-parser-output > table.fandom-table')
# select_one returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError on None further down.
if pref is None:
    raise ValueError('preference table (table.fandom-table) not found - the page layout may have changed')

In [8]:
# One record per table row: the character title plus the titles of every dish
# linked from the second (likes) and third (dislikes) cells.
data_prefs = []
rows = pref.select('tr')
for row in rows[1:]:  # first row is skipped (presumably the header row)
    cells = row.select('td')

    char_link = cells[0].select_one('div.card_image > a')
    liked = [anchor.get('title') for anchor in cells[1].select('a')]
    disliked = [anchor.get('title') for anchor in cells[2].select('a')]

    data_prefs.append({
        'name': char_link.get('title'),
        'likes': liked,
        'dislikes': disliked,
    })

In [9]:
# Flatten the per-character preferences into long-format records:
# score +1 for every liked dish, then -1 for every disliked dish.
data_scores = []
for entry in data_prefs:
    char_name = entry['name']
    for key, score in (('likes', 1), ('dislikes', -1)):
        data_scores.extend(
            {'name': char_name, 'dish': dish_name, 'score': score}
            for dish_name in entry[key]
        )
    

In [10]:
# Tabulate the (name, dish, score) records and persist them as both CSV and pickle.
df_scores = pd.DataFrame(data_scores)

# to_csv writes the index as an unnamed first column by default (index=True).
df_scores.to_csv('./data/spices_scores.csv')
df_scores.to_pickle('./data/spices_scores.pickle')

In [11]:
# Character x dish matrix: one row per character, one column per dish, each
# cell the mean score of that pair, with 0 for pairs that never occur.
mean_scores = df_scores.groupby(['name', 'dish'])['score'].mean()
# astype(int) truncates any non-integer mean toward zero.
df_vectors = mean_scores.unstack().fillna(0).astype(int)

In [12]:
# Pairwise cosine similarity between characters (the rows of df_vectors).
char_labels = df_vectors.index
sim_chars = pd.DataFrame(
    cosine_similarity(df_vectors),
    # Two independent copies: naming one axis below must not rename the other
    # (or df_vectors' own index).
    index=char_labels.copy(),
    columns=char_labels.copy(),
)
sim_chars.index.name = 'Source'
sim_chars.columns.name = 'Target'

# Long format: one (Source, Target, Similarity) row per pair, keeping only the
# pairs whose similarity is not numerically zero.
df_sim_chars = sim_chars.stack().rename('Similarity').reset_index()
is_nonzero = ~np.isclose(df_sim_chars['Similarity'], 0)
df_sim_chars = df_sim_chars.loc[is_nonzero].reset_index(drop=True)

df_sim_chars.to_csv('./data/spices_sim_chars.csv')
df_sim_chars.to_pickle('./data/spices_sim_chars.pickle')

In [13]:
# Pairwise cosine similarity between dishes (the columns of df_vectors, hence
# the transpose).
dish_labels = df_vectors.columns
sim_dishes = pd.DataFrame(
    cosine_similarity(df_vectors.T),
    # Two independent copies: naming one axis below must not rename the other
    # (or df_vectors' own columns).
    index=dish_labels.copy(),
    columns=dish_labels.copy(),
)
sim_dishes.index.name = 'Source'
sim_dishes.columns.name = 'Target'

# Long format: one (Source, Target, Similarity) row per pair, keeping only the
# pairs whose similarity is not numerically zero.
df_sim_dishes = sim_dishes.stack().rename('Similarity').reset_index()
is_nonzero = ~np.isclose(df_sim_dishes['Similarity'], 0)
df_sim_dishes = df_sim_dishes.loc[is_nonzero].reset_index(drop=True)

df_sim_dishes.to_csv('./data/spices_sim_dishes.csv')
df_sim_dishes.to_pickle('./data/spices_sim_dishes.pickle')

In [14]:
# DISABLED EXPERIMENT (nothing in this cell executes): standardise df_vectors,
# embed it in 2-D with t-SNE, cluster it with AffinityPropagation, and merge the
# cluster label into df_chars as a 'Group' column. Kept for reference only.
# TODO(review): move to its own notebook or delete once no longer needed.
# from sklearn import preprocessing, manifold, cluster
# from sklearn.pipeline import make_pipeline
# pipe = make_pipeline(
#     preprocessing.StandardScaler(),
#     manifold.TSNE(
#         n_components=2, 
#         init='pca', 
#         learning_rate='auto',
#         random_state=42
#     )
# )
# embed_chars_array = pipe.fit_transform(df_vectors)
# df_vectors_std = preprocessing.StandardScaler().fit_transform(df_vectors)
# ap = cluster.AffinityPropagation(
#     random_state=42
# ).fit(df_vectors_std)
# from sklearn.cluster import AgglomerativeClustering
# AgglomerativeClustering().fit(df_vectors).labels_
# df_chars = df_chars.merge(
#     pd.DataFrame(
#         pd.Series(ap.labels_, index=df_vectors.index, name='Group')
#     ),
#     left_on='Name',
#     right_index=True
# )
# embed_chars = pd.DataFrame(
#     embed_chars_array,
#     index=df_vectors.index,
#     columns=['x', 'y']
# )

In [15]:
# DISABLED EXPERIMENT (nothing in this cell executes): draw the characters as a
# pyvis network positioned at their embedding coordinates.
# NOTE(review): if re-enabled this needs `embed_chars` and the 'Group' column
# produced by the disabled clustering cell, plus `platform` and `os`, neither
# of which appears in the notebook's import cell.
# from pyvis.network import Network
# def get_node_options(char):
#     entries = df_chars[df_chars["Name"] == char]
#     if len(entries) > 0:
#         entry = entries.iloc[0]

#         node_options = {
#             "size": 40,
#             "shape": "circularImage",
#             "image": entry["Image"],
#             "group": int(entry['Group']),
#         }

#         return node_options
#     else:
#         return {
#             "size": 40,
#             "shape": "circularImage",
#             "image": f"https://ui-avatars.com/api/?rounded=true&bold=true&size=512&format=png&name={char}",
#         }
# if platform.processor():  # local
#     path = "./public"
#     if not os.path.exists(path):
#         os.mkdir(path)
# else:  # cloud
#     path = "/tmp"
# net = Network(height="600px", width="100%", directed=False)
# for idx, row in embed_chars.iterrows():
#     net.add_node(
#         idx, 
#         x=float(row['x']), y=float(row['y']), 
#         label=idx, 
#         **get_node_options(idx)
#     )
# net.toggle_drag_nodes(False)
# net.show(f"{path}/food_preferences.html")