In [1]:
import os
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

# Register tqdm's `progress_apply` on pandas objects; the sentiment step
# later in the notebook uses it to show a progress bar.
tqdm.pandas()

## Character List

In [2]:
# Download the wiki's character list page and parse it with lxml.
# The timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() reports an HTTP error here instead of letting an
# error page flow into the table parsing of the following cells.
char_resp = requests.get(
    'https://genshin-impact.fandom.com/wiki/Characters/List',
    timeout=30
)
char_resp.raise_for_status()
char_soup = BeautifulSoup(char_resp.text, 'lxml')

In [3]:
# Find every article table in the page body, then keep only the first two
# matches for further processing.
char_tables = char_soup.select(
    '#mw-content-text > div.mw-parser-output > table.article-table'
)
char_elements = char_tables[:2]

In [4]:
# Collect one DataFrame and one list of character names per table.
df_chars = []
chars = []

for char_element in char_elements:
    # Gather one image URL per data row; the first <tr> is skipped
    # (presumably the header row, matching what read_html treats as header).
    images = []
    for entry in char_element.select('tr')[1:]:
        img = entry.select_one('td > a > img')
        # Prefer `data-src` over `src`.  A row without an icon (or without
        # either attribute) yields '' so that `images` stays aligned with the
        # table rows instead of raising AttributeError.
        img_url = ''
        if img is not None:
            img_url = img.get('data-src') or img.get('src') or ''
        # Keep only the part of the URL before '/revision'.
        images.append(img_url.split('/revision')[0])

    # read_html expects a path, URL or file-like object; passing literal HTML
    # as a string is deprecated, so wrap the markup in StringIO.
    df_char = pd.read_html(StringIO(char_element.prettify()))[0]
    df_char = df_char.drop(['Icon', 'Rarity'], axis=1)
    df_char['Image'] = images

    df_chars.append(df_char)
    chars.append(df_char['Name'].tolist())

In [5]:
# Stack the per-table frames into one frame with a fresh 0..n-1 index, and
# replace every missing value (in any column) with the string 'Others'.
df_chars = (
    pd.concat(df_chars, axis=0)
    .reset_index(drop=True)
    .fillna('Others')
)

In [6]:
# Keep only the names taken from the first character table.
# The isinstance guard makes this cell safe to re-run: once `chars` is already
# a flat list of names, indexing it again would rebind it to a single name
# string, and the voice-over loop would then iterate over its letters.
if chars and isinstance(chars[0], list):
    chars = chars[0]

## Voice-Overs

In [7]:
# Alias found in a voice-line title -> canonical character name.
# The Vision-related titles map to None.
rename_map = {
    **dict.fromkeys(['Us', 'us'], 'Traveler'),
    'Baal': 'Raiden Shogun',
    'Lady with Fox Ears': 'Yae Miko',
    'Morax': 'Zhongli',
    'The Anemo Archon': 'Venti',
    # Title that carries extra text after the character name.
    'Zhongli  A New Star Approaches': 'Zhongli',
    'Itto': 'Arataki Itto',
    **dict.fromkeys(['the Vision', 'the  Vision', 'Vision'], None),
}

# Speaker-specific overrides: the same title resolves to a different person
# depending on who is speaking.  Written per speaker, then flattened into
# (speaker, title) -> name.
_overrides_by_speaker = {
    'Albedo': {'Family': 'Rhinedottir'},
    'Klee': {'the Parents': 'Alice', 'Parents': 'Alice'},
    'Diona': {'Her Father': 'Draff'},
    'Rosaria': {'Her Father': "Rosaria's Father"},
}
rename_map_2 = {
    (speaker, title): name
    for speaker, titles in _overrides_by_speaker.items()
    for title, name in titles.items()
}

# Accumulator for the scraped voice-line records.
results = []

In [8]:
# Scrape every character's Voice-Overs page and record, for each
# "About <someone>" line, who is speaking, who it is about, and the text.

# Start from an empty list on every run so that re-executing this cell does
# not append a second copy of every voice line.
results = []

# Patterns used on every row, compiled once outside the loops.
# Removes "Friendship" and everything after it from a title.
friendship_suffix = re.compile(r' *?Friendship.+$')
# Matches "About X" / "More About X" (optionally "the X") and captures X.
about_title = re.compile(r'^(More )*About (the )*(.+)')
# Removes a leading "Media...ogg" chunk (presumably the audio-file link text).
media_prefix = re.compile(r'^Media.+ogg *')

for char in tqdm(chars):
    # The Traveler is not scraped as a speaker.
    if char == 'Traveler':
        continue

    voice_url = 'https://genshin-impact.fandom.com/wiki/{}/Voice-Overs'.format(
        char.replace(' ', '_')
    )
    # Timeout + status check: fail fast with a clear HTTP error instead of
    # hanging or parsing an error page.
    voice_resp = requests.get(voice_url, timeout=30)
    voice_resp.raise_for_status()
    voice_soup = BeautifulSoup(voice_resp.text, 'lxml')

    # The first wikitable in the page body holds the voice lines.
    voice_el = voice_soup.select_one(
        '#mw-content-text > div.mw-parser-output > table.wikitable'
    )
    if voice_el is None:
        raise ValueError('No voice-over table found at {}'.format(voice_url))

    # Literal HTML strings are deprecated as read_html input; use StringIO.
    df_voice = pd.read_html(StringIO(voice_el.prettify()))[0]

    for _, row in df_voice.iterrows():
        title = row.iloc[0]
        details = row.iloc[1]

        # Empty cells come back as NaN (a float), which the regex calls
        # below cannot handle; such rows carry no usable voice line.
        if not isinstance(title, str) or not isinstance(details, str):
            continue

        title_match = about_title.search(friendship_suffix.sub('', title))
        if not title_match:
            continue

        # Keep only the part before the first ':' as the target's name.
        target = title_match.group(3).split(':')[0].strip()
        if not target:
            continue

        text = media_prefix.sub('', details).strip()

        # Resolve aliases.  Titles mapped to None are skipped here explicitly;
        # previously they were stored with a None target and only disappeared
        # because the later groupby drops missing keys.
        target = rename_map.get(target, target)
        if target is None:
            continue
        # Speaker-specific override, if one exists for this (speaker, title).
        target = rename_map_2.get((char, target), target)

        results.append({
            'Source': char,
            'Target': target,
            'Text': text
        })

  0%|          | 0/49 [00:00<?, ?it/s]

In [9]:
# One row per collected voice line.  The columns are named explicitly so the
# frame keeps the Source/Target/Text schema even when `results` is empty;
# without them an empty list gives a frame with no columns and the groupby on
# Source/Target in the next cell fails with a KeyError.
df_voices = pd.DataFrame(results, columns=['Source', 'Target', 'Text'])

In [10]:
# Merge all lines one speaker has about one target into a single
# newline-separated text, giving one row per (Source, Target) pair.
voices_by_pair = df_voices.groupby(['Source', 'Target'])
df_text = voices_by_pair.agg('\n'.join).reset_index()

In [11]:
# Show the aggregated frame (one row per Source/Target pair) as the cell output.
df_text

Unnamed: 0,Source,Target,Text
0,Albedo,Albedo,Genius? ...A number of people call me that. Bu...
1,Albedo,Barbara,Barbara? She's a very earnest Deaconess. I onc...
2,Albedo,Jean,Jean? She has served exceptionally well as Act...
3,Albedo,Klee,"Indeed, I view Klee as my younger sister. When..."
4,Albedo,Lisa,Lisa? I always am impressed by the valuable pe...
...,...,...,...
521,Zhongli,Traveler,Gold is Liyue's treasure. It is the blood that...
522,Zhongli,Venti,Tsk... It reeks of wine. That bard has just be...
523,Zhongli,Xiao,You speak of the young adeptus of Guili Plain...
524,Zhongli,Yanfei,Despite never having signed a contract with me...


## Sentiment Analysis

In [12]:
from flair.models import TextClassifier
from flair.data import Sentence

# Load flair's 'en-sentiment' text classifier once; predict_sentiment below
# reuses this instance for every row.
sia = TextClassifier.load('en-sentiment')

2022-06-13 22:42:23,558 loading file /Users/miraclexyz/.flair/models/sentiment-en-mix-distillbert_4.pt


In [13]:
def predict_sentiment(text):
    """Classify the sentiment of one row's voice-over text.

    Args:
        text: A row (pandas Series) whose 'Text' entry holds the string to
            classify.

    Returns:
        A ``(value, score)`` tuple taken from the first label the classifier
        attaches to the sentence.
    """
    flair_sentence = Sentence(text.loc['Text'])
    # predict() annotates the sentence in place; the result is read from it.
    sia.predict(flair_sentence)
    top_label = flair_sentence.labels[0]
    return (top_label.value, top_label.score)

In [14]:
# Run the classifier on every row (with a progress bar) and store the
# returned (value, score) pairs as the two new columns 'Value' and 'Score'.
sentiment_parts = df_text[['Text']].progress_apply(
    predict_sentiment, axis=1, result_type='expand'
)
df_text[['Value', 'Score']] = sentiment_parts

  0%|          | 0/526 [00:00<?, ?it/s]

In [15]:
# Fold label and score into one signed value (in [-1, 1], assuming Score is a
# confidence in [0, 1]):
#   'POSITIVE' with score s  ->  2*s - 1
#   any other label with s   ->  2*(1 - s) - 1
# Computed with vectorized column arithmetic instead of a row-wise apply.
score = df_text['Score'].astype(float)
positive_prob = score.where(df_text['Value'] == 'POSITIVE', 1 - score)
df_text['Sentiment'] = 2 * positive_prob - 1

In [16]:
# Create the output directory if it is missing.  exist_ok=True replaces the
# separate exists-check, which could race with another process creating it.
os.makedirs('data', exist_ok=True)

# Save both frames as CSV (portable) and as pickle (keeps pandas dtypes).
df_chars.to_csv('./data/characters.csv')
df_chars.to_pickle('./data/characters.pickle')
df_text.to_csv('./data/voice_text.csv')
df_text.to_pickle('./data/voice_text.pickle')