In [1]:
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from io import StringIO
from transformers import pipeline
from torch.utils.data import Dataset

tqdm.pandas()

## Character List


In [2]:
char_resp = requests.get(
    'https://genshin-impact.fandom.com/wiki/Characters/List')
char_soup = BeautifulSoup(char_resp.text, 'lxml')

If some characters are not showing up, purge the page by navigating to: https://genshin-impact.fandom.com/wiki/Characters/List?action=purge


In [3]:
char_elements = char_soup.select(
    '#mw-content-text > div.mw-parser-output > table.article-table')[:2]

In [4]:
df_chars = []
chars = []

for char_element in char_elements:
    images = []
    for entry in char_element.select('tr')[1:]:
        img = entry.select_one('td > a > img')
        img_url = img.get('data-src', '') or img.get('src')
        images.append(img_url.split('/revision')[0])
    df_char = pd.read_html(StringIO(char_element.prettify()))[0]
    df_char = df_char.drop(['Icon', 'Quality'], axis=1)
    df_char['Image'] = images

    df_chars.append(df_char)
    chars.append(df_char['Name'].tolist())

In [5]:
df_chars = pd.concat(df_chars, axis=0).reset_index(drop=True)
df_chars = df_chars.fillna('Others')

In [6]:
chars = chars[0] + chars[1]

## Voice-Overs


In [7]:
rename_map = {
    'Us': 'Traveler',
    'us': 'Traveler',
    'Baal': 'Raiden Shogun',
    'Lady with Fox Ears': 'Yae Miko',
    'Morax': 'Zhongli',
    'Rex Lapis': 'Zhongli',
    'The Anemo Archon': 'Venti',
    'the Anemo Archon': 'Venti',
    'Lesser Lord Kusanali': 'Nahida',
    'Zhongli  A New Star Approaches': 'Zhongli',  # wtf
    'Itto': 'Arataki Itto',
    'Collei Herself': 'Collei',
    'Buer': 'Nahida',
    'Focalors': 'Furina',
    'Anemo Archon': 'Venti',
    'Geo Archon': 'Zhongli',
    'Electro Archon': 'Raiden Shogun',
    'Dendro Archon': 'Nahida',
    'Cloud Retainer': 'Xianyun',
    'the Vision': None,
    'the  Vision': None,
    'Vision': None,
    'Kunikuzushi': 'Wanderer',  # Yae Miko

    # Fatui Harbingers
    'The Jester': 'Pierro',
    'The Captain': 'Il Capitano',
    'The Doctor': 'Il Dottore',
    'Damselette': 'Columbina',
    'The Knave': 'Arlecchino',
    'The Rooster': 'Pulcinella',
    'Marionette': 'Sandrone',
    'Regrator': 'Pantalone',
    'The Fair Lady': 'La Signora',
    'The Balladeer': 'Wanderer',
    'The Balladeer [Note 1]': 'Wanderer',
    'The Balladeer  [Note 1]': 'Wanderer',
    'Childe': 'Tartaglia',
    '(Wanderer)': 'Wanderer'
}

rename_map_2 = {
    ('Albedo', 'Family'): 'Rhinedottir',
    ('Klee', 'the Parents'): 'Alice',
    ('Klee', 'Parents'): 'Alice',
    ('Diona', 'Her Father'): 'Draff',
    ('Rosaria', 'Her Father'): "Rosaria's Father",
    ('Lyney', '"Father"'): 'Arlecchino',
    ('Lynette', '"Father"'): 'Arlecchino',
    ('Freminet', '"Father"'): 'Arlecchino',
}

rename_map_3 = {
    ('Nahida', 'The Seven'): ['Venti', 'Zhongli', 'Raiden Shogun', 'Furina'],
    ('Neuvillette', 'Lyney and Lynette'): ['Lyney', 'Lynette'],
    ('Neuvillette', 'Lyney  and  Lynette'): ['Lyney', 'Lynette'],
    ('Furina', 'Lyney and Lynette'): ['Lyney', 'Lynette'],
    ('Furina', 'Lyney  and  Lynette'): ['Lyney', 'Lynette'],
    ('Navia', 'Lyney and Lynette'): ['Lyney', 'Lynette'],
    ('Navia', 'Lyney  and  Lynette'): ['Lyney', 'Lynette'],
}

results = []

In [8]:
for char in tqdm(chars):
    if char == 'Traveler':
        continue

    voice_url = 'https://genshin-impact.fandom.com/wiki/{}/Voice-Overs'.format(
        char.replace(' ', '_'))

    retry_count = 0
    available = False
    while not available and retry_count < 10:
        try:
            voice_resp = requests.get(voice_url)
            available = True
        except Exception as e:
            print(e)
            time.sleep(5)
            retry_count += 1

    voice_soup = BeautifulSoup(voice_resp.text, 'lxml')
    try:
        voice_el = voice_soup.select(
            '#mw-content-text > div.mw-parser-output > table.wikitable')[0]
        df_voice = pd.read_html(StringIO(voice_el.prettify()))[0].fillna('')
    except Exception as e:
        print(char, e)
        continue

    for idx, row in df_voice.iterrows():
        df_voice.columns = ['Title', 'Details']
        # print(df_voice)

        title = row.iloc[0]
        details = row.iloc[1]

        # add "Hello" voice lines
        if title == 'Hello':
            title = 'About Us'

        title_1 = re.sub(r' *?Friendship.+$', '', title)

        title_2 = re.search(r'^(More )*About (the )*(.+)', title_1)

        if not title_2:
            continue

        title_3 = title_2.groups()[2].split(':')[0].strip()

        target = title_3

        if (not target) or (target == '[[]]'):
            continue

        if pd.isnull(details):
            continue

        details_1 = re.sub(r'^Media.+ogg *', '', details)
        text = details_1.strip()

        target = rename_map.get(target, target)
        if (char, target) in rename_map_2:
            target = rename_map_2[(char, target)]

        if (char, target) in rename_map_3:
            targets = rename_map_3[(char, target)]
            for each_target in targets:
                results.append({
                    'Source': char,
                    'Target': each_target,
                    'Text': text
                })
        else:
            results.append({'Source': char, 'Target': target, 'Text': text})

  0%|          | 0/80 [00:00<?, ?it/s]

Chevreuse list index out of range
Dainsleif list index out of range
Gaming list index out of range
Xianyun list index out of range


In [9]:
df_voices = pd.DataFrame(results)

In [10]:
df_intro = pd.read_csv('./data/intro_text.csv')
df_voices = pd.concat([df_intro, df_voices], axis=0).reset_index(drop=True)
df_voices.head()

Unnamed: 0,Source,Target,Text
0,A Merchant Whose Caravan Was Safely Escorted b...,Gaming,"He sees and hears everything, as if he had the..."
1,Madame Ping,Xianyun,"A new resident in the city, you say? Oh, it's..."
2,Grosrochard,Chevreuse,"...Captain Chevreuse, once again I implore you..."
3,Clorinde,Navia,"When we were young, we used to play a kind of ..."
4,Chevreuse,Charlotte,"...Euphrasie, three days ago, one of your jour..."


In [11]:
df_text = df_voices.groupby(['Source', 'Target']).agg('\n'.join).reset_index()
df_text = df_text[df_text['Text'].str.strip() != ''].reset_index(drop=True)

## Sentiment Analysis


In [12]:
# TODO: use sliding-window approach for long text (>512 tokens)
classifier = pipeline("sentiment-analysis", truncation=True, device="mps")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [13]:
class ListDataset(Dataset):

    def __init__(self, original_list):
        self.original_list = original_list

    def __len__(self):
        return len(self.original_list)

    def __getitem__(self, i):
        return self.original_list[i]


text_dataset = ListDataset(df_text['Text'].tolist())

In [14]:
sentiment_results = [text for text in tqdm(classifier(text_dataset))]
df_text['Label'] = [x['label'] for x in sentiment_results]
df_text['Score'] = [x['score'] for x in sentiment_results]

  0%|          | 0/885 [00:00<?, ?it/s]

In [15]:
df_text['Sentiment'] = df_text[['Label', 'Score']].apply(
    lambda s: s.Score if s.Label == 'POSITIVE' else 1 - s.Score, axis=1)
df_text['Sentiment'] = 2 * df_text['Sentiment'] - 1

In [16]:
if not os.path.exists('data'):
    os.mkdir('data')

df_chars.to_csv('./data/characters.csv')
df_chars.to_pickle('./data/characters.pickle')
df_text.to_csv('./data/voice_text.csv')
df_text.to_pickle('./data/voice_text.pickle')