In [2]:
from datasets import load_dataset
import soundfile as sf
import os
from tqdm import tqdm
from collections import defaultdict


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Stream the dataset instead of downloading it in full (the whole dataset is ~200GB).
# With streaming=True, rows are fetched on demand as the dataset is iterated.
dataset = load_dataset(
    'simon3000/genshin-voice',
    split='train',
    streaming=True,
)

In [None]:
# Filter dataset to relevant rows
def filter_dataset(dataset):
    """Lazily yield the rows that are English(US) voice lines and not NPC lines.

    A row is kept only when BOTH conditions hold:
      * its 'inGameFilename' does not contain 'vo_npc' (compared case-insensitively)
      * its 'language' is exactly "English(US)"

    Args:
        dataset: an iterable of dict-like rows with 'inGameFilename' and 'language' keys.

    Yields:
        The matching rows, unchanged and in their original order.
    """
    for row in dataset:
        # NPC voice lines are identified by their in-game file name
        if 'vo_npc' in row['inGameFilename'].lower():
            continue
        # Everything that is not the English(US) dub is dropped
        if row['language'] != "English(US)":
            continue
        yield row

# filter_dataset is a generator function, so no rows are read from `dataset` until this is iterated
filtered_dataset = filter_dataset(dataset)

In [None]:
# Maximum number of wav files to keep for each character
MAX_FILES_PER_CHARACTER = 100

# Number of wav files written so far, keyed by speaker name
character_counts = defaultdict(int)

# Goes through each row of the filtered dataset.
# NOTE: filtered_dataset is a generator, so this loop consumes it; re-create it before re-running.
for voice in tqdm(filtered_dataset, desc="Processing voices", unit="file"):
    # identify the character for the row
    speaker = voice['speaker']

    # Skip rows for characters that already have their full quota of files
    if character_counts[speaker] >= MAX_FILES_PER_CHARACTER:
        continue

    # create a character folder if needed
    character_folder = os.path.join("characters", speaker)
    os.makedirs(character_folder, exist_ok=True)

    # files are numbered from 0 in the order the rows are encountered
    audio_path = os.path.join(character_folder, f'{character_counts[speaker]}_audio.wav')

    # create the audio file in the appropriate character folder
    sf.write(audio_path, voice['audio']['array'], voice['audio']['sampling_rate'])

    # add one to the number of files a character has
    character_counts[speaker] += 1

    # Report once per character, when its quota is reached, rather than once per file;
    # the tqdm bar already shows overall progress and character_counts holds the rest.
    if character_counts[speaker] == MAX_FILES_PER_CHARACTER:
        print(f'{character_counts[speaker]} audio files for {speaker} done')

In [None]:
# Folder that holds one sub-folder per speaker.
# NOTE(review): the wav files are written to the relative path "characters", so this
# absolute path only points at them when the working directory is /content - confirm.
directory = '/content/characters'

# Every entry directly inside the folder (files and sub-folders alike)
items = os.listdir(directory)

# Keep only the entries that are sub-folders
directories = []
for item in items:
    if os.path.isdir(os.path.join(directory, item)):
        directories.append(item)

# Shown as the cell output in alphabetical order; `directories` itself is left unsorted
sorted(directories)

In [None]:
# Names of the playable characters; character folders are matched against this list.
# Each name appears exactly once.
playable = [
    "Albedo",
    "Alhaitham",
    "Aloy",
    "Amber",
    "Arataki Itto",
    "Baizhu",
    "Barbara",
    "Beidou",
    "Bennett",
    "Candace",
    "Chongyun",
    "Collei",
    "Cyno",
    "Dehya",
    "Diluc",
    "Diona",
    "Dori",
    "Eula",
    "Faruzan",
    "Fischl",
    "Freminet",
    "Ganyu",
    "Gorou",
    "Hu Tao",
    "Jean",
    "Kaedehara Kazuha",
    "Kaeya",
    "Kamisato Ayaka",
    "Kamisato Ayato",
    "Kaveh",
    "Keqing",
    "Kirara",
    "Klee",
    "Kujou Sara",
    "Kuki Shinobu",
    "Layla",
    "Lisa",
    "Lynette",
    "Lyney",
    "Mika",
    "Mona",
    "Nahida",
    "Nilou",
    "Ningguang",
    "Noelle",
    "Paimon",
    "Qiqi",
    "Raiden Shogun",
    "Razor",
    "Rosaria",
    "Sangonomiya Kokomi",
    "Sayu",
    "Shenhe",
    "Shikanoin Heizou",
    "Sucrose",
    "Tartaglia (Childe)",
    "Thoma",
    "Tighnari",
    "Traveler (Anemo)",
    "Traveler (Geo)",
    "Traveler (Electro)",
    "Traveler (Dendro)",
    "Venti",
    "Wanderer",
    "Xiangling",
    "Xiao",
    "Xingqiu",
    "Xinyan",
    "Yae Miko",
    "Yanfei",
    "Yaoyao",
    "Yelan",
    "Yoimiya",
    "Yun Jin",
    "Zhongli",
    "Charlotte",
    "Navia",
    "Clorinde",
    "Wriothesley",
    "Neuvillette",
    "Furina",
]


In [None]:
# Character folder names that match a playable character (populated by a later cell)
lst = []

In [None]:
# Keep every character folder whose name occurs (case-insensitively) inside the name of a
# playable character, e.g. a folder "Tartaglia" would match "Tartaglia (Childe)".
# `lst` is rebuilt from scratch here, so re-running this cell cannot accumulate repeats,
# and any() records a folder once even when it matches several playable entries.
# NOTE(review): substring matching also accepts a short folder name that merely happens to
# occur inside a longer playable name - check the resulting list by eye.
playable_lower = [name.lower() for name in playable]
lst = [
    folder for folder in directories
    if any(folder.lower() in name for name in playable_lower)
]

In [None]:
# a list of valid characters for our model to predict (aka the playable ones)
# set() collapses any repeated names in lst; sorted() makes the order reproducible between runs
directory_lst = sorted(set(lst))
# show how many distinct characters were kept (lst itself may contain repeats)
len(directory_lst)

In [1]:
import shutil

In [None]:
content_dir = '/content/characters'
all_directories = [d for d in os.listdir(content_dir) if os.path.isdir(os.path.join(content_dir, d))]

# Safety check: rmtree cannot be undone, and an empty keep-list would make the loop below
# delete every character folder that was downloaded.
if not directory_lst:
    raise RuntimeError("directory_lst is empty - refusing to delete every character folder")

# set membership keeps the filter below linear in the number of folders
folders_to_keep = set(directory_lst)
directories_to_delete = [d for d in all_directories if d not in folders_to_keep]

# remove all directories that are not playable characters
for dir_name in directories_to_delete:
    dir_path = os.path.join(content_dir, dir_name)
    shutil.rmtree(dir_path)

In [None]:
# get the total size of the character directory
def get_directory_size(directory):
    """Return the combined size, in bytes, of the files found under `directory`.

    The tree is traversed with os.walk, so files in nested sub-folders are counted too.
    Paths that are symbolic links are left out of the total.

    Args:
        directory: path of the folder to measure.

    Returns:
        The summed os.path.getsize of every non-link file (0 when there are none).
    """
    return sum(
        os.path.getsize(file_path)
        for root, _subdirs, file_names in os.walk(directory)
        for file_path in (os.path.join(root, file_name) for file_name in file_names)
        if not os.path.islink(file_path)
    )

# Sub-folders that currently exist inside the characters directory
all_directories = [
    d
    for d in os.listdir(content_dir)
    if os.path.isdir(os.path.join(content_dir, d))
]

# Print the size of each sub-folder and accumulate the grand total along the way
total_size = 0
for dir_name in all_directories:
    dir_path = os.path.join(content_dir, dir_name)
    size = get_directory_size(dir_path)
    print(f"Directory: {dir_path} - Size: {size} bytes")
    total_size = total_size + size

# The total is reported both in bytes and in MB (1 MB = 1024**2 bytes)
print(f"Total size of all directories in content folder: {total_size} bytes ({total_size / (1024**2):.2f} MB)")
# We end up with a size of 5GB which is significantly smaller than the 200GB dataset


In [None]:
from google.colab import files

# Folder holding the per-character wav files
folder_path = '/content/characters'

# make_archive appends ".zip" to the base name and returns the path of the archive it wrote,
# so the download path is taken from the return value rather than spelled out a second time
zip_file = shutil.make_archive('/content/characters', 'zip', folder_path)

# download the entire zip folder of our characters
files.download(zip_file)