In [1]:
import pandas as pd
import time
import re
import html
import lxml
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

In [2]:
character_list_url = 'https://dissidiacompendium.com/characters/?'

In [3]:
driver = webdriver.Chrome()

NoSuchDriverException: Message: Unable to obtain chromedriver using Selenium Manager; Message: Unsuccessful command executed: C:\Users\jasre\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\common\windows\selenium-manager.exe --browser chrome --output json.
error sending request for url (https://chromedriver.storage.googleapis.com/LATEST_RELEASE_115): error trying to connect: dns error: No such host is known. (os error 11001)
; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


# Next cells contain function definitions just to save time.

I'll organize these differently once I'm done developing this part of the scraper.

In [None]:
def generate_character_links(character_list_url):
    """

    Generates a dictionary with character names as the keys. Values for these keys are
    dictionaries, which contain links to the character's profile, ability, buff,
    high armor, and high armor plus pages.

    """

    driver.get(character_list_url)
    
    character_link_list = WebDriverWait(
        driver,
        timeout=10
    ).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "characterlink"))
    )
    
    character_dict_omnibus = {}
    
    for char_link in character_link_list:
        char_name = str(char_link.get_attribute("href").split('/')[-1])
        link_to_profile = str(char_link.get_attribute("href"))
        link_to_abilities = str(f"https://dissidiacompendium.com/characters/{char_name}/abilities?")
        link_to_buffs = str(f"https://dissidiacompendium.com/characters/{char_name}/buffs?")
        link_to_ha = str(f"https://dissidiacompendium.com/characters/{char_name}/gear?7A=true")
        link_to_ha_plus = str(f"https://dissidiacompendium.com/characters/{char_name}/gear?7APlus=true")
        
        char_dict = {
                'profile_url': link_to_profile,
                'abilities_url': link_to_abilities,
                'buffs_url': link_to_buffs,
                'high_armor_url': link_to_ha,
                'high_armor_plus_url': link_to_ha_plus
        }
        
        character_dict_omnibus[char_name] = char_dict

    return character_dict_omnibus

In [None]:
def generate_ability_dict(
    link_to_char_ability_page,  # Link to character's ability page
    scroll_speed = 1000,  # Scrolling speed to move through the page for lazy loading
    verbose = False  # If true, will return print statements on iterations
):
    """

    Parses a character's ability page to college abilities with HP attacks in them. The function
    returns an ability dictionary, where the keys are the ability names in human-readable
    format, and the values are the <div> block containing the number of BRV attacks, HP
    attacks, buffs granted, attack attributes, etc. of the ability

    """

    # driver.get(character_dict_omnibus['astos']['abilities_url'])
    driver.get(link_to_char_ability_page)

    time.sleep(5)
    
    
    try:
        list_build_complete = False
        
        count = 0
        
        while list_build_complete == False:
            
            driver.execute_script(f"window.scrollBy(0, {scroll_speed});")
            time.sleep(1)
            ability_list = driver.find_elements(By.XPATH, "//div[@class='infotitle abilitydisplayfex ']")
            
            # The last two abilities are calls. So, the second to last ability should be a call when we're done.
            match = re.search('\(C\)', ability_list[-2].text)
            list_build_complete = True if match else False
            
            if verbose:
                print(f"This iteration caught {len(ability_list)} abilities.")
            
                print('-----------')
            count += 1
            if count == 15:
                print("Too many iterations. Examine this function for:")
                print(link_to_char_ability_page)
                break
        
        if verbose:
            print(f"This took {count} iterations.\n")
        
            for ability in ability_list:
                print(ability.text)
    
        ability_info_list = driver.find_elements(By.XPATH, "//div[@class='bluebase abilityinfobase']")
    
        if verbose:
            print(f"Collected ability info list")
        
        ability_dict = {}
        
        count = 0
        
        for ability in ability_list:
            ability_name = str(ability_list[count].text.split(' - ')[0])
            
            ability_dict[ability_name] = ability_info_list[count]
            count += 1
    
        ability_dict['char_name'] = link_to_char_ability_page.split('/')[-2]
        print('Added char name to ability dict')
        
        return ability_dict
    except:
        print("Unable to access abilities for a character. Maybe not on GL yet.")
        return None

In [None]:
def prettify_html_to_list(html_string):
    """

    Parses a string of HTML (such as the results of .get_attribute('outerHTML') from Selenium) and 
    returns a list, where each list element is a line of the prettified HTML. 

    """
    
    soup = BeautifulSoup(html_string, 'lxml')

    return [line for line in soup.prettify().split('\n')]

In [None]:
def extract_ability_hp_attack_count(ability_dictionary):
    """

    Extracts the number of HP attacks dealt to an ability's main target and non-targets. The
    function input should be a value from an ability_dictionary (i.e., the div block corresponding 
    with the ability). 

    Returns a pandas dataframe with the ability name, number of HP attacks into main targets, 
    and number of HP attacks into non-targets.

    """

    if not ability_dictionary:
        print("No ability_dictionary available/provided.")
        return None
    
    df_row_list = []
    
    for ability_name, ability_div in ability_dictionary.items():
    
        if ability_name == 'char_name':
            continue
        
        ability_html_lines = prettify_html_to_list(ability_div.get_attribute('outerHTML'))
    
        row_dict = {}

        row_dict['ability_name'] = ability_name
        
        main_target_hp_attacks = 0
        non_target_hp_attacks = 0
        hp_dmg_cap_up_perc = 0
    
        for index, line in enumerate(ability_html_lines):
            
            # Extract HP Dmg Cap within ability and/or from FE
            
            if re.search("- MAX BRV Cap", line):
                hp_dmg_cap_up_perc += int(ability_html_lines[index + 6].strip().replace('%', ''))
            
            if re.search("MAX BRV Cap Up by", line):
                hp_dmg_cap_up_perc += int(ability_html_lines[index + 2].strip().replace('%', ''))
            
            # "inline HP" is class for the HP Attack icon
            if "inline HP" not in line:
                continue
            
            # Info on single-target vs group attack appears on preceding line and/or 3 lines before
            
            single_or_group_lines = ability_html_lines[index - 1] + ability_html_lines[index - 3] + ability_html_lines[index + 2]
            
            AOE = True if re.search(r"Group", single_or_group_lines) else False

            # Sometimes an inline appears after the inline(s) we care about to describe the source 
            # of the HP damage. We want to skip these instances.
            if re.search(r"Attack", ability_html_lines[index - 2]):
                continue
            
            # Info on HP attack count and type appears two lines later (e.g., Attack 3 times,
            # Damage to non-targets after each HP attack, etc.), with a few exceptions.

            # I hate hard coding an ability name like this, but we'll see if I can make it more
            # programmatic later, or make a list of abilities that operate with this format.
            
            if ability_name == 'Crystal Generation':
                attack_info_line = ability_html_lines[index + 6]
            else:
                attack_info_line = ability_html_lines[index + 2]

            extra_condition_line = ability_html_lines[index + 6]

            # Some abilities deal damage based on a stored value (e.g., Aerith BT effect, Astos)
            # For these abilities, the line we want appears eleven lines later
            if re.search(r"Damage by", attack_info_line) and re.search(r"of stored value from", extra_condition_line):
                attack_info_line = ability_html_lines[index + 11]
            
            # Other abilities deal damage based on a characters stat or current value (e.g., Aerith's LD followup)
            # For these abilities, the line we want appears six lines later
            if re.search(r"Damage ", attack_info_line) and re.search(r"of ", extra_condition_line):
                attack_info_line = ability_html_lines[index + 6]
        
            hp_attacks_to_add = 0
            add_to_non_target = 0
            copy_st_to_aoe = False
        
            if re.search(r"Damage to non-targets after each HP Attack", attack_info_line):
                copy_st_to_aoe = True
            elif re.search(r"Group \d+", attack_info_line):
                AOE = True
                hp_attacks_to_add = int(re.search(r"Group \d+ times", attack_info_line).group().split(' ')[1])
            elif re.search(r"Group", attack_info_line):
                AOE = True
                hp_attacks_to_add = 1
            elif re.search(r"to non-targets × \d+", attack_info_line):
                add_to_non_target = int(re.search(r"× \d+", attack_info_line).group().split(' ')[1])
            elif re.search(r"to non-targets \d+ times", attack_info_line):
                add_to_non_target = int(re.search(r"\d+ times", attack_info_line).group().split(' ')[0])
            elif re.search(r"to non-targets", attack_info_line):
                add_to_non_target = 1
            elif re.search(r"\d+ times", attack_info_line):
                hp_attacks_to_add = int(re.search("\d+ times", attack_info_line).group().split(' ')[0])
            else:
                hp_attacks_to_add = 1
        
            if AOE:
                main_target_hp_attacks += hp_attacks_to_add
                non_target_hp_attacks += hp_attacks_to_add
            elif copy_st_to_aoe:
                non_target_hp_attacks = main_target_hp_attacks
            else:
                main_target_hp_attacks += hp_attacks_to_add
                non_target_hp_attacks += add_to_non_target
        
        row_dict['main_target_hp_attacks'] = main_target_hp_attacks
        row_dict['non_target_hp_attacks'] = non_target_hp_attacks
        row_dict['hp_dmg_cap_up_perc'] = hp_dmg_cap_up_perc

        df_row_list.append(row_dict)

    ability_df = pd.DataFrame(df_row_list)

    ability_df['char_name'] = ability_dictionary['char_name']
    
    filtered_df = ability_df[~ability_df['ability_name'].str.contains('(C)', regex = False)].query(
        'main_target_hp_attacks > 0'
    ).reset_index(drop = True)

    return filtered_df[['char_name', 'ability_name', 'main_target_hp_attacks', 'non_target_hp_attacks', 'hp_dmg_cap_up_perc']]

In [None]:
def generate_test_case_ability_dfs(list_of_character_names):
    """

    Save out characters whose HP attack counts we're confident in. We can then use these
    test cases to see whether we've broken a previous character's df when we make changes.

    """

    for t_case in list_of_character_names:
        ability_dict = generate_ability_dict(character_link_dict[t_case]['abilities_url'])
    
        df = extract_ability_hp_attack_count(ability_dict)

        print(t_case.upper(), "test case df")
        display(df)
    
        df.to_csv(
            f"C:\\Users\\jasre\\Code\\dffoo-data-pipeline\\character_ability_test_cases\\{t_case}_ability_df.csv",
            index = False
        )

In [9]:
def test__recent_changes_have_not_altered_previous_ability_dfs(list_of_character_names):
    """

    Compares a newly-generated ability df to one that was generated in the past to see if
    recent changes have broken functionality for a previously-completed character.

    Accepted characters for now: 
        ['auron', 'sherlotta', 'aerith', 'lenna', 'warrioroflight', 'astos', 'paine']

    """
    
    broken_characters_list = []
    
    for t_case in list_of_character_names:
        new_ability_dict = generate_ability_dict(character_link_dict[t_case]['abilities_url'])

        new_df = extract_ability_hp_attack_count(new_ability_dict)

        try:
            old_df = pd.read_csv(
                f"C:\\Users\\jasre\\Code\\dffoo-data-pipeline\\character_ability_test_cases\\{t_case}_ability_df.csv"
            )
        except:
            print(f"Could not load a previous ability_df for {t_case.title()}.")
            print("Are you sure one was previously generated?")

            continue

        if len(old_df.compare(new_df)) > 0:
            broken_characters_list.append(t_case)

    if len(broken_characters_list) > 0:
        print("Broken ability_dfs were found.\n Returning list of characters to review.")
        return broken_characters_list
    else:
        print("No broken ability_dfs!")

# Goals for Scraper

- Pull character list
- Extract data from character sheet
- Organize data into sensible datasets for later use

## Pull Character List
Done! :)

## Extract Data from Character Sheet
1. Navigate to character's sheet
2. Pull data on all their attacks
3. Pull data on their HA
4. Pull data on their BT effect
6. Pull data on their FE

## Organize data into sensible datasets for later use
1. Character-Level
    1. BT Effect HP Cap Up
    2. HA HP Cap Up
2. Attacks (for each: # of split AoE attacks, full AoE attacks, ST attacks, and HP Cap Additions -- this will require info from FE)
   1. BRV(+) Attacks
   2. HP(+) Attacks
   3. S1
   4. S2
   5. EX
   6. LD
   7. BT
   8. FR

# Other Miscellaneous Notes
- For characters with a rework, parse their GL profile first. It doesn't look like you can easily toggle back to GL from JP.
- The best thing to do is probably to parse all of GL first, and then go back and parse all of JP.
- Write something that will check whether each character has a JP rework, and if it does, add them to a list of characters to parse through for reworks.

In [10]:
character_link_dict = generate_character_links(character_list_url)

In [36]:
ability_dict = generate_ability_dict(character_link_dict['lenna']['abilities_url'])

df = extract_ability_hp_attack_count(ability_dict)

df

Added char name to ability dict


Unnamed: 0,char_name,ability_name,main_target_hp_attacks,non_target_hp_attacks,hp_dmg_cap_up_perc
0,lenna,HP Attack+,1,0,0
1,lenna,Dragon Breath,3,3,10
2,lenna,Rapid Fire,3,3,20
3,lenna,Rapid Fire+,3,3,20
4,lenna,Wind Drake Arrow,1,1,20
5,lenna,Wind Drake Arrow+,1,1,20
6,lenna,Brave Phoenix,4,4,10
7,lenna,Goliath Tonic & Dragon Breath,3,3,15


In [29]:
prettify_html_to_list(ability_dict['Black Sky'].get_attribute('outerHTML'))

['<html>',
 ' <body>',
 '  <div class="bluebase abilityinfobase">',
 '   Grants 2 levels of',
 '   <img class="inline-buff" src="https://dissidiacompendium.com/images/static/icons/buff/buff_up2_framed.png"/>',
 '   <span class="unique">',
 '    [Dressphere]',
 '   </span>',
 '   for 6 turns',
 '   <br/>',
 '   3-Hit Group',
 '   <span class="inline Ranged">',
 '   </span>',
 '   <span class="inline BRV">',
 '   </span>',
 '   +',
 '   <span class="inline HP">',
 '   </span>',
 '   Attack 5 times',
 '   <br/>',
 '   BRV potency of',
 '   <span class="values">',
 '    80%',
 '   </span>',
 '   per hit',
 '   <br/>',
 '   Increases BRV damage to single targets by',
 '   <span class="values">',
 '    60%',
 '   </span>',
 '   <br/>',
 '   Deals Weakness BRV damage to',
 '   <img class="inline-buff" src="https://dissidiacompendium.com/images/static/icons/buff/debuff_persondown_framed.png"/>',
 '   <span class="unique">',
 '    [Status Break]',
 '   </span>',
 '   inflicted targets by',
 '  

In [41]:
test_case_list = ['auron', 'sherlotta', 'aerith', 'lenna', 'warrioroflight', 'astos', 'paine']

generate_test_case_ability_dfs(test_case_list)

Added char name to ability dict
AURON test case df


Unnamed: 0,char_name,ability_name,main_target_hp_attacks,non_target_hp_attacks,hp_dmg_cap_up_perc
0,auron,HP Attack+,1,1,0
1,auron,Banishing Blade,3,1,20
2,auron,Purgatory,2,2,20
3,auron,Weak Damage Up Crush,1,0,0
4,auron,Dragon Fang,4,4,10
5,auron,Tornado,4,4,15
6,auron,Counter,3,3,10


Added char name to ability dict
SHERLOTTA test case df


Unnamed: 0,char_name,ability_name,main_target_hp_attacks,non_target_hp_attacks,hp_dmg_cap_up_perc
0,sherlotta,Hurl Staff,1,0,0
1,sherlotta,Hurl Staff+,1,1,0
2,sherlotta,Chuck Staff,4,4,0
3,sherlotta,Crystal Generation,5,5,20
4,sherlotta,Crystal Ray,4,4,15
5,sherlotta,Final Crystal Core,2,2,400
6,sherlotta,Crystal Dice,4,4,260


Added char name to ability dict
AERITH test case df


Unnamed: 0,char_name,ability_name,main_target_hp_attacks,non_target_hp_attacks,hp_dmg_cap_up_perc
0,aerith,HP Attack+,1,0,0
1,aerith,HP Attack++,1,0,0
2,aerith,HP Attack+++,1,0,0
3,aerith,HP Attack (Seal Evil),3,3,0
4,aerith,Fury Brand Follow Up,7,0,10
5,aerith,Seal Evil,5,5,15
6,aerith,Additional attack from Seal Evil,2,2,15
7,aerith,Holy,7,7,400
8,aerith,Additional attack from White Materia's Brilliance,1,1,0


Added char name to ability dict
LENNA test case df


Unnamed: 0,char_name,ability_name,main_target_hp_attacks,non_target_hp_attacks,hp_dmg_cap_up_perc
0,lenna,HP Attack+,1,0,0
1,lenna,Dragon Breath,3,3,10
2,lenna,Rapid Fire,3,3,20
3,lenna,Rapid Fire+,3,3,20
4,lenna,Wind Drake Arrow,1,1,20
5,lenna,Wind Drake Arrow+,1,1,20
6,lenna,Brave Phoenix,4,4,10
7,lenna,Goliath Tonic & Dragon Breath,3,3,15


Added char name to ability dict
WARRIOROFLIGHT test case df


Unnamed: 0,char_name,ability_name,main_target_hp_attacks,non_target_hp_attacks,hp_dmg_cap_up_perc
0,warrioroflight,HP Attack+,1,0,0
1,warrioroflight,HP Attack++,1,0,0
2,warrioroflight,Shining Shield Follow Up,6,0,20
3,warrioroflight,Throw Buckler,4,4,20
4,warrioroflight,Shining Wave,4,4,10
5,warrioroflight,Bitter End,6,1,7
6,warrioroflight,Ultimate Shield,2,2,400
7,warrioroflight,Soul of Light,4,4,260


Added char name to ability dict
ASTOS test case df


Unnamed: 0,char_name,ability_name,main_target_hp_attacks,non_target_hp_attacks,hp_dmg_cap_up_perc
0,astos,HP Attack++,1,0,0
1,astos,Cremation,1,1,10
2,astos,HP Attack+,1,0,0
3,astos,Dark Thrust,10,0,20
4,astos,Dark Slash,5,5,20
5,astos,Dark Warhammer,9,1,10
6,astos,Dark Glory,7,7,15
7,astos,Flare Star,2,2,350
8,astos,Conflict Ultima,4,4,260


Added char name to ability dict
PAINE test case df


Unnamed: 0,char_name,ability_name,main_target_hp_attacks,non_target_hp_attacks,hp_dmg_cap_up_perc
0,paine,HP Attack+,1,0,0
1,paine,HP Attack++,1,0,0
2,paine,Break Attack,6,2,20
3,paine,Black Sky,5,5,20
4,paine,Sword Dance,3,3,10
5,paine,Sword Dance+,6,6,10
6,paine,Wild Throttle,8,2,15
7,paine,Gullwing Rush,2,2,400
8,paine,Dark Shroud,4,4,260


In [42]:
driver = Web

driver.get(character_link_dict['paine']['buffs_url'])



WebDriverException: Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=115.0.5790.110)
Stacktrace:
Backtrace:
	GetHandleVerifier [0x00D9A813+48355]
	(No symbol) [0x00D2C4B1]
	(No symbol) [0x00C35358]
	(No symbol) [0x00C27D96]
	(No symbol) [0x00C27AB9]
	(No symbol) [0x00C367C0]
	(No symbol) [0x00C8C4D8]
	(No symbol) [0x00C7A536]
	(No symbol) [0x00C582DC]
	(No symbol) [0x00C593DD]
	GetHandleVerifier [0x00FFAABD+2539405]
	GetHandleVerifier [0x0103A78F+2800735]
	GetHandleVerifier [0x0103456C+2775612]
	GetHandleVerifier [0x00E251E0+616112]
	(No symbol) [0x00D35F8C]
	(No symbol) [0x00D32328]
	(No symbol) [0x00D3240B]
	(No symbol) [0x00D24FF7]
	BaseThreadInitThunk [0x762F7D59+25]
	RtlInitializeExceptionChain [0x771AB79B+107]
	RtlClearBits [0x771AB71F+191]
