In [21]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
def get_urls_by_type():
    '''Scrapes arkhamdb and returns a list of urls pages containing card information
       on each card for each card type'''

    # call function to return list of url's by catagory
    print("Getting investigator URL's...")
    gators = get_cata_url('investigator')

    print("Getting asset URL's...")
    assets = get_cata_url('asset')

    print("Getting event URL's...")
    events = get_cata_url('event')

    print("Getting skill URL's...")
    skills = get_cata_url('skill')

    return gators, assets, events, skills

def get_urls():
    
    urls = []
    
    tipes = ['investigator','asset', 'event', 'skill']
    
    for tipe in tipes:
        
        print(f'Getting {tipe} URLs!')
        
        urls.extend(get_cata_urls(tipe))
        
    return urls


def get_cata_urls(catagory):
    ''' Takes in a string containing a player card faction, Scrapes arkhamdb, and
        Returns a list of urls for containing information on each card of that catagory'''

    full_results = []

    # defines number of results pages to scrape
    if catagory in ('investigator', 'skill'):
        
        pages = 1
        
    elif catagory == 'event':
        
        pages = 2
        
    elif catagory == 'asset':
        
        pages = 4

    # itterates through pages in catagory
    for page in range(1,pages+1):

        url = f'https://arkhamdb.com/find?q=t%3A{catagory}&view=list&sort=name&decks=player&page={page}'

        # create request and soup objects
        html = requests.get(url)

        soup = BeautifulSoup(html.content, 'html.parser')

        # locate urls on page
        urls = soup.find(id='list')

        urls = urls.find_all('a', class_='card-tip')

        # convert urls to string and make a list
        urls = [str(url['href']) for url in urls]
                
        full_results.extend(urls)
                
    return full_results

In [45]:
def get_title(soup): 
    
    title = soup.find('a', class_='card-name card-tip').text.replace('\n', '')
    
    try:
        
        subtitle = f" :{soup.find('div', class_='card-subname small').text}"
        
        
        
    except:
        
        subtitle = ''
    
    
    return title + subtitle


def get_faction(soup):
    
    faction = ''

    for item in soup.find_all('div', class_='card-faction'):

        faction += item.text.replace('\n', ' ')

    return faction.lower().strip()


def get_ability(soup, first_faction):
    
    try:
        
        ability = soup.find('div', class_=f'card-text border-{first_faction}').text.replace('effect', 'ELDER SIGN')
        
    except:
        
        ability = '--'
        
    return ability


def get_flavor(soup):
    
    try:
        
        flavor = soup.find('div', class_='card-flavor small')[1].text.replace('\n', '').replace('\t', '')

    except:
        
        flavor = '--'
        
    return flavor


def get_cost(soup):
    
    text = str(soup.find_all('div', class_='card-info-block'))

    try:
    
        text = re.search('Cost:\s+(\d)', text).group(1)
        
    except:
        
        text = 'X'
        
    return text


def get_xp(soup):
    
    text = str(soup.find_all('div', class_='card-info-block'))

    try:
        
        text = re.search('XP:\s+(\d)', text).group(1)
        
    except:
        
        text = '--'
        
    return text
    
    
def get_icons(soup):
    '''Takes in request results for an arkhamdb page containing player card data
        Returns a string represintation of test icons on the card'''
      
    icons = ''

    # list containing each icon type
    icon_types = ['wild', 'willpower', 'combat', 'agility', 'intellect']

    # itterate through icon types
    for stat in icon_types:

        # get number of that icon on card from request results
        num_icons = len(soup.find_all('span', class_=f'icon icon-large icon-{stat} color-{stat}'))

        # add that icon name to a string for each time it appears in request results
        for icon in range(num_icons):

            icons += f'{stat} '
            
    return icons.upper()[:-1]

In [38]:
def get_card_info(soup):
    
    # get descriptors common to all card types
    title = get_title(soup)

    faction = get_faction(soup)
    
    first_faction = faction.split(' ')[0] # for subsequent searches requiring faction

    tipe = soup.find('p', class_='card-type').text.replace('.','').lower()
    
    try:
    
        traits = soup.find('p', class_='card-traits').text
    
    except:
        
        traits = '--'
        
    ability = get_ability(soup, first_faction)

    artist = soup.find('div', class_='card-illustrator').text.replace('\n', '').replace('\t', '')

    expansion = soup.find('div', class_='card-pack').text.replace('\n', '').replace('\t', '').replace('.', '') 

    flavor = get_flavor(soup)
    
    # get sets of descriptors not common to all card types setmisssing values to "--"
    
    if tipe.split(' ')[0] in ('asset','event','skill'):
    
        icons = get_icons(soup)
    
        xp = get_xp(soup)
        
    else:
        
        icons = '--'
        
        xp = '--'
        
    if tipe.split(' ')[0] in ('asset','investigator'):
        
        try:
        
            health = re.search('Health:\s+(\d)', str(soup.find('div'))).group(1)
            
        except:
            
            health = '--'

        try:
            
            sanity = re.search('Sanity:\s+(\d)', str(soup.find('div'))).group(1)
            
        except:
            
            sanity = '--'
        
    else:
        
        health = '--'
        
        sanaty = '--'
            
    if tipe.split(' ')[0] in ('asset','event'):
        
        cost = get_cost(soup)
                        
    else:
        
        cost = '--'
        
        
    if tipe.split(' ')[0] in ('investigator'):
        
        willpower = soup.find('li', title='Willpower').text

        intellect = soup.find('li', title='Intellect').text

        combat = soup.find('li', title='Combat').text

        agility = soup.find('li', title='Agility').text

        health = re.search('Health:\s+(\d)', str(soup.find('div'))).group(1)

        sanity = re.search('Sanity:\s+(\d)', str(soup.find('div'))).group(1)

        deck_building = soup.find('div', class_=f'card-text border-{first_faction}').text.replace('\n', '')
        
    else:
        
        willpower = '--'
        intellect = '--'
        combat = '--'
        agility = '--'
        health = '--'
        sanity = '--'
        deck_building = '--'
        
    return [title,
            faction,
            tipe,
            cost,
            traits,
            ability,
            icons,
            willpower,
            intellect,
            combat,
            agility,
            health,
            sanity,
            xp,
            artist,
            expansion,
            flavor,
            deck_building]


def get_card_df(urls):
    '''Takes in urls for investigator cards and 
       Returns a df containing each cards information'''

    # dictionary with empty traits
    card_dict = {'title':[],
                 'faction':[],
                 'type':[],
                 'cost':[],
                 'traits':[],  
                 'ability':[],
                 'icons':[],  
                 'willpower':[],
                 'intellect':[],
                 'combat':[],
                 'agility':[],                       
                 'health':[],
                 'sanity':[],
                 'xp':[],
                 'artist':[],
                 'expansion':[],
                 'flavor':[],
                 'deck_building':[],
                 'url':[]}

    print("Getting card descriptors...")

    # for each url get player card info from that page and add each element to skill_traits
    for url in urls:

        # make html request to arkham db and parse using BS
        soup = get_soup(url)

        # get list of card elements card elements
        descriptors = get_card_info(soup)

        descriptors.append(url)

        print(f'Getting {descriptors[0]}!')

        # itterate through card element titles and add each to a dictionary
        for i, key in enumerate(card_dict):

            card_dict[key].append(descriptors[i])

    print("Making dataframe...")

    return pd.DataFrame(card_dict)


In [29]:
urls = get_urls()

Getting investigator URLs!
Getting asset URLs!
Getting event URLs!
Getting skill URLs!


In [39]:
df = get_card_df(urls)

Getting card descriptors...
Getting "Ashcan" Pete The Drifter!
Getting "Skids" O'Toole The Ex-Con!
Getting "Skids" O'Toole The Ex-Con!
Getting Agnes Baker The Waitress!
Getting Agnes Baker The Waitress!
Getting Akachi Onyele The Shaman!
Getting Alessandra Zorzi The Countess!
Getting Amanda Sharpe The Student!
Getting Amina Zidane The Operator!
Getting Bob Jenkins The Salesman!
Getting Calvin Wright The Haunted!
Getting Carolyn Fern The Psychologist!
Getting Carolyn Fern The Psychologist!
Getting Carson Sinclair The Butler!
Getting Charlie Kane The Politician!
Getting Daisy Walker The Librarian!
Getting Daisy Walker The Librarian!
Getting Daniela Reyes The Mechanic!
Getting Darrell Simmons The Photographer!
Getting Dexter Drake The Magician!
Getting Diana Stanley The Redeemed Cultist!
Getting Father Mateo The Priest!
Getting Finn Edwards The Bootlegger!
Getting Gloria Goldberg The Writer!
Getting Hank Samson The Farmhand!
Getting Harvey Walters The Professor!
Getting Jacqueline Fine The

Getting Dendromorphosis "Natural" Transformation!
Getting Detective's Colt 1911s !
Getting Devil Friend or Foe?!
Getting Dig Deep !
Getting Dig Deep !
Getting Dig Deep !
Getting Dig Deep !
Getting Directive Due Diligence!
Getting Directive Red Tape!
Getting Directive Consult Experts!
Getting Directive Seek the Truth!
Getting Directive Leave No Doubt!
Getting Dirty Fighting !
Getting Disc of Itzamna Protective Amulet!
Getting Disc of Itzamna Protective Amulet!
Getting Disc of Itzamna Protective Amulet!
Getting Discipline Alignment of Spirit!
Getting Discipline Quiescence of Thought!
Getting Discipline Prescience of Fate!
Getting Discipline Balance of Body!
Getting Disguise !
Getting Dissection Tools !
Getting Divination !
Getting Divination !
Getting Double, Double !
Getting Down the Rabbit Hole !
Getting Dowsing Rod !
Getting Dowsing Rod !
Getting Dr. Charles West III Knows His Purpose!
Getting Dr. Elli Horowitz Assistant Curator!
Getting Dr. Milan Christopher Professor of Entomology!


Getting Mists of R'lyeh !
Getting Mists of R'lyeh !
Getting Mitch Brown Sole Survivor!
Getting Mk 1 Grenades !
Getting Molly Maxwell The Exotic Morgana!
Getting Moon Pendant !
Getting Moonstone !
Getting Mouse Mask The Meek Watcher!
Getting Moxie !
Getting Moxie !
Getting Mr. "Rook" Dealer in Secrets!
Getting Mysterious Raven !
Getting Nephthys Huntress of Bast!
Getting Newspaper !
Getting Newspaper !
Getting Nightmare Bauble !
Getting Nine of Rods Every Trial a Lesson!
Getting Nkosi Mabati Enigmatic Warlock!
Getting Obfuscation !
Getting Observed !
Getting Obsidian Bracelet !
Getting Occult Lexicon !
Getting Occult Lexicon !
Getting Occult Reliquary Dubious Source!
Getting Occult Scraps !
Getting Ofuda !
Getting Old Book of Lore !
Getting Old Book of Lore !
Getting Old Book of Lore !
Getting Old Book of Lore !
Getting Old Hunting Rifle !
Getting Old Keyring !
Getting Old Keyring !
Getting Old Shotgun !
Getting Olive McBride Will Try Anything Once!
Getting Olive McBride !
Getting On Yo

Getting Underworld Support !
Getting Unscrupulous Loan !
Getting Until the End of Time !
Getting Vault of Knowledge !
Getting Venturer !
Getting Versatile !
Getting Vow of Drzytelech !
Getting Well Connected !
Getting Well Connected !
Getting Well Prepared !
Getting Wendy's Amulet !
Getting Wendy's Amulet !
Getting Wendy's Amulet !
Getting Whitton Greene Hunter of Rare Books!
Getting Whitton Greene Hunter of Rare Books!
Getting Wicked Athame Cursed Blade!
Getting Wish Eater Jewel of the Gods!
Getting Wither !
Getting Wither !
Getting Wolf Mask The Moon's Sire!
Getting Wounded Bystander On Death's Doorstep!
Getting Yaotl Lost Son of Eztli!
Getting Zeal !
Getting Zoey's Cross Symbol of Righteousness!
Getting Zoey's Cross Symbol of Righteousness!
Getting "Eat lead!" !
Getting "Eat lead!" !
Getting "Fool me once..." !
Getting "Get behind me!" !
Getting "Get over here!" !
Getting "Get over here!" !
Getting "Hit me!" !
Getting "I'll Pay You Back!" !
Getting "I'll see you in hell!" !
Getting 

Getting On the Lam !
Getting On the Lam !
Getting On the Lam !
Getting On the Trail !
Getting On the Trail !
Getting One in the Chamber !
Getting One-Two Punch !
Getting One-Two Punch !
Getting Oops! !
Getting Oops! !
Getting Oops! !
Getting Open Gate !
Getting Parallel Fates !
Getting Parallel Fates !
Getting Pay Day !
Getting Pay Your Due !
Getting Perseverance !
Getting Persuasion !
Getting Pilfer !
Getting Pilfer !
Getting Power Word !
Getting Practice Makes Perfect !
Getting Predator or Prey !
Getting Premonition !
Getting Prepared for the Worst !
Getting Prepared for the Worst !
Getting Preposterous Sketches !
Getting Preposterous Sketches !
Getting Preposterous Sketches !
Getting Protecting the Anirniq !
Getting Pushed to the Limit !
Getting Putrescent Rot !
Getting Quantum Flux !
Getting Quantum Paradox !
Getting Quick Getaway !
Getting Radiant Smite !
Getting Read the Signs !
Getting Read the Signs !
Getting Recharge !
Getting Recharge !
Getting Refine !
Getting Regurgitation 

In [41]:
df.to_csv('player_cards.csv',index = False)

In [43]:
df.type.value_counts()

event                             450
asset                             213
asset hand                        183
skill                             125
asset ally                        107
asset arcane                       89
asset accessory                    76
investigator                       70
asset hand x2                      33
asset body                         25
asset tarot                        12
asset hand arcane                   6
event basic weakness                4
event weakness                      4
skill weakness                      4
asset hand weakness                 3
asset ally weakness                 2
asset weakness                      2
asset ally arcane                   2
asset tarot basic weakness          2
asset arcane basic weakness         1
asset accessory basic weakness      1
asset body hand x2                  1
asset arcane x2                     1
asset hand basic weakness           1
asset basic weakness                1
asset hand x

In [5]:
gators, assets, events, skills = get_urls_by_type()

Getting investigator URL's...
Getting asset URL's...
Getting event URL's...
Getting skill URL's...


In [69]:
gator = get_soup('https://arkhamdb.com/card/04004')
asset = get_soup('https://arkhamdb.com/card/07305')
event = get_soup(events[15])
skill = get_soup(skills[56])

soups = [gator,asset,event,skill]

In [89]:
for soup in soups:
    
    print(get_card_info(soup))

['Father Mateo The Priest', 'mystic', 'investigator', 'Believer. Warden.', '\n When an investigator reveals an  chaos token: Cancel that token and treat it as an  token, instead. (Limit once per game.) ELDER SIGN: You automatically succeed. After this test ends, either (choose one): - Draw 1 card and gain 1 resource. - If it is your turn, you may take an additional action this turn.\n', 'Magali Villeneuve', 'The Forgotten Age #4', '--', '--', '--', '6', '8', '--', '     4', '     3', '     2', '     3', '6', '8', 'Deck Size: 30.Deckbuilding Options: Mystic cards () level 0-5, Blessed cards level 0-3, Neutral cards level 0-5.Deckbuilding Requirements (do not count toward deck size): The Codex of Ages, Serpents of Yig, 1 random basic weakness.Bonus Experience: You begin the campaign with 5 additional experience (does not affect the number of weaknesses you must take in Standalone Mode).']
['.25 Automatic ', 'rogue', 'asset hand', 'Item. Weapon. Firearm. Illicit.', '\nFast. Uses (4 ammo).

In [32]:
soup.find('div', class_='card-illustrator').text.replace('\n', '').replace('\t', '')

'Jeff Lee Johnson'

In [78]:
event.find('span', class_='card-type').text

AttributeError: 'NoneType' object has no attribute 'text'

In [55]:
gator.find('div', class_='card-pack').text.replace('\n', '').replace('\t', '').replace('.', '') 

'The Forgotten Age #4'

In [74]:
get_faction(asset).strip().lower()

'rogue'

In [None]:
.text.replace('\n', '').replace('\t', '')

In [56]:
faction = get_faction(asset).strip().lower()

first_faction = faction.split(' ')[0]

soup = asset.find('div', class_=f'card-text border-{first_faction}').text.replace('\n', '')

soup

'Uses (3 charges). If Blur has charges remaining: Evade. For this evasion attempt, you may use  instead of , and you get +1 skill value. If you succeed, spend 1 charge and you may take an additional action this turn. If you succeed by 0, take 1 damage.'

In [None]:
re.search('Sanity:\s+(\d)', str(gator.find('div'))).group(1

In [67]:
text = str(asset.find_all('div', class_='card-info-block'))

re.search('XP:\s+(\d)', text).group(1)

'1'

In [57]:
faction = get_faction(gator).lower().strip()
    
first_faction = faction.split(' ')[0]
    
gator.find_all('div', class_=f'card-text border-{first_faction}')[1].text.replace('\n', '')

'Deck Size: 30.Deckbuilding Options: Mystic cards () level 0-5, Blessed cards level 0-3, Neutral cards level 0-5.Deckbuilding Requirements (do not count toward deck size): The Codex of Ages, Serpents of Yig, 1 random basic weakness.Bonus Experience: You begin the campaign with 5 additional experience (does not affect the number of weaknesses you must take in Standalone Mode).'

In [72]:
get_icons(asset)

'COMBAT AGILITY'