In [116]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

In [96]:
def get_urls_by_type():
    '''Scrapes arkhamdb and returns a list of urls pages containing card information
       on each card for each card type'''

    # call function to return list of url's by catagory
    print("Getting investigator URL's...")
    gators = get_cata_url('investigator')

    print("Getting asset URL's...")
    assets = get_cata_url('asset')

    print("Getting event URL's...")
    events = get_cata_url('event')

    print("Getting skill URL's...")
    skills = get_cata_url('skill')

    return gators, assets, events, skills

def get_urls():
    
    urls = []
    
    tipes = ['investigator','asset', 'event', 'skill']
    
    for tipe in tipes:
        
        print(f'Getting {tipe} URLs!')
        
        urls.extend(get_cata_url(tipe))
        
    return urls


def get_cata_url(catagory):
    ''' Takes in a string containing a player card faction, Scrapes arkhamdb, and
        Returns a list of urls for containing information on each card of that catagory'''

    full_results = []

    # defines number of results pages to scrape
    if catagory in ('investigator', 'skill'):
        
        pages = 1
        
    elif catagory == 'event':
        
        pages = 2
        
    elif catagory == 'asset':
        
        pages = 4

    # itterates through pages in catagory
    for page in range(1,pages+1):

        url = f'https://arkhamdb.com/find?q=t%3A{catagory}&view=list&sort=name&decks=player&page={page}'

        # create request and soup objects
        html = requests.get(url)

        soup = BeautifulSoup(html.content, 'html.parser')

        # locate urls on page
        results = soup.find(id='list')

        results = results.find_all('a', class_='card-tip')

        # convert urls to string and make a list
        results = [str(result['href']) for result in results]
                
        full_results.extend(results)
                
    return full_results

def get_soup(url):
    '''Takes in a url for a card
       Returns html request result parsed using beautiful soup'''

    # create request and soup objects
    html = requests.get(url)

    soup = BeautifulSoup(html.content, 'html.parser')

    # locate urls on page and return
    return soup

In [108]:
def get_title(soup): 
    
    title = soup.find('a', class_='card-name card-tip').text.replace('\n', '')
    
    try:
        
        subtitle = soup.find('div', class_='card-subname small').text
        
    except:
        
        subtitle = ''
    
    
    return title + ' ' + subtitle


def get_faction(soup):
    
    faction = ''

    for item in soup.find_all('div', class_='card-faction'):

        faction += item.text.replace('\n', ' ')

    return faction.lower().strip()


def get_ability(soup, first_faction):
    
    try:
        
        ability = soup.find('div', class_=f'card-text border-{first_faction}').text.replace('effect', 'ELDER SIGN')
        
    except:
        
        ability = '--'
        
    return ability


def get_flavor(soup):
    
    try:
        
        flavor = results.find('div', class_='card-flavor small')[1].text.replace('\n', '').replace('\t', '')

    except:
        
        flavor = '--'
        
    return flavor


def get_cost(soup):
    
    text = str(asset.find_all('div', class_='card-info-block'))

    return re.search('Cost:\s+(\d)', text).group(1)


def get_xp(soup):
    
    text = str(asset.find_all('div', class_='card-info-block'))

    re.search('XP:\s+(\d)', text).group(1)
    
    
def get_icons(results):
    '''Takes in request results for an arkhamdb page containing player card data
        Returns a string represintation of test icons on the card'''
      
    icons = ''

    # list containing each icon type
    icon_types = ['wild', 'willpower', 'combat', 'agility', 'intellect']

    # itterate through icon types
    for stat in icon_types:

        # get number of that icon on card from request results
        num_icons = len(results.find_all('span', class_=f'icon icon-large icon-{stat} color-{stat}'))

        # add that icon name to a string for each time it appears in request results
        for icon in range(num_icons):

            icons += f'{stat} '
            
    return icons.upper()[:-1]

In [95]:
get_card_df(invest)

Getting investigator cards!
Getting asset cards!
Getting event cards!
Getting skill cards!


['https://arkhamdb.com/card/02005',
 'https://arkhamdb.com/card/01003',
 'https://arkhamdb.com/card/01503',
 'https://arkhamdb.com/card/01004',
 'https://arkhamdb.com/card/01504',
 'https://arkhamdb.com/card/03004',
 'https://arkhamdb.com/card/10009',
 'https://arkhamdb.com/card/07002',
 'https://arkhamdb.com/card/09011',
 'https://arkhamdb.com/card/08016',
 'https://arkhamdb.com/card/04005',
 'https://arkhamdb.com/card/05001',
 'https://arkhamdb.com/card/98010',
 'https://arkhamdb.com/card/09001',
 'https://arkhamdb.com/card/09018',
 'https://arkhamdb.com/card/01002',
 'https://arkhamdb.com/card/01502',
 'https://arkhamdb.com/card/08001',
 'https://arkhamdb.com/card/09015',
 'https://arkhamdb.com/card/07004',
 'https://arkhamdb.com/card/05004',
 'https://arkhamdb.com/card/04004',
 'https://arkhamdb.com/card/04003',
 'https://arkhamdb.com/card/98019',
 'https://arkhamdb.com/card/10015',
 'https://arkhamdb.com/card/60201',
 'https://arkhamdb.com/card/60401',
 'https://arkhamdb.com/card/

In [114]:
def get_card_info(soup):
    
    # get descriptors common to all card types
    title = get_title(soup)

    faction = get_faction(soup)
    
    first_faction = faction.split(' ')[0] # for subsequent searches requiring faction

    tipe = soup.find('p', class_='card-type').text.replace('.','').lower()
    
    try:
    
        traits = soup.find('p', class_='card-traits').text
    
    except:
        
        traits = '--'
        
    ability = get_ability(soup, first_faction)

    artist = soup.find('div', class_='card-illustrator').text.replace('\n', '').replace('\t', '')

    expansion = soup.find('div', class_='card-pack').text.replace('\n', '').replace('\t', '').replace('.', '') 

    flavor = get_flavor(soup)
    
    # get sets of descriptors not common to all card types setmisssing values to "--"
    
    if tipe.split(' ')[0] in ('asset','event','skill'):
    
        icons = get_icons(soup)
    
        xp = get_xp(soup)
        
    else:
        
        icons = '--'
        
        xp = '--'
        
    if tipe.split(' ')[0] in ('asset','investigator'):
        
        try:
        
            health = re.search('Health:\s+(\d)', str(soup.find('div'))).group(1)
            
        except:
            
            health = '--'

        try:
            
            sanity = re.search('Sanity:\s+(\d)', str(soup.find('div'))).group(1)
            
        except:
            
            sanity = '--'
        
    else:
        
        health = '--'
        
        sanaty = '--'
            
    if tipe.split(' ')[0] in ('asset','event'):
        
        cost = get_cost(soup)
                        
    else:
        
        cost = '--'
        
        
    if tipe.split(' ')[0] in ('investigator'):
        
        willpower = soup.find('li', title='Willpower').text

        intellect = soup.find('li', title='Intellect').text

        combat = soup.find('li', title='Combat').text

        agility = soup.find('li', title='Agility').text

        health = re.search('Health:\s+(\d)', str(soup.find('div'))).group(1)

        sanity = re.search('Sanity:\s+(\d)', str(soup.find('div'))).group(1)

        deck_building = soup.find('div', class_=f'card-text border-{first_faction}').text.replace('\n', '')
        
    else:
        
        willpower = '--'
        intellect = '--'
        combat = '--'
        agility = '--'
        health = '--'
        sanity = '--'
        deck_building = '--'
        
    return [title,
            faction,
            tipe,
            cost,
            traits,
            ability,
            icons,
            willpower,
            intellect,
            combat,
            agility,
            health,
            sanity,
            xp,
            artist,
            expansion,
            flavor,
            deck_building]


def get_card_df(urls):
    '''Takes in urls for investigator cards and 
       Returns a df containing each cards information'''

    # dictionary with empty traits
    card_dict = {'title':[],
                 'faction':[],
                 'type':[],
                 'cost':[],
                 'traits':[],  
                 'ability':[],
                 'icons':[],  
                 'willpower':[],
                 'intellect':[],
                 'combat':[],
                 'agility':[],                       
                 'health':[],
                 'sanity':[],
                 'xp':[],
                 'artist':[],
                 'expansion':[],
                 'flavor':[],
                 'deck_building':[],
                 'url':[]}

    print("Getting card descriptors...")

    # for each url get player card info from that page and add each element to skill_traits
    for url in urls:

        # make html request to arkham db and parse using BS
        soup = get_soup(url)

        # get list of card elements card elements
        descriptors = get_card_info(soup)

        descriptors.append(url)

        print(f'Getting {descriptors[0]}...')

        # itterate through card element titles and add each to a dictionary
        for i, key in enumerate(card_dict):

            card_dict[key].append(descriptors[i])

    print("Making dataframe...")

    return pd.DataFrame(investigator_dict)


In [97]:
urls = get_urls()

Getting investigator URLs!
Getting asset URLs!
Getting event URLs!
Getting skill URLs!


In [117]:
df = get_card_df(urls)

Getting card descriptors...
Getting "Ashcan" Pete The Drifter...
Getting "Skids" O'Toole The Ex-Con...
Getting "Skids" O'Toole The Ex-Con...
Getting Agnes Baker The Waitress...
Getting Agnes Baker The Waitress...
Getting Akachi Onyele The Shaman...
Getting Alessandra Zorzi The Countess...
Getting Amanda Sharpe The Student...
Getting Amina Zidane The Operator...
Getting Bob Jenkins The Salesman...
Getting Calvin Wright The Haunted...
Getting Carolyn Fern The Psychologist...
Getting Carolyn Fern The Psychologist...
Getting Carson Sinclair The Butler...
Getting Charlie Kane The Politician...
Getting Daisy Walker The Librarian...
Getting Daisy Walker The Librarian...
Getting Daniela Reyes The Mechanic...
Getting Darrell Simmons The Photographer...
Getting Dexter Drake The Magician...
Getting Diana Stanley The Redeemed Cultist...
Getting Father Mateo The Priest...
Getting Finn Edwards The Bootlegger...
Getting Gloria Goldberg The Writer...
Getting Hank Samson The Farmhand...
Getting Harvey 

Getting Dario El-Amin Unscrupulous Investor...
Getting Dark Horse ...
Getting Dark Horse ...
Getting Dark Ritual ...
Getting Darrell's Kodak Proof in the Pudding...
Getting David Renfield Esteemed Eschatologist...
Getting Dayana Esperence Deals with "Devils"...
Getting De Vermis Mysteriis Signs of the Black Stars...
Getting Death • XIII Free from the Past...
Getting Decorated Skull Doom Begets Doom...
Getting Decorated Skull Doom Begets Doom...
Getting Déjà Vu ...
Getting Delilah O'Rourke Syndicate Assassin...
Getting Dendromorphosis "Natural" Transformation...
Getting Detective's Colt 1911s ...
Getting Devil Friend or Foe?...
Getting Dig Deep ...
Getting Dig Deep ...
Getting Dig Deep ...
Getting Dig Deep ...
Getting Directive Due Diligence...
Getting Directive Red Tape...
Getting Directive Consult Experts...
Getting Directive Seek the Truth...
Getting Directive Leave No Doubt...
Getting Dirty Fighting ...
Getting Disc of Itzamna Protective Amulet...
Getting Disc of Itzamna Protective 

Getting Lone Wolf ...
Getting Lonnie Ritter Feisty Mechanic...
Getting Lucky Cigarette Case ...
Getting Lucky Cigarette Case ...
Getting Lucky Cigarette Case ...
Getting Lucky Dice ...Or Are They?...
Getting Lucky Dice ...Or Are They?...
Getting Lupara ...
Getting M1918 BAR ...
Getting Machete ...
Getting Machete ...
Getting Madame Labranche Mysterious Benefactress...
Getting Magnifying Glass ...
Getting Magnifying Glass ...
Getting Magnifying Glass ...
Getting Magnifying Glass ...
Getting Maimed Hand ...
Getting Mariner's Compass ...
Getting Mariner's Compass ...
Getting Martyr's Vambrace Remnant of the Unknown...
Getting Matchbox ...
Getting Mauser C96 ...
Getting Mauser C96 ...
Getting Meat Cleaver ...
Getting Mechanic's Wrench ...
Getting Medical Student ...
Getting Medical Texts ...
Getting Medical Texts ...
Getting Medical Texts ...
Getting Michael Leigh Experienced Hunter...
Getting Microscope ...
Getting Microscope ...
Getting Mind's Eye ...
Getting Miskatonic Archaeology Fundi

Getting The Hierophant • V Your True Master Awaits...
Getting The Hungering Blade Calamitous Blade of Celephaïs...
Getting The Key of Solomon Secrets of the Unknown...
Getting The King in Yellow Act 1...
Getting The Moon • XVIII Message from Your Inner Self...
Getting The Necronomicon John Dee Translation...
Getting The Necronomicon John Dee Translation...
Getting The Necronomicon Petrus de Dacia Translation...
Getting The Necronomicon John Dee Translation...
Getting The Red Clock Broken but Reliable...
Getting The Red Clock Broken but Reliable...
Getting The Red-Gloved Man He Was Never There...
Getting The Silver Moth ...
Getting The Skeleton Key ...
Getting The Star • XVII You Have Been Chosen...
Getting The Tower • XVI Circumstances Beyond Your Control...
Getting The World • XXI The Journey is Complete...
Getting Thermos ...
Getting Thieves' Kit ...
Getting Thieves' Kit ...
Getting Tidal Memento ...
Getting Timeworn Brand ...
Getting Token of Faith ...
Getting Token of Faith ...
Get

Getting Grift ...
Getting Grit Your Teeth ...
Getting Guidance ...
Getting Guidance ...
Getting Guided by Faith ...
Getting Hallow ...
Getting Hand of Fate ...
Getting Hand-Eye Coordination ...
Getting Harmony Restored ...
Getting Heed the Dream ...
Getting Heroic Rescue ...
Getting Heroic Rescue ...
Getting Hidden Pocket ...
Getting Hiding Spot ...
Getting Hit and Run ...
Getting Hold Up ...
Getting Honed Instinct ...
Getting Hot Streak ...
Getting Hot Streak ...
Getting Hot Streak ...
Getting Hypnotic Gaze ...
Getting Hypnotic Gaze ...
Getting Hypnotic Gaze ...
Getting Impromptu Barrier ...
Getting Improvisation ...
Getting Improvised Weapon ...
Getting In the Shadows ...
Getting Infighting ...
Getting Intel Report ...
Getting Interrogate ...
Getting Join the Caravan ...
Getting Jury-Rig ...
Getting Keep Faith ...
Getting Keep Faith ...
Getting Kicking the Hornet's Nest ...
Getting Knowledge is Power ...
Getting Lesson Learned ...
Getting Lifeline ...
Getting Live and Learn ...
Getti

Getting Overpower ...
Getting Perception ...
Getting Perception ...
Getting Perception ...
Getting Persistence ...
Getting Plan of Action ...
Getting Predestined ...
Getting Prescient ...
Getting Promise of Power ...
Getting Prophesy ...
Getting Providential ...
Getting Purified ...
Getting Quick Thinking ...
Getting Reckless ...
Getting Reckless Assault ...
Getting Resourceful ...
Getting Rise to the Occasion ...
Getting Rise to the Occasion ...
Getting Run For Your Life ...
Getting Savant ...
Getting Say Your Prayers ...
Getting Seal of the Elder Sign ...
Getting Self-Sacrifice ...
Getting Sharp Vision ...
Getting Signum Crucis ...
Getting Skeptic ...
Getting Steadfast ...
Getting Strength in Numbers ...
Getting Stroke of Luck ...
Getting Strong-Armed ...
Getting Stunning Blow ...
Getting Surprising Find ...
Getting Survey the Area ...
Getting Survival Instinct ...
Getting Survival Instinct ...
Getting Survival Instinct ...
Getting Take Heart ...
Getting Take Heart ...
Getting Take t

NameError: name 'investigator_dict' is not defined

In [None]:
df

In [5]:
gators, assets, events, skills = get_urls_by_type()

Getting investigator URL's...
Getting asset URL's...
Getting event URL's...
Getting skill URL's...


In [69]:
gator = get_soup('https://arkhamdb.com/card/04004')
asset = get_soup('https://arkhamdb.com/card/07305')
event = get_soup(events[15])
skill = get_soup(skills[56])

soups = [gator,asset,event,skill]

In [89]:
for soup in soups:
    
    print(get_card_info(soup))

['Father Mateo The Priest', 'mystic', 'investigator', 'Believer. Warden.', '\n When an investigator reveals an  chaos token: Cancel that token and treat it as an  token, instead. (Limit once per game.) ELDER SIGN: You automatically succeed. After this test ends, either (choose one): - Draw 1 card and gain 1 resource. - If it is your turn, you may take an additional action this turn.\n', 'Magali Villeneuve', 'The Forgotten Age #4', '--', '--', '--', '6', '8', '--', '     4', '     3', '     2', '     3', '6', '8', 'Deck Size: 30.Deckbuilding Options: Mystic cards () level 0-5, Blessed cards level 0-3, Neutral cards level 0-5.Deckbuilding Requirements (do not count toward deck size): The Codex of Ages, Serpents of Yig, 1 random basic weakness.Bonus Experience: You begin the campaign with 5 additional experience (does not affect the number of weaknesses you must take in Standalone Mode).']
['.25 Automatic ', 'rogue', 'asset hand', 'Item. Weapon. Firearm. Illicit.', '\nFast. Uses (4 ammo).

In [32]:
soup.find('div', class_='card-illustrator').text.replace('\n', '').replace('\t', '')

'Jeff Lee Johnson'

In [78]:
event.find('span', class_='card-type').text

AttributeError: 'NoneType' object has no attribute 'text'

In [55]:
gator.find('div', class_='card-pack').text.replace('\n', '').replace('\t', '').replace('.', '') 

'The Forgotten Age #4'

In [74]:
get_faction(asset).strip().lower()

'rogue'

In [None]:
.text.replace('\n', '').replace('\t', '')

In [56]:
faction = get_faction(asset).strip().lower()

first_faction = faction.split(' ')[0]

soup = asset.find('div', class_=f'card-text border-{first_faction}').text.replace('\n', '')

soup

'Uses (3 charges). If Blur has charges remaining: Evade. For this evasion attempt, you may use  instead of , and you get +1 skill value. If you succeed, spend 1 charge and you may take an additional action this turn. If you succeed by 0, take 1 damage.'

In [None]:
re.search('Sanity:\s+(\d)', str(gator.find('div'))).group(1

In [67]:
text = str(asset.find_all('div', class_='card-info-block'))

re.search('XP:\s+(\d)', text).group(1)

'1'

In [57]:
faction = get_faction(gator).lower().strip()
    
first_faction = faction.split(' ')[0]
    
gator.find_all('div', class_=f'card-text border-{first_faction}')[1].text.replace('\n', '')

'Deck Size: 30.Deckbuilding Options: Mystic cards () level 0-5, Blessed cards level 0-3, Neutral cards level 0-5.Deckbuilding Requirements (do not count toward deck size): The Codex of Ages, Serpents of Yig, 1 random basic weakness.Bonus Experience: You begin the campaign with 5 additional experience (does not affect the number of weaknesses you must take in Standalone Mode).'

In [72]:
get_icons(asset)

'COMBAT AGILITY'