In [7]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

In [8]:
def get_urls_by_type():
    '''Collects the arkhamdb card page urls for every player card type

       Returns four lists of urls, in this order:
       investigators, assets, events, skills'''

    url_lists = []

    # scrape one card type at a time, announcing each type before its request
    for card_type in ('investigator', 'asset', 'event', 'skill'):

        print(f"Getting {card_type} URL's...")

        url_lists.append(get_cata_urls(card_type))

    investigator_urls, asset_urls, event_urls, skill_urls = url_lists

    return investigator_urls, asset_urls, event_urls, skill_urls


def get_cata_urls(catagory):
    ''' Takes in a string naming a player card type ('investigator', 'asset',
        'event' or 'skill'), Scrapes the arkhamdb search results for that type, and
        Returns a list of the card urls (href strings) found on the results pages

        Raises ValueError if catagory is not one of the supported card types'''

    # number of results pages to scrape for each supported card type
    # NOTE(review): these counts are hard-coded; confirm they still cover every
    # results page the site returns for each type
    pages_by_catagory = {
        'investigator': 1,
        'skill': 1,
        'event': 2,
        'asset': 4,
    }

    # fail with a clear message instead of reaching the loop with no page count
    if catagory not in pages_by_catagory:

        supported = ', '.join(sorted(pages_by_catagory))

        raise ValueError(f'unknown card type {catagory!r}; expected one of: {supported}')

    full_results = []

    # itterates through pages in catagory (results pages are numbered from 1)
    for page in range(1, pages_by_catagory[catagory] + 1):

        url = f'https://arkhamdb.com/find?q=t%3A{catagory}&view=list&sort=name&decks=player&page={page}'

        # create request and soup objects; the timeout keeps a stalled
        # connection from hanging the notebook, and an HTTP error status is
        # raised here rather than parsed as if it were a results page
        html = requests.get(url, timeout=30)

        html.raise_for_status()

        soup = BeautifulSoup(html.content, 'html.parser')

        # locate the card links inside the element with id="list"
        links = soup.find(id='list').find_all('a', class_='card-tip')

        # keep only the link targets, as plain strings
        full_results.extend(str(link['href']) for link in links)

    return full_results


def get_soup(url, timeout=30):
    '''Takes in a url as a string (and an optional request timeout in seconds)
       Returns html request result parsed using beautiful soup

       Raises requests.HTTPError if the server answers with an error status'''

    # NOTE(review): get_soup is defined again in a later cell of this notebook;
    # the later definition replaces this one once it runs, so keep them in sync

    # create request and soup objects; the timeout keeps a stalled connection
    # from hanging the notebook
    html = requests.get(url, timeout=timeout)

    # surface 4xx/5xx responses instead of parsing an error page as a card
    html.raise_for_status()

    soup = BeautifulSoup(html.content, 'html.parser')

    return soup

In [29]:
def get_health_sanity(soup):
    '''Takes in html object parsed by BeautifulSoup
       Returns health and sanity as a pair of strings

       Each value is the run of digits following "Health:" / "Sanity:" in the
       first <div> of the page, or '--' when that label is not found'''

    # both stats are searched for in the text of the first <div>, so look it up
    # and stringify it once
    try:

        block_text = str(soup.find('div'))

    except AttributeError:

        # soup is not a parsed page (e.g. None): report both stats as missing
        block_text = ''

    def _stat(label):
        # digits after "<label>:", or '--' when the label is absent
        match = re.search(rf'{label}:\s+(\d+)', block_text)

        return match.group(1) if match else '--'

    health = _stat('Health')

    sanity = _stat('Sanity')

    return health, sanity

In [9]:
gators, assets, events, skills = get_urls_by_type()

Getting investigator URL's...
Getting asset URL's...
Getting event URL's...
Getting skill URL's...


In [50]:
soup = get_soup('https://arkhamdb.com/card/01095')

soup

<!DOCTYPE html>

<html>
<head>
<title>Elder Sign Amulet · ArkhamDB</title>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="yes" name="mobile-web-app-capable"/>
<link href="/icon-192.png" rel="icon" sizes="192x192"/>
<link href="/icon-120.png" rel="apple-touch-icon" sizes="120x120"/>
<meta content="Elder Sign Amulet · ArkhamDB" property="og:title"> <meta content="" property="og:description"> <meta content="https://arkhamdb.com/bundles/cards/01095.png" property="og:image">
<meta content="summary_large_image" name="twitter:card"/>
<link href="https://fonts.googleapis.com/css?family=Amiri:400,400italic,700,700italic|Julius+Sans+One|Open+Sans:400,400italic,700,700italic|Open+Sans+Condensed:300" rel="stylesheet" type="text/css"/>
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.4.0/css/font-awesome.min.css" rel="stylesheet"/>
<link href="https://cdnjs.cloudflare.com/ajax/libs/qtip2/2.1.1/jquery.qtip.css" rel="sty

In [51]:
soup.find('p', class_='card-type')

<p class="card-type">Asset. Accessory</p>

In [52]:
soup.find('p', class_='card-type').text

'Asset. Accessory'

In [53]:
soup.find('p', class_='card-type').text.replace('.','')

'Asset Accessory'

In [54]:
soup.find('p', class_='card-type').text.replace('.','').replace('Hand x2','handx2')

'Asset Accessory'

In [55]:
# lower-case before fusing 'hand x2', otherwise a capitalised 'Hand x2' is never matched
soup.find('p', class_='card-type').text.replace('.','').lower().replace('hand x2','handx2').strip()

'asset accessory'

In [56]:
# normalise the card-type line: drop periods, lower-case, then fuse 'hand x2'
# into a single token so that splitting on spaces keeps it together
# (lower-casing has to come first or a capitalised 'Hand x2' is never matched)
tipe = soup.find('p', class_='card-type').text.replace('.','').lower().replace('hand x2','handx2').strip()

tipe.split(' ')

['asset', 'accessory']

In [57]:
if tipe.split(' ')[0] in ('asset','investigator'):
    
    health, sanity = get_health_sanity(soup)

In [58]:
health, sanity = get_health_sanity(soup)

In [59]:
health

'--'

In [60]:
sanity

'4'

In [61]:
# raw string so \s and \d reach the regex engine instead of being invalid string escapes
sanity = re.search(r'Sanity:\s+(\d+)', str(soup.find('div'))).group(1).strip()

In [25]:
df.to_csv('player_cards.csv',index = False)

In [27]:
df.ability

0       You begin the game with Duke in play.</p><p>FA...
1       FAST ACTION During your turn, spend 2 resource...
2       FAST ACTION During your turn, spend 2 resource...
3       REACTION After 1 or more horror is placed on A...
4       REACTION After 1 or more horror is placed on A...
                              ...                        
1418    If this skill test is successful during an att...
1419    If this skill test is successful during an att...
1420    While Well-Dressed is committed to a skill tes...
1421    While you control a <b><i>Science</i></b> or <...
1422    This skill's icons subtract from your skill va...
Name: ability, Length: 1423, dtype: object

In [12]:
gators, assets, events, skills = get_urls_by_type()

Getting investigator URL's...


NameError: name 'get_cata_url' is not defined

In [13]:
def get_soup(url, timeout=30):
    '''Takes in a url as a string (and an optional request timeout in seconds)
       Returns html request result parsed using beautiful soup

       Raises requests.HTTPError if the server answers with an error status'''

    # NOTE(review): get_soup is also defined in an earlier cell of this
    # notebook; this later definition is the one in effect once the cell runs,
    # so keep the two in sync or remove one of them

    # create request and soup objects; the timeout keeps a stalled connection
    # from hanging the notebook
    html = requests.get(url, timeout=timeout)

    # surface 4xx/5xx responses instead of parsing an error page as a card
    html.raise_for_status()

    soup = BeautifulSoup(html.content, 'html.parser')

    return soup


def get_text_for_icon(soup):
    '''Takes in request response as a string
       replaces html code indicating a game icon with a text representation
       Returns string with replacements'''

    # icons whose span title is simply the capitalized icon name; each name is
    # also tried as the border suffix of the card-text wrapper div
    icon_types = (
                  'action',
                  'reaction',
                  'wild',
                  'willpower',
                  'combat',
                  'agility',
                  'intellect',
                  'curse',
                  'bless',
                  'rogue',
                  'survivor',
                  'seeker',
                  'guardian',
                  'mystic',
                  'neutral',
                  'skull',
                  'tablet',
                  'cultist',
                  'elder sign')

    for icon in icon_types:

        # replace icon html with matching word in all caps
        soup = soup.replace(f'<span class="icon-{icon}" title="{icon.capitalize()}"></span>', icon.upper())

        # drop the opening of the card-text wrapper so only the text remains
        soup = soup.replace(f'<div class="card-text border-{icon}">\n<p>', '')

    # icons whose class, title or replacement text do not follow the pattern above
    special_icons = (
        ('<span class="icon-wild" title="Any Skill"></span>', 'WILD'),
        ('<span class="icon-elder_sign" title="Elder Sign"></span>', 'ELDER_SIGN'),
        # legacy pattern kept as-is: it pairs the elder_sign class with the
        # Elder Thing title
        ('<span class="icon-elder_sign" title="Elder Thing"></span>', 'ELDER_THING'),
        # NOTE(review): the Elder Thing icon presumably carries its own
        # icon-elder_thing class - confirm against the site markup
        ('<span class="icon-elder_thing" title="Elder Thing"></span>', 'ELDER_THING'),
        ('<span class="icon-lightning" title="Fast Action"></span>', 'FAST_ACTION'),
        ('<span class="icon-auto_fail" title="Auto Fail"></span>', 'TENTACLES'),
    )

    for icon_html, icon_text in special_icons:

        soup = soup.replace(icon_html, icon_text)

    return soup

In [14]:
gator = get_soup('https://arkhamdb.com/card/02006')
asset = get_soup('https://arkhamdb.com/card/05151')
#event = get_soup(events[15])
#skill = get_soup(skills[56])



In [29]:
faction = get_faction(gator)

first_faction = faction.split(' ')[0]

get_text_for_icon(str(gator.find('div', class_=f'card-text border-{first_faction}')))

"Zoey Samaras deck only.</p><p>REACTION After an enemy becomes engaged with you, exhaust Zoey's Cross and spend 1 resource: Deal 1 damage to that enemy.</p>\n</div>"

In [None]:
.text.replace('effect', 'ELDER SIGN')

In [57]:
gator.find('p', class_='card-type').text
   

'Asset. Accessory'

In [30]:
asset.find('div', class_='card-flavor small').text.replace('\n', '').replace('\t', '')

'"Please leave this to the professionals. If I find you poking around again, I will bring you in."'

In [46]:
gator.find_all('div', class_='card-flavor small')[1].text.replace('\n', '').replace('\t', '')

"Mateo Castile's life has not been easy since he became a priest. Recent decades in Mexico have been wracked with instability and conflict. Father Mateo struggled to balance his faith with the pragmatic concerns of preaching when the law told him he must not. But this is not what has assaulted Mateo's faith at its core. The nearby murders and kidnappings were not politically motivated, as he had suspected. A gruesome and horrid cult thrives at the heart of the brewing war. How, Mateo wonders, could a kind and loving God allow what he saw that night to exist?"

In [33]:
asset.find('span', class_='card-type')

AttributeError: 'NoneType' object has no attribute 'text'

In [30]:
'asset hand x2'.replace('hand x2','handx2')

'asset handx2'

In [55]:
gator.find('div', class_='card-pack').text.replace('\n', '').replace('\t', '').replace('.', '') 

'The Forgotten Age #4'

In [74]:
get_faction(asset).strip().lower()

'rogue'

In [None]:
.text.replace('\n', '').replace('\t', '')

In [56]:
faction = get_faction(asset).strip().lower()

first_faction = faction.split(' ')[0]

soup = asset.find('div', class_=f'card-text border-{first_faction}').text.replace('\n', '')

soup

'Uses (3 charges). If Blur has charges remaining: Evade. For this evasion attempt, you may use  instead of , and you get +1 skill value. If you succeed, spend 1 charge and you may take an additional action this turn. If you succeed by 0, take 1 damage.'

In [None]:
# closing parenthesis restored; raw string and \d+ match the pattern used in get_health_sanity
re.search(r'Sanity:\s+(\d+)', str(gator.find('div'))).group(1)

In [67]:
# stringify every card-info block so the XP label can be searched as plain text
text = str(asset.find_all('div', class_='card-info-block'))

# raw string so \s and \d reach the regex engine instead of being invalid string escapes
re.search(r'XP:\s+(\d)', text).group(1)

'1'

In [57]:
faction = get_faction(gator).lower().strip()
    
first_faction = faction.split(' ')[0]
    
gator.find_all('div', class_=f'card-text border-{first_faction}')[1].text.replace('\n', '')

'Deck Size: 30.Deckbuilding Options: Mystic cards () level 0-5, Blessed cards level 0-3, Neutral cards level 0-5.Deckbuilding Requirements (do not count toward deck size): The Codex of Ages, Serpents of Yig, 1 random basic weakness.Bonus Experience: You begin the campaign with 5 additional experience (does not affect the number of weaknesses you must take in Standalone Mode).'

In [72]:
get_icons(asset)

'COMBAT AGILITY'

NameError: name 'asset' is not defined

In [None]:
def get_value_set(col, frame=None):
    '''Takes in a column name as a string (and optionally the DataFrame to read;
       defaults to the notebook-level df)
       Returns the set of distinct whitespace-separated values found in that column

       Missing entries are skipped and the DataFrame is left unmodified'''

    if frame is None:

        # fall back to the DataFrame the notebook keeps in the global name df
        frame = df

    values = set()

    # each cell holds several values merged into one string, e.g. 'asset accessory'
    for merged_value in frame[col].dropna():

        # split() with no argument also discards empty pieces from repeated spaces
        values.update(str(merged_value).split())

    return values
    
    



In [None]:
def clean_assets_traits(value):
    
    