In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
def get_soup(url, timeout=30):
    """
    Download a webpage and parse it into a BeautifulSoup object.

    Originally written for rocketdogrescue.com listings, but nothing here is
    site specific: it works for any page that can be fetched with a GET.

    param:
        url: the link to the webpage to download.
        timeout: seconds to wait for the server before giving up
            (passed straight to requests.get).
    return:
        soup: the BeautifulSoup object holding the parsed page, or None when
            the request could not be completed (the error is printed).
    """
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, headers={'User-Agent': "PuppyLover"},
                                timeout=timeout)
    # Connection failures and timeouts are RequestException subclasses, not
    # ValueError; ValueError is kept so previously handled errors still are.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(str(e))
        return None
    return BeautifulSoup(response.text, "html.parser")

In [3]:
# Fetch and parse the Pocket Gems "about" page that lists the staff cards.
url = 'https://pocketgems.com/about/'
# NOTE: soup is None if get_soup printed an error instead of returning a page.
soup = get_soup(url)

### HTML Page for Dixin

<figure class="item" template="">
<div class="image" style="background-image: url(https://s3-us-west-2.amazonaws.com/pocketgems-assets/2018/09/21235721/Dixin_Yan.jpg);"></div>
<div class="user-popup">
<div class="arrow"></div>
<img alt="Close icon" class="close-icon" src="https://pocketgems.com/wp-content/themes/pocketgems/assets/images/x@2x.png"/>
<h4>Background</h4>
<p><p><span data-sheets-userformat='{"2":513,"3":{"1":0},"12":0}' data-sheets-value='{"1":2,"2":"MS Data Science from USF. BS Commerce from UVA. Love puzzles and dogs, plus bubble tea. Was in a completely different industry before joining PG:)"}'>MS Data Science from USF. BS Commerce from UVA. Love puzzles and dogs, plus bubble tea. Was in a completely different industry before joining PG:)</span></p>
</p>
<h4>Fave old school video game</h4>
<p><p><span data-sheets-userformat='{"2":513,"3":{"1":0},"12":0}' data-sheets-value='{"1":2,"2":"Monopoly"}'>Monopoly</span></p>
</p>
<h4>If I could be a game character</h4>
<p><p><span data-sheets-userformat='{"2":513,"3":{"1":0},"12":0}' data-sheets-value="{&quot;1&quot;:2,&quot;2&quot;:&quot;Mario's Cap&quot;}">Mario’s Cap</span></p>
</p>
</div>
<figcaption>
<strong>Dixin</strong> Roller Coaster Terminator                        </figcaption>
</figure>

In [4]:
# Take the first <figure class="item"> card to inspect its markup
# (raises IndexError if the page has no such card).
terry = soup.find_all("figure", {"class": "item"})[0]

In [5]:
# Show the raw HTML of that card so the tag layout can be read off.
terry

<figure class="item" template="">
<div class="image" style="background-image: url(https://pocketgems.com/wp-content/uploads/2017/02/pocketgemsdanielterry-116x116.jpg);"></div>
<div class="user-popup">
<div class="arrow"></div>
<img alt="Close icon" class="close-icon" src="https://pocketgems.com/wp-content/themes/pocketgems/assets/images/x@2x.png"/>
<h4>Background</h4>
<p><p>Cornell Computer Science, Stanford MBA; Machine learning research (fast object detection techniques using AdaBoost derivatives; licensed to Like.com, sold to Google), VP Engineering at Smartleaf (large RoR SAAS application).</p>
</p>
<h4>Fave old school video game</h4>
<p><p>Super Mario Bros, duh</p>
</p>
<h4>If I could be a game character</h4>
<p><p>Peter Pepper in BurgerTime</p>
</p>
</div>
<figcaption>
<strong>Daniel Terry</strong> Co-Founder / CCO &amp; Executive Chairman                        </figcaption>
</figure>

In [27]:
def scrape_summary(soup):
    """
    Scrape the staff summary info from a parsed page.
    Designed for Pocket Gems.

    input:
        soup: beautifulsoup of the webpage
    return:
        df: DataFrame with one row per successfully parsed staff card and the
            columns name, title, background, fav_game, game_char.
            Cards that cannot be parsed are printed and left out.
    """
    columns = ['name', 'title', 'background', 'fav_game', 'game_char']
    records = []

    for card in soup.find_all("figure", {"class": "item"}):
        try:
            # The name is the <strong> inside the caption; searching the whole
            # figure could pick up a <strong> from the popup text instead.
            caption = card.figcaption
            name = caption.strong.text

            # On the sampled cards each answer is wrapped as <p><p>...</p></p>,
            # so the inner paragraphs sit at the odd positions.
            paragraphs = card.find_all("p")
            background = paragraphs[1].text
            fav_game = paragraphs[3].text
            game_char = paragraphs[5].text

            # The caption text is "<name> <title>"; drop the name tokens and
            # what is left is the title.
            name_title = caption.text.split()
            for n in name.split():
                name_title.remove(n)
            title = " ".join(name_title)
        # Missing tag -> AttributeError, too few <p> -> IndexError,
        # name token absent from the caption -> ValueError.
        except (AttributeError, IndexError, ValueError):
            # Print the HTML info if there is something wrong
            print(card)
            continue

        records.append({'name': name, 'title': title, 'background': background,
                        'fav_game': fav_game, 'game_char': game_char})

    # Passing columns keeps the column order stable even with no records.
    df = pd.DataFrame(records, columns=columns)

    return df

In [28]:
# Build the staff table; any card that fails to parse is printed below
# instead of being added to the DataFrame.
df = scrape_summary(soup)

<figure class="item" template="">
<div class="image" style="background-image: url(https://s3-us-west-2.amazonaws.com/pocketgems-assets/2018/07/20032106/andykorzik_headshot.jpg);"></div>
<div class="user-popup">
<div class="arrow"></div>
<img alt="Close icon" class="close-icon" src="https://pocketgems.com/wp-content/themes/pocketgems/assets/images/x@2x.png"/>
<h4>Background</h4>
<p><p><span data-sheets-userformat='{"2":328193,"3":[null,0],"12":0,"19":1,"21":0}' data-sheets-value='{"1":2,"2":"Originally got into making games by creating Counter-Strike maps in high school. Earned a BS from Georgia Tech and Masters from Carnegie Mellon. I moved west in 2011 have been making games in the bay area since.\n\nHobbies include: RPGs, climbing, soap making, watching Survivor."}'>Originally got into making games by creating Counter-Strike maps in high school. Earned a BS from Georgia Tech and Masters from Carnegie Mellon. I moved west in 2011 have been making games in the bay area since.</span></p

In [29]:
# Preview the first rows to sanity-check the scraped columns.
df.head()

Unnamed: 0,name,title,background,fav_game,game_char
0,Daniel Terry,Co-Founder / CCO & Executive Chairman,"Cornell Computer Science, Stanford MBA; Machin...","Super Mario Bros, duh",Peter Pepper in BurgerTime
1,Harlan Crystal,Co-Founder / CTO,Born and raised in NYC. BA in Computer Science...,Marble Madness,Magus from Chrono Trigger
2,Fletcher,I break games,BA in Business Management and Economics from U...,Ahh there’s so many but top 3: New Zealand Sto...,Duke Nukem
3,David,Backend Hacker,Stanford MSCS; open-source enthusiast (Virtual...,"Run, and then run some more",Fomor
4,Jeff,Pixel Pusher,"First there were only 2 colors, then 4, then 1...",Temple of Apshai,My Tauren Druid


In [30]:
# Save the table to the working directory; index=False leaves the row index
# out of the file.
df.to_csv('pocket_gems.csv', index=False)