In [90]:
import json
import re
import requests
import scrapy

In [91]:
headers = {'User-Agent': 'UNC Journo Class'}

In [92]:
base_url = 'http://goheels.com'
url = base_url + '/roster.aspx?path=baseball'

In [93]:
resp = requests.get(url, headers=headers)

In [94]:
body_str = resp.content.decode('utf-8')

In [95]:
sel = scrapy.Selector(text=body_str)

In [96]:
table = sel.css('.sidearm-table')[0]

In [97]:
table

<Selector xpath="descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' sidearm-table ')]" data='<table class="sidearm-table sidearm-tabl'>

In [98]:
cols = table.css('th').xpath('string()').extract()

In [99]:
cols

['#',
 'Full Name',
 'Pos.',
 'Ht.',
 'Wt.',
 'Academic Year',
 'Hometown / High School']

In [100]:
rows = table.css('tr')[1:]

In [101]:
players = []
for r in rows:
    data = {}
    for i, d in enumerate(r.css('td')):
        a = d.css('a')
        if a:
            t = a.xpath('text()').extract()[0]
            data['href'] = a.xpath('@href').extract()[0]
        else:
            t = d.xpath('text()').extract()[0]
        data[cols[i]] = t
    players.append(data)

In [102]:
players

[{'#': '1',
  'Academic Year': 'Jr.',
  'Full Name': 'Brandon Riley',
  'Hometown / High School': 'Burlington, N.C. / Williams',
  'Ht.': '6-0',
  'Pos.': 'OF',
  'Wt.': '175',
  'href': '/roster.aspx?rp_id=14221'},
 {'#': '2',
  'Academic Year': 'Fr.',
  'Full Name': 'Satchel Jerzembeck',
  'Hometown / High School': 'Charlotte, N.C. / Providence',
  'Ht.': '5-10',
  'Pos.': 'IF',
  'Wt.': '150',
  'href': '/roster.aspx?rp_id=14226'},
 {'#': '3',
  'Academic Year': 'Jr.',
  'Full Name': 'Kyle Datres',
  'Hometown / High School': 'Williamsport, Pa. / Loyalsock Township',
  'Ht.': '6-0',
  'Pos.': '3B',
  'Wt.': '198',
  'href': '/roster.aspx?rp_id=14211'},
 {'#': '4',
  'Academic Year': 'So.',
  'Full Name': 'Brandon Martorano',
  'Hometown / High School': 'Marlboro, N.J. / Christian Brothers Academy',
  'Ht.': '6-2',
  'Pos.': 'C/OF',
  'Wt.': '187',
  'href': '/roster.aspx?rp_id=14219'},
 {'#': '5',
  'Academic Year': 'So.',
  'Full Name': 'Ashton McGee',
  'Hometown / High School': '

In [103]:
def fetch_bio(player):
    player_url = base_url + player['href']
    resp = requests.get(player_url, headers=headers)
    player_txt = resp.content.decode('utf-8')
    sel = scrapy.Selector(text=player_txt)
    player['sel'] = sel
    player['bio'] = sel.css('#sidearm-roster-player-bio').xpath('string()').extract()[0]
    player['img'] = sel.css('.sidearm-roster-player-image img').xpath('@src').extract()[0]

In [104]:
js_obj_rx = re.compile(r'.*?responsive-roster-bio\.ashx.*?(?P<obj>{.*?})')

In [105]:
def fetch_stats(player):
    text = player['sel'].xpath('string()').extract()[0]
    parts = text.split('$.getJSON("/services/')[1:]
    captured = js_obj_rx.findall(''.join(parts))
    clean_objs = []
    for obj_str in captured:
        if 'stats' not in obj_str:
            continue

        obj_str = obj_str.replace('{', '').replace('}', '')
        obj_str = obj_str.replace("'", '').replace('"', '')
        obj_pairs = obj_str.split(',')
        obj_pairs = [x.split(":") for x in obj_pairs]
        clean_pairs = []
        for pair in obj_pairs:
            clean_pairs.append(['"{}"'.format(p.strip()) for p in pair])
        colonized = [":".join(p) for p in clean_pairs]
        commas = ','.join(colonized)
        json_str = "{" + commas + "}"
        clean_objs.append(json.loads(json_str))
    
    player['stats_url'] = stats_url = (
        "http://goheels.com/services/responsive-roster-bio.ashx?"
        "type={type}&rp_id={rp_id}&path={path}&year={year}"
        "&player_id={player_id}"
    ).format(**clean_objs[0])
    
    print('Fetch stats', stats_url)
    resp = requests.get(stats_url, headers=headers)
    json_stats = json.loads(resp.content.decode("utf-8"))
    player['raw_stats'] = json_stats

In [106]:
for p in players:
    fetch_bio(p)
    fetch_stats(p)

Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14221&path=baseball&year=2018&player_id=3746
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14226&path=baseball&year=2018&player_id=0
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14211&path=baseball&year=2018&player_id=3736
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14219&path=baseball&year=2018&player_id=3760
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14220&path=baseball&year=2018&player_id=3761
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14228&path=baseball&year=2018&player_id=5394
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14229&path=baseball&year=2018&player_id=5395
Fetch stats http://goheels.com/services/responsive-roster-bio.ashx?type=stats&rp_id=14212&path=baseb

In [107]:
players[1]

{'#': '2',
 'Academic Year': 'Fr.',
 'Full Name': 'Satchel Jerzembeck',
 'Hometown / High School': 'Charlotte, N.C. / Providence',
 'Ht.': '5-10',
 'Pos.': 'IF',
 'Wt.': '150',
 'bio': '\r\n                        Biography\r\n                                                    \r\n                            Providence High School\r\nNo. 8 shortstop in North Carolina • 2017 Rawlings-Perfect Game Honorable Mention All-American • All-Atlantic Region Second Team • Led Providence to 2015 state championship • Coached by Danny Hignight.\r\n\r\nPersonal\r\nSon of former Tar Heel pitcher Mike Jerzembeck who was drafted by the New York Yankees in 1993 • Named after Satchel Paige • Son of Alison and Mike Jerzembeck • Born on July 7 • Has yet to declare a major at Carolina\r\n\r\n                        \r\n                    ',
 'href': '/roster.aspx?rp_id=14226',
 'img': '/images/2018/1/31/Jerzembeck_Satchel_bb_40.jpg?width=300',
 'raw_stats': {'career_stats': None,
  'current_stats': '',
  '

In [108]:
p = [p for p in players if p['Full Name'] == 'Tyler Baum'][0]

In [109]:
txt = p['raw_stats']['career_stats']

In [110]:
sel = scrapy.Selector(text=txt)

In [111]:
sel.css('section')

[<Selector xpath='descendant-or-self::section' data='<section>\r\n                <h5>Pitching '>,
 <Selector xpath='descendant-or-self::section' data='<section>\r\n                <h5>Hitting S'>]

In [112]:
#[x.pop('stats') for x in players]

In [113]:
def parse_stats(player):
    stats = {}
    for raw_key, raw_val in player['raw_stats'].items():
        txt = player['raw_stats'][raw_key]
        if not txt:
            print('Skipping {} for {}'.format(raw_key, player['Full Name']))
            continue
        sel = scrapy.Selector(text=txt)
        # Get all the tables
        for section in sel.css('section'):
            title = section.css('h5').xpath('string()').extract()[0]
            cols = section.css('tr')[0].css('th').xpath('string()').extract()
            print('NEW SECTION', title)
            print('COLS', cols)
            these_stats = []
            print('TRS', section.css('tr'))
            for r in section.css('tr')[1:]:
                print('row', r.xpath('string()').extract()[0].replace('\r', '').replace('\n', '').strip())
                s = {}
                for i, d in enumerate(r.css('td'), 0):
                    s[cols[i].lower()] = d.xpath('string()').extract()[0]
                yr = r.css('th').xpath('string()')
                if yr:
                    yr = yr.extract()[0]
                    if yr.lower() in ('total', 'season'):
                        print('SKIPPING...')
                        continue
                    print('THE YR IS', yr)
                    s['year'] = yr
                these_stats.append(s)
                print('THE STATS ARE', these_stats)
            existing = stats.get(raw_key, {})
            existing[title] = these_stats
            stats[raw_key] = existing
    player['stats'] = stats

In [114]:
p = [p for p in players if p['Full Name'] == 'Tyler Baum'][0]
parse_stats(p)

NEW SECTION Pitching Statistics
COLS ['Date', 'Opponent', 'W/L', 'IP', 'H', 'R', 'ER', 'BB', 'SO', '2B', '3B', 'HR', 'WP', 'BK', 'HBP', 'IBB', 'NP', 'SCORE', 'W', 'L', 'SV', 'G-ERA', 'S-ERA']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t<th scope="col" class="tex'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/17/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/24/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>3/2/2018</td>\r\n\t\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>3/9/2018</td>\r\n\t\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td></td>\r\n\t\t\t\t\t\t\t\t\t<td>T'>]
row 2/17/2018									 at  Bulls									W									5.1									4									2									2									1									10									1									0									0									1						

In [115]:
for p in players:
    parse_stats(p)

NEW SECTION Hitting Statistics
COLS ['Date', 'Opponent', 'W/L', 'GS', 'AB', 'R', 'H', 'RBI', '2B', '3B', 'HR', 'BB', 'IBB', 'SB', 'SBA', 'CS', 'HBP', 'SH', 'SF', 'GDP', 'K', 'AVG']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t<th scope="col" class="tex'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/16/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/17/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/18/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/20/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/21/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/23/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t

TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t<th scope="col" class="tex'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/16/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/17/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/18/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/20/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/21/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/23/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/24/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/25/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='desce

THE YR IS  St. John's
THE STATS ARE [{'date': '2/16/2018', 'opponent': 'L', 'w/l': '1', 'gs': '5', 'ab': '1', 'r': '1', 'h': '0', 'rbi': '0', '2b': '0', '3b': '0', 'hr': '0', 'bb': '0', 'ibb': '0', 'sb': '0', 'sba': '0', 'cs': '0', 'hbp': '0', 'sh': '0', 'sf': '1', 'gdp': '1', 'k': '.200', 'year': ' at  Bulls'}, {'date': '2/17/2018', 'opponent': 'W', 'w/l': '1', 'gs': '6', 'ab': '3', 'r': '3', 'h': '0', 'rbi': '0', '2b': '0', '3b': '0', 'hr': '0', 'bb': '0', 'ibb': '0', 'sb': '0', 'sba': '0', 'cs': '0', 'hbp': '0', 'sh': '0', 'sf': '0', 'gdp': '0', 'k': '.364', 'year': ' at  Bulls'}, {'date': '2/18/2018', 'opponent': 'W', 'w/l': '1', 'gs': '5', 'ab': '1', 'r': '1', 'h': '1', 'rbi': '0', '2b': '0', '3b': '0', 'hr': '0', 'bb': '0', 'ibb': '0', 'sb': '0', 'sba': '0', 'cs': '0', 'hbp': '0', 'sh': '0', 'sf': '0', 'gdp': '0', 'k': '.313', 'year': ' at  Bulls'}, {'date': '2/20/2018', 'opponent': 'L', 'w/l': '1', 'gs': '5', 'ab': '1', 'r': '3', 'h': '1', 'rbi': '2', '2b': '0', '3b': '0', 'hr':

row 2/27/2018									 High Point									W									3									1									2									0									1.000									0									0									0									0									0
THE YR IS  High Point
THE STATS ARE [{'date': '2/16/2018', 'opponent': 'L', 'w/l': '4', 'c': '1', 'po': '3', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}, {'date': '2/17/2018', 'opponent': 'W', 'w/l': '0', 'c': '0', 'po': '0', 'a': '0', 'e': '0.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}, {'date': '2/18/2018', 'opponent': 'W', 'w/l': '2', 'c': '1', 'po': '1', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}, {'date': '2/20/2018', 'opponent': 'L', 'w/l': '6', 'c': '3', 'po': '3', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' UNCW'}, {'date': '2/21/2018', 'opponent': 'L', 'w/l': '5', 'c': '2', 'po': '3', 'a': '0', 'e': '1.000', 'fld%': '0',

THE YR IS  LIBERTY
THE STATS ARE [{'date': '2/16/2018', 'opponent': 'L', 'w/l': '0', 'c': '0', 'po': '0', 'a': '0', 'e': '0.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}, {'date': '2/17/2018', 'opponent': 'W', 'w/l': '14', 'c': '11', 'po': '3', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '1', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}, {'date': '2/18/2018', 'opponent': 'W', 'w/l': '11', 'c': '10', 'po': '1', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '0', 'sba': '1', 'csb': '1', 'pb': '0', 'year': ' at  Bulls'}, {'date': '2/20/2018', 'opponent': 'L', 'w/l': '2', 'c': '2', 'po': '0', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' UNCW'}, {'date': '2/23/2018', 'opponent': 'L', 'w/l': '13', 'c': '12', 'po': '1', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '2', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  ECU'}, {'date': '2/24/2018', 'opponent': 'W', 'w/l': '0', 'c': '0', 'po': '0', 'a': 

THE YR IS  LIBERTY
THE STATS ARE [{'date': '2/16/2018', 'opponent': 'L', 'w/l': '1', 'gs': '5', 'ab': '0', 'r': '1', 'h': '0', 'rbi': '0', '2b': '0', '3b': '0', 'hr': '0', 'bb': '0', 'ibb': '0', 'sb': '0', 'sba': '0', 'cs': '0', 'hbp': '0', 'sh': '0', 'sf': '0', 'gdp': '4', 'k': '.200', 'year': ' at  Bulls'}, {'date': '2/18/2018', 'opponent': 'W', 'w/l': '1', 'gs': '3', 'ab': '0', 'r': '0', 'h': '0', 'rbi': '0', '2b': '0', '3b': '0', 'hr': '0', 'bb': '0', 'ibb': '0', 'sb': '0', 'sba': '0', 'cs': '0', 'hbp': '0', 'sh': '0', 'sf': '0', 'gdp': '2', 'k': '.125', 'year': ' at  Bulls'}, {'date': '2/20/2018', 'opponent': 'L', 'w/l': '1', 'gs': '3', 'ab': '0', 'r': '0', 'h': '0', 'rbi': '0', '2b': '0', '3b': '0', 'hr': '1', 'bb': '0', 'ibb': '0', 'sb': '0', 'sba': '0', 'cs': '0', 'hbp': '0', 'sh': '0', 'sf': '0', 'gdp': '1', 'k': '.091', 'year': ' UNCW'}, {'date': '2/21/2018', 'opponent': 'L', 'w/l': '1', 'gs': '3', 'ab': '1', 'r': '1', 'h': '0', 'rbi': '0', '2b': '0', '3b': '0', 'hr': '2', 'b

THE YR IS  at  Bulls
THE STATS ARE [{'date': '2/16/2018', 'opponent': 'L', 'w/l': '1', 'c': '1', 'po': '0', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}, {'date': '2/17/2018', 'opponent': 'W', 'w/l': '3', 'c': '1', 'po': '2', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}, {'date': '2/18/2018', 'opponent': 'W', 'w/l': '6', 'c': '5', 'po': '1', 'a': '0', 'e': '1.000', 'fld%': '1', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}]
row 2/21/2018									 St. John's									L									5									3									1									1									.800									0									0									0									0									0
THE YR IS  St. John's
THE STATS ARE [{'date': '2/16/2018', 'opponent': 'L', 'w/l': '1', 'c': '1', 'po': '0', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}, {'date': '2/17/2018', 'opponent': 'W', 'w/l': '3', 'c': '1', 'po': '

row 2/17/2018									 at  Bulls									W									0									1									0									0									0									0									0									0									0									0									0									0									0									0									0									0									0									1									.000
THE YR IS  at  Bulls
THE STATS ARE [{'date': '2/17/2018', 'opponent': 'W', 'w/l': '0', 'gs': '1', 'ab': '0', 'r': '0', 'h': '0', 'rbi': '0', '2b': '0', '3b': '0', 'hr': '0', 'bb': '0', 'ibb': '0', 'sb': '0', 'sba': '0', 'cs': '0', 'hbp': '0', 'sh': '0', 'sf': '0', 'gdp': '1', 'k': '.000', 'year': ' at  Bulls'}]
row 2/18/2018									 at  Bulls									W									0									1									0									0									0									0									0									0									0									0									0									0									0									0									0									0									0									0									.000
THE YR IS  at  Bulls
THE STATS ARE [{'date': '2/17/2018', 'opponent': 'W', 'w/l': '0', 'gs': '1', 'ab': '0', 'r': '0', 'h': '0', 'rbi': '0', '2b': '0', '3b': '0', 'hr': '0', 'bb': '0', 'ibb': '0', 'sb': '0', 'sba': 

THE YR IS  LIBERTY
THE STATS ARE [{'date': '2/16/2018', 'opponent': 'L', 'w/l': '0', 'c': '0', 'po': '0', 'a': '0', 'e': '0.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}, {'date': '2/17/2018', 'opponent': 'W', 'w/l': '1', 'c': '0', 'po': '1', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}, {'date': '2/18/2018', 'opponent': 'W', 'w/l': '0', 'c': '0', 'po': '0', 'a': '0', 'e': '0.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}, {'date': '2/20/2018', 'opponent': 'L', 'w/l': '0', 'c': '0', 'po': '0', 'a': '0', 'e': '0.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' UNCW'}, {'date': '2/21/2018', 'opponent': 'L', 'w/l': '1', 'c': '1', 'po': '0', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': " St. John's"}, {'date': '2/24/2018', 'opponent': 'W', 'w/l': '1', 'c': '1', 'po': '0', 'a': '0'

NEW SECTION Season Highs
COLS ['Statistic', 'Value', 'Opponent']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n                    <th class="tex'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Hits</'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Double'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Triple'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Home R'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Runs S'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Runs B'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Bases '>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Assist'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Putout'>]


NEW SECTION Fielding Statistics
COLS ['Date', 'Opponent', 'W/L', 'C', 'PO', 'A', 'E', 'FLD%', 'DP', 'SBA', 'CSB', 'PB', 'CI']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t<th scope="col" class="tex'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/16/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/17/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/18/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/20/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/21/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/23/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/24/2018</td>\r\n\t\t\t\t\t'>, <Sele

TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t<th scope="col" class="tex'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/16/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/17/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/18/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/20/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/21/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/23/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/24/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/25/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='desce

row 2/27/2018									 High Point									W									1									3									1									1									3									0									0									0									0									0									0									0									0									0									0									1									0									0									.371
THE YR IS  High Point
THE STATS ARE [{'date': '2/16/2018', 'opponent': 'L', 'w/l': '1', 'gs': '5', 'ab': '1', 'r': '2', 'h': '3', 'rbi': '0', '2b': '0', '3b': '1', 'hr': '0', 'bb': '0', 'ibb': '0', 'sb': '0', 'sba': '0', 'cs': '0', 'hbp': '0', 'sh': '0', 'sf': '0', 'gdp': '0', 'k': '.400', 'year': ' at  Bulls'}, {'date': '2/17/2018', 'opponent': 'W', 'w/l': '1', 'gs': '3', 'ab': '1', 'r': '2', 'h': '3', 'rbi': '0', '2b': '0', '3b': '0', 'hr': '1', 'bb': '0', 'ibb': '0', 'sb': '0', 'sba': '0', 'cs': '0', 'hbp': '0', 'sh': '1', 'sf': '0', 'gdp': '0', 'k': '.500', 'year': ' at  Bulls'}, {'date': '2/18/2018', 'opponent': 'W', 'w/l': '1', 'gs': '5', 'ab': '1', 'r': '1', 'h': '1', 'rbi': '1', '2b': '0', '3b': '0', 'hr': '0', 'bb': '0', 'ibb': '0', 'sb': '0', 

COLS ['Season', 'AVG', 'GP', 'GS', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SLG%', 'BB', 'HBP', 'SO', 'GDP', 'OB%', 'SF', 'SH', 'SB', 'PO', 'A', 'E', 'FLD%']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                <t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                  '>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                <t'>]
row 2018                                    .375                                    5                                    2                                    8                                    2                                    3                                    1                                    0                                    0                                    1                                    .500                                    0                                    0                                    1                         

THE YR IS  at  Bulls
THE STATS ARE [{'date': '2/17/2018', 'opponent': 'W', 'w/l': '1', 'c': '1', 'po': '0', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '1', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}]
row 2/24/2018									 vs  ECU									W									0									0									0									0									0.000									0									1									0									0									0
THE YR IS  vs  ECU
THE STATS ARE [{'date': '2/17/2018', 'opponent': 'W', 'w/l': '1', 'c': '1', 'po': '0', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '1', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}, {'date': '2/24/2018', 'opponent': 'W', 'w/l': '0', 'c': '0', 'po': '0', 'a': '0', 'e': '0.000', 'fld%': '0', 'dp': '1', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' vs  ECU'}]
row 3/2/2018									 LIBERTY									W									0									0									0									0									0.000									0									0									0									0									0
THE YR IS  LIBERTY
THE STATS ARE [{'date': '2/17/2018', 'opponent': 'W', 'w/l': '1', 'c': '1', 'po': '0', 'a'

THE YR IS  at  Bulls
THE STATS ARE [{'date': '2/17/2018', 'opponent': 'W', 'w/l': '0', 'c': '0', 'po': '0', 'a': '0', 'e': '0.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}]
row 2/21/2018									 St. John's									L									1									0									1									0									1.000									0									0									0									0									0
THE YR IS  St. John's
THE STATS ARE [{'date': '2/17/2018', 'opponent': 'W', 'w/l': '0', 'c': '0', 'po': '0', 'a': '0', 'e': '0.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': ' at  Bulls'}, {'date': '2/21/2018', 'opponent': 'L', 'w/l': '1', 'c': '0', 'po': '1', 'a': '0', 'e': '1.000', 'fld%': '0', 'dp': '0', 'sba': '0', 'csb': '0', 'pb': '0', 'year': " St. John's"}]
row 2/27/2018									 High Point									W									1									1									0									0									1.000									0									0									0									0									0
THE YR IS  High Point
THE STATS ARE [{'date': '2/17/2018', 'opponent': 'W', 'w/l': '0', 'c': '0'

NEW SECTION Pitching Statistics
COLS ['Date', 'Opponent', 'W/L', 'IP', 'H', 'R', 'ER', 'BB', 'SO', '2B', '3B', 'HR', 'WP', 'BK', 'HBP', 'IBB', 'NP', 'SCORE', 'W', 'L', 'SV', 'G-ERA', 'S-ERA']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t<th scope="col" class="tex'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/17/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/18/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/21/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>3/2/2018</td>\r\n\t\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td></td>\r\n\t\t\t\t\t\t\t\t\t<td>T'>]
row 2/17/2018									 at  Bulls									W									0.2									0									0									0									0									2									0									0									0									0								

THE STATS ARE [{'date': '2/17/2018', 'opponent': ' at  Bulls', 'w/l': 'W', 'ip': '1.0', 'h': '0', 'r': '0', 'er': '0', 'bb': '0', 'so': '2', '2b': '0', '3b': '0', 'hr': '0', 'wp': '0', 'bk': '0', 'hbp': '0', 'ibb': '0', 'np': '14', 'score': '12-5', 'w': '0', 'l': '0', 'sv': '0', 'g-era': '0.00', 's-era': '0.00'}, {'date': '2/20/2018', 'opponent': ' UNCW', 'w/l': 'L', 'ip': '4.1', 'h': '3', 'r': '1', 'er': '1', 'bb': '2', 'so': '7', '2b': '1', '3b': '0', 'hr': '0', 'wp': '0', 'bk': '1', 'hbp': '0', 'ibb': '0', 'np': '72', 'score': '4-5', 'w': '0', 'l': '0', 'sv': '0', 'g-era': '2.08', 's-era': '1.69'}, {'date': '2/25/2018', 'opponent': ' ECU', 'w/l': 'L', 'ip': '0.1', 'h': '1', 'r': '1', 'er': '0', 'bb': '1', 'so': '1', '2b': '0', '3b': '0', 'hr': '0', 'wp': '0', 'bk': '0', 'hbp': '0', 'ibb': '0', 'np': '13', 'score': '0-12', 'w': '0', 'l': '0', 'sv': '0', 'g-era': '0.00', 's-era': '1.59'}]
row 2/27/2018									 High Point									W									1.2									0									0									0									1		

row IP                        1.2                        vs. St. John's
THE STATS ARE [{'statistic': 'IP', 'value': '1.2', 'opponent': "vs. St. John's"}]
row Strikeouts                        3                        vs. ECU
THE STATS ARE [{'statistic': 'IP', 'value': '1.2', 'opponent': "vs. St. John's"}, {'statistic': 'Strikeouts', 'value': '3', 'opponent': 'vs. ECU'}]
row Walks (Low)                        0                        vs. High Point
THE STATS ARE [{'statistic': 'IP', 'value': '1.2', 'opponent': "vs. St. John's"}, {'statistic': 'Strikeouts', 'value': '3', 'opponent': 'vs. ECU'}, {'statistic': 'Walks (Low)', 'value': '0', 'opponent': 'vs. High Point'}]
row Hits (Low)                        0                        vs. High Point
THE STATS ARE [{'statistic': 'IP', 'value': '1.2', 'opponent': "vs. St. John's"}, {'statistic': 'Strikeouts', 'value': '3', 'opponent': 'vs. ECU'}, {'statistic': 'Walks (Low)', 'value': '0', 'opponent': 'vs. High Point'}, {'statistic': 'Hits (Low)'

NEW SECTION Pitching Statistics
COLS ['Season', 'ERA', 'W', 'L', 'APP', 'GS', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'BB', 'SO', '2B', '3B', 'HR', 'BF', 'BAVG', 'WP', 'HBP', 'BK', 'SFA', 'SHA']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                <t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                  '>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                <t'>]
row 2018                                    0.00                                    0                                    0                                    2                                    0                                    0                                    0                                    0                                    2.1                                    0                                    0                                    0                                    2                          

row Bases Stolen                        0                        vs. Louisville
THE STATS ARE [{'statistic': 'Hits', 'value': '0', 'opponent': 'vs. Louisville'}, {'statistic': 'Doubles', 'value': '0', 'opponent': 'vs. Louisville'}, {'statistic': 'Triples', 'value': '0', 'opponent': 'vs. Louisville'}, {'statistic': 'Home Runs', 'value': '0', 'opponent': 'vs. Louisville'}, {'statistic': 'Runs Scored', 'value': '0', 'opponent': 'vs. Louisville'}, {'statistic': 'Runs Batted In', 'value': '0', 'opponent': 'vs. Louisville'}, {'statistic': 'Bases Stolen', 'value': '0', 'opponent': 'vs. Louisville'}]
row Assists                        0                        vs. Louisville
THE STATS ARE [{'statistic': 'Hits', 'value': '0', 'opponent': 'vs. Louisville'}, {'statistic': 'Doubles', 'value': '0', 'opponent': 'vs. Louisville'}, {'statistic': 'Triples', 'value': '0', 'opponent': 'vs. Louisville'}, {'statistic': 'Home Runs', 'value': '0', 'opponent': 'vs. Louisville'}, {'statistic': 'Runs Scored', 'v

TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n                    <th class="tex'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Hits</'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Double'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Triple'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Home R'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Runs S'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Runs B'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Bases '>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Assist'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                        <td>Putout'>]
row Hits                        0                        vs. LIBE

NEW SECTION Hitting Statistics
COLS ['Season', 'AVG', 'GP', 'GS', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SLG%', 'BB', 'HBP', 'SO', 'GDP', 'OB%', 'SF', 'SH', 'SB', 'PO', 'A', 'E', 'FLD%']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                <t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                  '>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n                                <t'>]
row 2018                                    .333                                    4                                    0                                    3                                    1                                    1                                    1                                    0                                    0                                    0                                    .667                                    1                                    0                               

In [117]:
[p for p in players if p['Full Name'] == 'Tyler Baum'][0]
parse_stats(p)

NEW SECTION Pitching Statistics
COLS ['Date', 'Opponent', 'W/L', 'IP', 'H', 'R', 'ER', 'BB', 'SO', '2B', '3B', 'HR', 'WP', 'BK', 'HBP', 'IBB', 'NP', 'SCORE', 'W', 'L', 'SV', 'G-ERA', 'S-ERA']
TRS [<Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t<th scope="col" class="tex'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/16/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/18/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/20/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>2/25/2018</td>\r\n\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td>3/4/2018</td>\r\n\t\t\t\t\t\t'>, <Selector xpath='descendant-or-self::tr' data='<tr>\r\n\t\t\t\t\t\t\t\t\t<td></td>\r\n\t\t\t\t\t\t\t\t\t<td>T'>]
row 2/16/2018									 at  Bulls									L	

In [132]:
to_dump = [p.copy() for p in players]
for p in to_dump:
    p.pop('sel')
    for k in list(p.keys()):
        if 'raw' in k:
            p.pop(k)
with open('scraped_players.json', 'w') as f:
    json.dump(to_dump, f)

In [127]:
cat scraped_players.json | cut -c 1-100

[{"#": "1", "href": "/roster.aspx?rp_id=14221", "Full Name": "Brandon Riley", "Pos.": "OF", "Ht.": "


In [130]:
to_dump[scraped players]

{'#': '1',
 'Academic Year': 'Jr.',
 'Full Name': 'Brandon Riley',
 'Hometown / High School': 'Burlington, N.C. / Williams',
 'Ht.': '6-0',
 'Pos.': 'OF',
 'Wt.': '175',
 'bio': '\r\n                        Biography\r\n                                                    \r\n                            Dynamic outfielder who figures to be among the team leaders as a junior.\r\n\r\nSophomore Season (2017)\r\nHit .317 with seven homers, nine doubles and a team-high 52 RBIs •\xa0Also scored 46 runs, drew 37 walks and led the team with five triples, which is tied for 10th-most in UNC history for a single season • Appeared in all 63 games and started 62, primarily in right field • Stole 10 bases in 14 attempts • Had 21 multi-hit games, including a season-high three hits on eight occasions • Also drove in multiple runs 14 times, including a season-best four at ECU on March 22 • Followed up the ECU game with a big series against Miami, recording six hits and driving in four runs with a homer 