# Get Rugby Data
Using requests and BeautifulSoup to scrape rugbydata.com

In [96]:
"""
Import required packages
"""
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import html5lib

In [97]:
# Set first Url, then generalise as a function to pass in multiple and loop
url = "http://www.rugbydata.com/superrugby/hurricanes/gamesplayed/"

In [98]:
# Get request
r = requests.get(url)

In [99]:
page = r.text
soup = bs(page)

In [100]:
table = soup.find_all('table')
rows = table[0].find_all('tr')[1:]

In [101]:
data = {
    'Date' : [],
    'TeamA' : [],
    'Score' : [],
    'TeamB' : [],
}

In [102]:
for row in rows:
    cols = row.find_all('td')
    data['Date'].append(cols[0].get_text())
    data['TeamA'].append(cols[1].get_text())
    data['Score'].append(cols[2].get_text())
    data['TeamB'].append(cols[3].get_text())

In [103]:
# print data

In [104]:
game_data = pd.DataFrame(data)
game_data.head()

Unnamed: 0,Date,Score,TeamA,TeamB
0,15 Mar 2014,60 - 2760 - 27Hurricanes vs Cheetahs,Hurricanes,Cheetahs
1,07 Mar 2014,21 - 2921 - 29Hurricanes vs Brumbies,Hurricanes,Brumbies
2,06 Jul 2013,44 - 4944 - 49Hurricanes vs Highlanders,Hurricanes,Highlanders
3,16 May 2013,12 - 1712 - 17Hurricanes vs Chiefs,Hurricanes,Chiefs
4,26 Apr 2013,16 - 1816 - 18Hurricanes vs Stormers,Hurricanes,Stormers


In [105]:
game_data.to_csv('hurricanes')

In [106]:
"""
Now we look for a more overview of the team to calculate winning proportions and
get other statistics that are valuable.
"""

header = soup.find_all('div', {'class' : 'rdfactheader'})
detail = soup.find_all('div', {'class' : 'rdfactdetail'})
print header[0].text
print detail[0].ext

Games Played
None


In [107]:
team_data = {
    "GamesPlayed" : detail[0].get_text(),
    "GamesWon" : detail[1].get_text(),
    "GamesLost" : detail[2].get_text(),
    "LongestWinningStreak" : detail[3].get_text(),
    "LongestLosingStreak" : detail[4].get_text(),
    "TeamsBeaten" : detail[5].get_text(),
    "TeamsBeatenBy" : detail[6].get_text(),
    "TeamsDrawnWith" : detail[7].get_text(),
    "GroundPlayedAt" : detail[8].get_text(),
    "LargestPointsFor" : detail[9].get_text(),
    "LargestPointsAgainst" : detail[10].get_text(),
    "LargestWinningMargin" : detail[11].get_text(),
    "LargestLosingMargin" : detail[12].get_text(),
    "TotalPointsFor" : detail[13].get_text(),
    "AvgPointsFor" : detail[14].get_text(),
    "TotalPointsAgainst" : detail[15].get_text(),
    "AvgePointsAgainst" : detail[16].get_text(),
    "TotalPointsDifference" : detail[17].get_text(),
    "AvgPointsDifference" : detail[18].get_text()    
}

In [108]:
print team_data

{'LongestWinningStreak': u'7', 'GroundPlayedAt': u'13', 'TotalPointsDifference': u'6377', 'TeamsDrawnWith': u'15', 'TotalPointsFor': u'49', 'LargestPointsAgainst': u'0', 'AvgPointsDifference': u'25.11', 'TeamsBeaten': u'8', 'LongestLosingStreak': u'7', 'GamesWon': u'131', 'TeamsBeatenBy': u'15', 'GamesLost': u'116', 'TotalPointsAgainst': u'6539', 'LargestWinningMargin': u'66', 'LargestLosingMargin': u'60', 'AvgePointsAgainst': u'25.74', 'GamesPlayed': u'254', 'AvgPointsFor': u'-53', 'LargestPointsFor': u'4'}


In [109]:
panda_team_data = pd.DataFrame(team_data, index = ["Hurricanes"])

In [110]:
panda_team_data

Unnamed: 0,AvgPointsDifference,AvgPointsFor,AvgePointsAgainst,GamesLost,GamesPlayed,GamesWon,GroundPlayedAt,LargestLosingMargin,LargestPointsAgainst,LargestPointsFor,LargestWinningMargin,LongestLosingStreak,LongestWinningStreak,TeamsBeaten,TeamsBeatenBy,TeamsDrawnWith,TotalPointsAgainst,TotalPointsDifference,TotalPointsFor
Hurricanes,25.11,-53,25.74,116,254,131,13,60,0,4,66,7,7,8,15,15,6539,6377,49


In [111]:
# confirm successful
def check_successful(response):
    """
    Function to check if the response is successful or not.
    """
    if response.status_code == 200:
        print "Successful Url!"
    else:
        print "Check Url"
        
print check_successful(r)

Successful Url!
None


In [112]:
def get_team_data(url, team_name):
    r = requests.get(url)
    page = r.text
    soup = bs(page)
    
    header = soup.find_all('div', {'class' : 'rdfactheader'})
    detail = soup.find_all('div', {'class' : 'rdfactdetail'})
    
    team_data = {
        "GamesPlayed" : detail[0].get_text(),
        "GamesWon" : detail[1].get_text(),
        "GamesLost" : detail[2].get_text(),
        "GamesDrawn" : detail[3].get_text(),
        "LongestWinningStreak" : detail[4].get_text(),
        "LongestLosingStreak" : detail[5].get_text(),
        "TeamsPlayed" : detail[6].get_text(),
        "TeamsBeaten" : detail[7].get_text(),
        "TeamsBeatenBy" : detail[8].get_text(),
        "TeamsDrawnWith" : detail[9].get_text(),
        "GroundPlayedAt" : detail[10].get_text(),
        "LargestPointsFor" : detail[11].get_text(),
        "LargestPointsAgainst" : detail[12].get_text(),
        "LargestWinningMargin" : detail[13].get_text(),
        "LargestLosingMargin" : detail[14].get_text(),
        "TotalPointsFor" : detail[15].get_text(),
        "AvgPointsFor" : detail[16].get_text(),
        "TotalPointsAgainst" : detail[17].get_text(),
        "AvgePointsAgainst" : detail[18].get_text(),
        "TotalPointsDifference" : detail[19].get_text(),
        "AvgPointsDifference" : detail[20].get_text()    
    }
    panda_team_data = pd.DataFrame(team_data, index = [team_name])
    panda_team_data.to_csv(team_name + ".csv")
    

In [113]:
team_url = {
    "Hurricanes" : "http://www.rugbydata.com/superrugby/hurricanes/gamesplayed/",
    "Blues" : "http://www.rugbydata.com/superrugby/blues/gamesplayed/",
    "Brumbies" : "http://www.rugbydata.com/superrugby/brumbies/gamesplayed/",
    "Bulls" : "http://www.rugbydata.com/superrugby/bulls/gamesplayed/",
    "Cheetahs" : "http://www.rugbydata.com/superrugby/cheetahs/gamesplayed/",
    "Chiefs" : "http://www.rugbydata.com/superrugby/chiefs/gamesplayed/",
    "Crusaders" : "http://www.rugbydata.com/superrugby/crusaders/gamesplayed/",
    "Highlanders" : "http://www.rugbydata.com/superrugby/highlanders/gamesplayed/",
    "Lions" : "http://www.rugbydata.com/superrugby/lions/gamesplayed/",
    "Rebels" : "http://www.rugbydata.com/superrugby/melbournerebels/gamesplayed/",
    "Reds" : "http://www.rugbydata.com/superrugby/reds/gamesplayed/",
    "Sharks" : "http://www.rugbydata.com/superrugby/sharks/gamesplayed/",
    "Kings" : "http://www.rugbydata.com/superrugby/southernkings/gamesplayed/",
    "Stormers" : "http://www.rugbydata.com/superrugby/stormers/gamesplayed/",
    "Stormers" : "http://www.rugbydata.com/superrugby/stormers/gamesplayed/",
    "Waratahs" : "http://www.rugbydata.com/superrugby/waratahs/gamesplayed/",
    "Force" : "http://www.rugbydata.com/superrugby/westernforce/gamesplayed/"
}

In [114]:
for key, value in team_url.iteritems():
    get_team_data(value, key)