# Web Scraping
### This notebook provides the source code to scrape the runs scored each year by all ODI players from 1970 to 2019. The runs are then accumulated year over year, so each value represents the total runs scored by the cricketer up to and including that year.
### Data Source: http://www.howstat.com/cricket/Statistics/Players/PlayerMenu.asp

In [1]:
# import necessary modules
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import string
from datetime import datetime
import numpy as np

In [2]:
# Empty frame: two identifying columns followed by one column per year, 1970 to 2019.
yrs = list(range(1970, 2020))
years = pd.DataFrame(columns=yrs)
identity_columns = pd.DataFrame(columns=['Cricketer', 'Nationality'])
cricket = pd.concat([identity_columns, years], axis=1, sort=True)
cricket.head()

Unnamed: 0,Cricketer,Nationality,1970,1971,1972,1973,1974,1975,1976,1977,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019


We will make use of the alphabetical lists of cricketers (http://www.howstat.com/cricket/Statistics/Players/PlayerList.asp?Group=A), replacing the last letter of this URL with each letter of the English alphabet from A to Z in turn. Let us first look at the source of this page using Beautiful Soup.

In [3]:
# Download the player list for the letter "A" and parse it so its markup can be inspected.
r = requests.get(
    "http://www.howstat.com/cricket/Statistics/Players/PlayerList.asp?Group=A"
)
soup = BeautifulSoup(markup=r.text, features='html5lib')

In [4]:
# Show the indented markup to find the tags that hold the player links.
page_markup = soup.prettify()
print(page_markup)

<html>
 <head>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <title>
   HowSTAT! List of International Cricketers
  </title>
  <meta content="Cricket - Alphabetical list of international players" name="description"/>
  <link href="../../styles/howstat.css" rel="stylesheet"/>
  <script src="../../includes/JQuery.js" type="text/javascript">
  </script>
  <script type="text/javascript">
   <!--
	jQuery(document).ready(function()
	{
		var test = document.getElementById("test");
		var country = document.getElementById("cboCountry");
		var d = new Date();
		d = d.getTime();
		if (jQuery('#reloadValue').val().length == 0)
		{
						jQuery('#reloadValue').val(d);
						jQuery('body').show();
		}
		else
		{
						jQuery('#reloadValue').val('');
						location.reload();
		}
	});
  
	function btnFindPlayer_Click() 
	{
    var Player = document.frmStat.txtPlayer.value;
    var len = Player.length;
    var i;
    var ValidChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ'-, abcdefghijklmnopqrstuvwxy

Now, let's look at the source code of the page http://www.howstat.com/cricket/Statistics/Players/PlayerYears_ODI.asp?PlayerID=2601, which displays the runs made by a cricketer in each year and is addressed by the unique PlayerID in the URL. Our task is then to extract the PlayerID of every player who has played an ODI match.

In [5]:
# Download the year-by-year ODI page of one sample player (PlayerID 2601) and parse it.
r = requests.get(
    "http://www.howstat.com/cricket/Statistics/Players/PlayerYears_ODI.asp?PlayerID=2601"
)
soup = BeautifulSoup(markup=r.text, features='html5lib')

In [6]:
# Show the indented markup to find the tags that hold the years and the runs.
page_markup = soup.prettify()
print(page_markup)

<html>
 <head>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <title>
   HowSTAT! ODI Cricket - Aamer Hameed - Performance Analysis by Year
  </title>
  <meta content=" - Summary of performances in each year - ODI Cricket" name="description"/>
  <link href="../../styles/howstat.css" rel="stylesheet"/>
  <script src="../../includes/javascript-tabs.js" type="text/javascript">
  </script>
 </head>
 <body marginheight="0" marginwidth="0">
  <table border="0" cellpadding="0" cellspacing="0" height="100%" width="100%">
   <tbody>
    <tr>
     <td bgcolor="black" colspan="3" height="17" valign="top">
      <table border="0" cellpadding="1" cellspacing="0" width="100%">
       <tbody>
        <tr>
         <td width="160">
         </td>
         <td align="left" class="TextWhite8" valign="middle">
          <a name="TopOfPage">
          </a>
          <a class="TextWhite8" href="http://www.howstat.com/cricket/AboutUs.asp">
           About Us
          </a>
          |
         

Once we have identified the tags to be used to extract data, we can begin writing functions.

In [7]:
def get_player_details(url):
    """
    Scrape one alphabetical player-list page and register every ODI player on it.

    For each player link whose table row is followed by a link to ODI statistics,
    a row holding the player's name and nationality is added to the global
    ``cricket`` dataframe, and ``get_player_stats`` is then called to fill in
    that player's yearly runs.

    Parameters
    ----------
    url : str
        Address of the player-list page for one starting letter.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    global cricket
    # a timeout keeps one stalled connection from hanging the whole crawl
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html5lib")
    # pattern to match PlayerID
    pattern = re.compile('ID=')
    # pattern to check if ODI scores are present, i.e., the player has played ODIs
    odi_pattern = re.compile("ODI")

    for tag in soup.find_all("a", attrs={"class": "LinkNormal"}):
        href = tag.attrs.get("href")
        # tag.string is None when the link wraps nested markup instead of plain text
        if href is None or tag.string is None or tag.string.isdigit():
            continue
        id_match = pattern.search(href)
        if id_match is None:
            continue

        # the four cells after the name link are: born, country, tests, ODI
        try:
            country = tag.find_next("td").find_next("td")
            odi = country.find_next("td").find_next("td")
            odi_link = odi.find_next("a")
        except AttributeError:
            # a find_next call returned None: the row is incomplete, skip it
            continue
        if odi_link is None or odi_link.string is None:
            continue
        if not odi_pattern.search(odi_link.attrs.get("href", "")):
            continue

        # plain str copy: a bs4 string keeps a reference to the whole parse tree
        player = str(tag.string)
        nationality = (country.string or "").strip()
        # everything after "ID=" in the link is the PlayerID
        player_id = href[id_match.end():]

        # update player name and nationality in the dataframe
        # (DataFrame.append was removed in pandas 2.0, pd.concat replaces it;
        #  sort=False keeps the existing column order, ignore_index keeps row labels unique)
        new_row = pd.DataFrame({'Cricketer': player, 'Nationality': nationality}, index=[0])
        cricket = pd.concat([cricket, new_row], ignore_index=True, sort=False)
        # call another function to update runs scored by a player with a unique player_id
        get_player_stats(player, player_id)

In [8]:
def get_player_stats(player, player_id):
    """
    Fetch a player's year-by-year ODI page and store the runs in ``cricket``.

    Parameters
    ----------
    player : str
        Name exactly as stored in the 'Cricketer' column of the global
        ``cricket`` dataframe.
    player_id : str
        The PlayerID appended to the page address.

    Notes
    -----
    Only the last row whose 'Cricketer' value equals ``player`` is updated, so
    an earlier player with the same name keeps their own figures. Nothing is
    fetched when no row matches. A year whose runs entry is not a whole number
    is left untouched.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    global cricket
    # restrict the update to the most recently added row carrying this name
    name_matches = (cricket['Cricketer'] == player).to_numpy()
    if not name_matches.any():
        return
    row_mask = name_matches.copy()
    row_mask[:name_matches.nonzero()[0][-1]] = False

    howstat = "http://www.howstat.com/cricket/Statistics/Players/PlayerYears_ODI.asp?PlayerID="+player_id
    # a timeout keeps one stalled connection from hanging the whole crawl
    r = requests.get(howstat, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html5lib')
    # start with year 1970
    year = 1970
    # None until the first year link is seen, so earlier cells are ignored
    runs = None
    for tag in soup.find_all(["a", "td"]):
        if tag.attrs.get("class") == ["LinkNormal"]:
            label = (tag.string or "").strip()
            if not label.isdigit():
                # not a year link: stop collecting cells until the next year link
                runs = None
                continue
            # if next year is less than previous year then break the loop
            if int(label) < year:
                break
            year = int(label)
            runs = []
        elif tag.attrs.get("align") == "right" and runs is not None:
            runs.append((tag.string or "").strip())
            if len(runs) == 8:
                try:
                    # 8th entry in the runs list is the actual Runs
                    value = int(runs[7])
                except ValueError:
                    # entry is not a whole number, leave this year empty
                    continue
                cricket.loc[row_mask, year] = value

### Let us now begin scraping the data

In [9]:
# Time the whole crawl: record the clock before and after visiting every list page.
list_url_prefix = "http://www.howstat.com/cricket/Statistics/Players/PlayerList.asp?Country=ALL&Group="
start_time = datetime.now()

# one list page per capital letter, A through Z
for letter in string.ascii_uppercase:
    cric_url = list_url_prefix + letter
    get_player_details(cric_url)
    print("********** Letter", letter, "done **********")

end_time = datetime.now()

********** Letter A done **********
********** Letter B done **********
********** Letter C done **********
********** Letter D done **********
********** Letter E done **********
********** Letter F done **********
********** Letter G done **********
********** Letter H done **********
********** Letter I done **********
********** Letter J done **********
********** Letter K done **********
********** Letter L done **********
********** Letter M done **********
********** Letter N done **********
********** Letter O done **********
********** Letter P done **********
********** Letter Q done **********
********** Letter R done **********
********** Letter S done **********
********** Letter T done **********
********** Letter U done **********
********** Letter V done **********
********** Letter W done **********
********** Letter X done **********
********** Letter Y done **********
********** Letter Z done **********


In [10]:
# Convert the elapsed crawl time from seconds to minutes and report it.
elapsed = end_time - start_time
minutes = elapsed.total_seconds() / 60
print("Total time taken in minutes is:", minutes)

Total time taken in minutes is: 41.145492


In [11]:
# (rows, columns) of the scraped frame
frame_shape = cricket.shape
print("Number of observations and features respectively:", frame_shape)

Number of observations and features respectively: (2547, 52)


In [12]:
# remove duplicate rows first, then renumber the rows,
# so the final index has no gaps where a duplicate was dropped
cricket = cricket.drop_duplicates().reset_index(drop=True)
print("Number of observations and features respectively:", cricket.shape)
cricket.head()

Number of observations and features respectively: (2547, 52)


Unnamed: 0,Cricketer,Nationality,1970,1971,1972,1973,1974,1975,1976,1977,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Aamer Hameed,Pakistan,,,,,,,,0.0,...,,,,,,,,,,
1,Aamer Hanif,Pakistan,,,,,,,,,...,,,,,,,,,,
2,Aamer Malik,Pakistan,,,,,,,,,...,,,,,,,,,,
3,Aamer Yamin,Pakistan,,,,,,,,,...,,,,,,63.0,,,32.0,
4,Aamir Nazir,Pakistan,,,,,,,,,...,,,,,,,,,,


In [13]:
# A year with no recorded runs counts as 0 runs.
cricket = cricket.replace(np.nan, 0)
cricket.head()

Unnamed: 0,Cricketer,Nationality,1970,1971,1972,1973,1974,1975,1976,1977,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Aamer Hameed,Pakistan,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Aamer Hanif,Pakistan,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Aamer Malik,Pakistan,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Aamer Yamin,Pakistan,0,0,0,0,0,0,0,0,...,0,0,0,0,0,63,0,0,32,0
4,Aamir Nazir,Pakistan,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Turn yearly runs into running totals: walking left to right from the third
# column, add each column to the one that follows it.
column_labels = list(cricket.columns)
for previous_label, current_label in zip(column_labels[2:], column_labels[3:]):
    cricket[current_label] = cricket[current_label] + cricket[previous_label]

cricket.head(10)

Unnamed: 0,Cricketer,Nationality,1970,1971,1972,1973,1974,1975,1976,1977,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Aamer Hameed,Pakistan,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Aamer Hanif,Pakistan,0,0,0,0,0,0,0,0,...,89,89,89,89,89,89,89,89,89,89
2,Aamer Malik,Pakistan,0,0,0,0,0,0,0,0,...,556,556,556,556,556,556,556,556,556,556
3,Aamer Yamin,Pakistan,0,0,0,0,0,0,0,0,...,0,0,0,0,0,63,63,63,95,95
4,Aamir Nazir,Pakistan,0,0,0,0,0,0,0,0,...,13,13,13,13,13,13,13,13,13,13
5,Aamir Sohail,Pakistan,0,0,0,0,0,0,0,0,...,4780,4780,4780,4780,4780,4780,4780,4780,4780,4780
6,Aaqib Javed,Pakistan,0,0,0,0,0,0,0,0,...,267,267,267,267,267,267,267,267,267,267
7,Aarif Sheikh,Nepal,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,19,94
8,"Aaron, V R",India,0,0,0,0,0,0,0,0,...,0,6,6,6,8,8,8,8,8,8
9,"Ababu, J",Kenya,0,0,0,0,0,0,0,0,...,29,29,29,29,29,29,29,29,29,29


In [15]:
# Write the final table to a CSV file in the working directory.
cricket.to_csv(path_or_buf="ODI data.csv")