## Reading HTML tables

In [36]:
!pip install lxml



In [37]:
from io import StringIO

import pandas as pd

In [38]:
# A small HTML table with an explicit <thead>, used as parser input below.
html_string = """
<table>
    <thead>
        <tr>
            <th>Order date</th>
            <th>Region</th> 
            <th>Item</th>
            <th>Units</th>
            <th>Unit cost</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>1/6/2018</td>
            <td>East</td> 
            <td>Pencil</td>
            <td>95</td>
            <td>1.99</td>
        </tr>
        <tr>
            <td>1/23/2018</td>
            <td>Central</td> 
            <td>Binder</td>
            <td>50</td>
            <td>19.99</td>
        </tr>
        <tr>
            <td>2/9/2018</td>
            <td>Central</td> 
            <td>Pencil</td>
            <td>36</td>
            <td>4.99</td>
        </tr>
        <tr>
            <td>3/15/2018</td>
            <td>West</td> 
            <td>Pen</td>
            <td>27</td>
            <td>19.99</td>
        </tr>
    </tbody>
</table>
"""

# Wrap the literal markup in StringIO: passing a raw HTML string directly to
# read_html is deprecated in recent pandas releases and emits a FutureWarning.
dfs = pd.read_html(StringIO(html_string))
dfs

  dfs = pd.read_html(html_string)


[  Order date   Region    Item  Units  Unit cost
 0   1/6/2018     East  Pencil     95       1.99
 1  1/23/2018  Central  Binder     50      19.99
 2   2/9/2018  Central  Pencil     36       4.99
 3  3/15/2018     West     Pen     27      19.99]

`read_html` returns a list of DataFrames, one for each table it finds in the HTML.

In [39]:
# Number of tables parsed; this HTML contains a single <table>, so only one was returned.
len(dfs)

1

In [40]:
# Take the first (and, in this case, only) DataFrame from the returned list.
df = dfs[0]
df

Unnamed: 0,Order date,Region,Item,Units,Unit cost
0,1/6/2018,East,Pencil,95,1.99
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99
3,3/15/2018,West,Pen,27,19.99


#### Defining a header

Some tables don't have a header section (`<thead>` / `<th>` tags), so the column names end up as an ordinary data row.

In [41]:
# The same table as before, but without <thead>/<th>: the column names are
# just the first <tr> of ordinary <td> cells.
html_string = """
<table>
    <tr>
        <td>Order date</td>
        <td>Region</td> 
        <td>Item</td>
        <td>Units</td>
        <td>Unit cost</td>
    </tr>
    <tr>
        <td>1/6/2018</td>
        <td>East</td> 
        <td>Pencil</td>
        <td>95</td>
        <td>1.99</td>
    </tr>
    <tr>
        <td>1/23/2018</td>
        <td>Central</td> 
        <td>Binder</td>
        <td>50</td>
        <td>19.99</td>
    </tr>
    <tr>
        <td>2/9/2018</td>
        <td>Central</td> 
        <td>Pencil</td>
        <td>36</td>
        <td>4.99</td>
    </tr>
    <tr>
        <td>3/15/2018</td>
        <td>West</td> 
        <td>Pen</td>
        <td>27</td>
        <td>19.99</td>
    </tr>
</table>
"""

# Wrap the literal markup in StringIO: passing a raw HTML string directly to
# read_html is deprecated in recent pandas releases and emits a FutureWarning.
pd.read_html(StringIO(html_string))[0]

  pd.read_html(html_string)[0]


Unnamed: 0,0,1,2,3,4
0,Order date,Region,Item,Units,Unit cost
1,1/6/2018,East,Pencil,95,1.99
2,1/23/2018,Central,Binder,50,19.99
3,2/9/2018,Central,Pencil,36,4.99
4,3/15/2018,West,Pen,27,19.99


We can set the first line as the header:

In [42]:
# header=0 uses the first parsed row as the column labels.
# StringIO avoids the deprecated "literal HTML string" input to read_html.
pd.read_html(StringIO(html_string), header=0)[0]

  pd.read_html(html_string, header=0)[0]


Unnamed: 0,Order date,Region,Item,Units,Unit cost
0,1/6/2018,East,Pencil,95,1.99
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99
3,3/15/2018,West,Pen,27,19.99


### Parsing HTML tables from the web

In [43]:
# Page to scrape; read_html accepts a URL and fetches it itself.
# NOTE(review): judging by the URL this is the 2019 NBA per-game stats page — confirm it is still reachable.
html_url = "https://www.basketball-reference.com/leagues/NBA_2019_per_game.html"

In [44]:
# Fetch the page and parse every table on it, then show the parsed list
# together with how many tables were found.
nba_tables = pd.read_html(html_url)
table_count = len(nba_tables)
nba_tables, table_count

([      Rk        Player Pos Age   Tm   G  GS    MP   FG   FGA  ...   FT%  ORB  \
  0      1  Álex Abrines  SG  25  OKC  31   2  19.0  1.8   5.1  ...  .923  0.2   
  1      2    Quincy Acy  PF  28  PHO  10   0  12.3  0.4   1.8  ...  .700  0.3   
  2      3  Jaylen Adams  PG  22  ATL  34   1  12.6  1.1   3.2  ...  .778  0.3   
  3      4  Steven Adams   C  25  OKC  80  80  33.4  6.0  10.1  ...  .500  4.9   
  4      5   Bam Adebayo   C  21  MIA  82  28  23.3  3.4   5.9  ...  .735  2.0   
  ..   ...           ...  ..  ..  ...  ..  ..   ...  ...   ...  ...   ...  ...   
  729  528  Tyler Zeller   C  29  MEM   4   1  20.5  4.0   7.0  ...  .778  2.3   
  730  529    Ante Žižić   C  22  CLE  59  25  18.3  3.1   5.6  ...  .705  1.8   
  731  530   Ivica Zubac   C  21  TOT  59  37  17.6  3.6   6.4  ...  .802  1.9   
  732  530   Ivica Zubac   C  21  LAL  33  12  15.6  3.4   5.8  ...  .864  1.6   
  733  530   Ivica Zubac   C  21  LAC  26  25  20.2  3.8   7.2  ...  .733  2.3   
  
       DRB  T

In [45]:
# Keep the first parsed table and preview its first five rows.
nba = nba_tables[0]
nba.head(n=5)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,...,0.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,2,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,...,0.7,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,3,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,...,0.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,4,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,...,0.5,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,5,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,...,0.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9


### Complex example

In [46]:
import requests

html_url = "https://en.wikipedia.org/wiki/The_Simpsons"

# Download the page explicitly. The timeout stops the cell from hanging forever,
# and raise_for_status() surfaces HTTP errors here rather than as a confusing
# "no tables found" failure when an error page is parsed.
r = requests.get(html_url, timeout=30)
r.raise_for_status()

# Wrap the downloaded markup in StringIO: passing a raw HTML string directly to
# read_html is deprecated in recent pandas releases and emits a FutureWarning.
wiki_tables = pd.read_html(StringIO(r.text), header=0)
len(wiki_tables)

  wiki_tables = pd.read_html(r.text, header=0)


47

In [47]:
# Select the table at position 2 of the parsed list and preview it.
# NOTE(review): presumably the season-overview table; the position depends on the
# page layout at fetch time — confirm before relying on it.
simpsons = wiki_tables[2]
simpsons.head()

Unnamed: 0,Season,Season.1,No. of episodes,Originally aired,Originally aired.1,Originally aired.2,Viewership,Viewership.1,Viewership.2
0,Season,Season,No. of episodes,Season premiere,Season finale,Time slot (ET),Avg. viewers (in millions),Most watched episode,Most watched episode
1,Season,Season,No. of episodes,Season premiere,Season finale,Time slot (ET),Avg. viewers (in millions),Viewers (millions),Episode title
2,1,1989–90,13,"December 17, 1989","May 13, 1990",Sunday 8:30 pm,27.8,33.5[167],"""Life on the Fast Lane"""
3,2,1990–91,22,"October 11, 1990","July 11, 1991",Thursday 8:00 pm,24.4,33.6[168],"""Bart Gets an 'F'"""
4,3,1991–92,24,"September 19, 1991","August 27, 1992",Thursday 8:00 pm,21.8,25.5[169],"""Colonel Homer"""


In [48]:
# Rows 0 and 1 only repeat the header text, so drop them, then use 'Season' as the index.
# Building the result from the parsed table (instead of mutating in place) keeps
# wiki_tables[2] untouched and lets this cell be re-run without errors.
simpsons = wiki_tables[2].drop(index=[0, 1]).set_index('Season')

####  Which season has the lowest number of episodes?

In [49]:
# Distinct values in the episode-count column. They are strings, not numbers,
# and may carry footnote markers such as '[214]'.
simpsons['No. of episodes'].unique()

array(['13', '22', '24', '25', '23', '21', '20', '18[214]'], dtype=object)

Alternatively, we can look up the minimum value directly:

In [52]:
# The column holds strings (some with footnote markers like '18[214]'), so a plain
# .min() would compare lexicographically ('9' would sort after '13').
# Compare the leading integer of each value instead.
episodes = simpsons['No. of episodes']
episode_counts = pd.to_numeric(
    episodes.astype(str).str.extract(r'^(\d+)', expand=False)
)

# Return the original cell value (not the parsed number) so that equality
# filters against the column keep working.
min_season = episodes.iloc[episode_counts.argmin()]

min_season

'13'

In [53]:
# Boolean mask of the rows whose episode count equals the minimum found above,
# then select those rows.
is_shortest = simpsons['No. of episodes'].eq(min_season)
simpsons.loc[is_shortest]

Unnamed: 0_level_0,Season.1,No. of episodes,Originally aired,Originally aired.1,Originally aired.2,Viewership,Viewership.1,Viewership.2
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1989–90,13,"December 17, 1989","May 13, 1990",Sunday 8:30 pm,27.8,33.5[167],"""Life on the Fast Lane"""
