# Connect

In [27]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [60]:
def create_soup(url, timeout=10):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without it a stalled server would block the
            notebook indefinitely.

    Returns:
        The ``BeautifulSoup`` object built from the response text.

    Raises:
        requests.HTTPError: If the server answers with a 4XX/5XX status.
        requests.RequestException: On connection problems or a timeout
            (raised by ``requests.get`` itself).
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of silently parsing an error page.
    page.raise_for_status()
    # NOTE(review): 'html' leaves the choice of HTML parser to BeautifulSoup
    # (kept as in the original); pin e.g. 'html.parser' if results must be
    # identical across environments.
    soup = BeautifulSoup(page.text, 'html')
    return soup

The first thing we can do is check the status code of the response returned by requests.get(url). HTTP status codes range from 1XX to 5XX. Common status codes that you have probably seen are 200, 404, and 500.

Here’s a quick overview of what each status code means:

- 1XX - Informational
- 2XX - Success
- 3XX - Redirect
- 4XX - Client Error (you made an error)
- 5XX - Server Error (they made an error)

Generally, what you’re looking for when you perform your own requests are status codes in the 200s.

In [28]:
url = 'https://www.scrapethissite.com/pages/forms/'

# A Response object is truthy exactly when `response.ok` is True (status code
# below 400); checking `.ok` spells that out. The timeout stops the cell from
# hanging forever if the server never answers.
response = requests.get(url, timeout=10)
if response.ok:
    print('Success')
else:
    print('Error')

Success


In [61]:
# Fetch the page at `url` and parse it with the create_soup helper.
soup = create_soup(url)

# Table scraping

In [62]:
# find a table (the first <table> element in the parsed page)
table = soup.find('table')
# soup.find returns None when nothing matches; stop here with a clear message
# instead of failing later with an AttributeError on table.find_all.
if table is None:
    raise ValueError('No <table> element found on the page')
print(table)

<table class="table">
<tr>
<th>
                            Team Name
                        </th>
<th>
                            Year
                        </th>
<th>
                            Wins
                        </th>
<th>
                            Losses
                        </th>
<th>
                            OT Losses
                        </th>
<th>
                            Win %
                        </th>
<th>
                            Goals For (GF)
                        </th>
<th>
                            Goals Against (GA)
                        </th>
<th>
                            + / -
                        </th>
</tr>
<tr class="team">
<td class="name">
                            Boston Bruins
                        </td>
<td class="year">
                            1990
                        </td>
<td class="wins">
                            44
                        </td>
<td class="losses">
                            2

In [43]:
# find headers: collect every <th> element inside the table
table_columns = table.find_all('th')
# a bare last expression makes the notebook display the result
table_columns

[<th>
                             Team Name
                         </th>,
 <th>
                             Year
                         </th>,
 <th>
                             Wins
                         </th>,
 <th>
                             Losses
                         </th>,
 <th>
                             OT Losses
                         </th>,
 <th>
                             Win %
                         </th>,
 <th>
                             Goals For (GF)
                         </th>,
 <th>
                             Goals Against (GA)
                         </th>,
 <th>
                             + / -
                         </th>]

In [46]:
# list comprehension: take the text of each header cell and clean it with .strip()
table_columns_list = [header.text.strip() for header in table_columns]
print(table_columns_list)

['Team Name', 'Year', 'Wins', 'Losses', 'OT Losses', 'Win %', 'Goals For (GF)', 'Goals Against (GA)', '+ / -']


In [47]:
# pandas dataframe: start with an empty frame that has one column per scraped header
df = pd.DataFrame(columns = table_columns_list)
df

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -


In [51]:
# find rows: despite its name, column_data holds the table's <tr> elements
# (the header row first, followed by the team rows)
column_data = table.find_all('tr')
print(column_data)

[<tr>
<th>
                            Team Name
                        </th>
<th>
                            Year
                        </th>
<th>
                            Wins
                        </th>
<th>
                            Losses
                        </th>
<th>
                            OT Losses
                        </th>
<th>
                            Win %
                        </th>
<th>
                            Goals For (GF)
                        </th>
<th>
                            Goals Against (GA)
                        </th>
<th>
                            + / -
                        </th>
</tr>, <tr class="team">
<td class="name">
                            Boston Bruins
                        </td>
<td class="year">
                            1990
                        </td>
<td class="wins">
                            44
                        </td>
<td class="losses">
                            24
                  

In [55]:
# Collect every row first and build the DataFrame in a single step. Appending
# with df.loc[len(df)] inside a loop grows the frame one row at a time (slow)
# and, if the cell is run twice, adds every row a second time.
rows = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in column_data[1:]  # column_data[0] is the header row (<th> cells)
]
# Reuse the labels of the existing frame so the scraped headers are kept;
# the cell values remain strings, exactly as scraped.
df = pd.DataFrame(rows, columns=df.columns)

In [59]:
# Every scraped value is still a string at this point, so describe() reports
# count / unique / top / freq rather than numeric statistics.
df.describe()

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -
count,25,25,25,25,25.0,25.0,25,25,25
unique,21,2,18,17,1.0,18.0,23,23,23
top,Boston Bruins,1990,31,37,,0.388,273,264,0
freq,2,21,4,4,25.0,4.0,2,2,2
