# Web Scraping NHL 1990s 

In [1]:
# Import packages to do web scraping
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Save url from web to scrap in a variable
url = 'https://www.scrapethissite.com/pages/forms/?per_page=100'

In [3]:
# Use of requests package to call the web page
response = requests.get(url)
response

<Response [200]>

In [4]:
%%capture
# Use of BeautifulSoup package to retrieve raw HTML info 
parse_html = BeautifulSoup(response.text, 'html')
parse_html

In [5]:
%%capture
# Use .prettify() method to add hierarchy, which makes it easier to read
print(parse_html.prettify())

In [6]:
parse_html.find('table')

<table class="table">
<tr>
<th>
                            Team Name
                        </th>
<th>
                            Year
                        </th>
<th>
                            Wins
                        </th>
<th>
                            Losses
                        </th>
<th>
                            OT Losses
                        </th>
<th>
                            Win %
                        </th>
<th>
                            Goals For (GF)
                        </th>
<th>
                            Goals Against (GA)
                        </th>
<th>
                            + / -
                        </th>
</tr>
<tr class="team">
<td class="name">
                            Boston Bruins
                        </td>
<td class="year">
                            1990
                        </td>
<td class="wins">
                            44
                        </td>
<td class="losses">
                            2

In [7]:
# Use of .find_all() method to pull specific info form the web (tag, attribute), will find all responses.
title = parse_html.find_all('h1')
description = parse_html.find_all('p', class_ = 'lead')
print(title)
print(description)

[<h1>
                            Hockey Teams: Forms, Searching and Pagination
                            <small>100 items</small>
</h1>]
[<p class="lead">
                            Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.
                            Take a look at how pagination and search elements change the URL as your browse. Build a web scraper that can conduct searches and paginate through the results.
                        </p>]


In [8]:
# Use of .find() method to extract text with .text, will find the first response.
description_txt = parse_html.find('p', class_ = 'lead').text
print(description_txt)
print(type(description_txt))


                            Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.
                            Take a look at how pagination and search elements change the URL as your browse. Build a web scraper that can conduct searches and paginate through the results.
                        
<class 'str'>


## NHL 1990s Project

In [9]:
# Pull table titles from HTML and clean it with a list comprehension.
table_titles_html = parse_html.find_all('th')
table_titles = [title.text.strip() for title in table_titles_html]
table_titles

['Team Name',
 'Year',
 'Wins',
 'Losses',
 'OT Losses',
 'Win %',
 'Goals For (GF)',
 'Goals Against (GA)',
 '+ / -']

In [10]:
# Create a DataFrame and add the headers.
df = pd.DataFrame(columns = table_titles)
df

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -


In [11]:
# Pull row data from HTML, clean data with a list comprehension and populate the df using .loc[] with loop indexes.
table = parse_html.find_all('tr')
for row in table[1:]:
    row_data = row.find_all('td')
    individual_data = [data.text.strip() for data in row_data]
    lenght = len(df)
    df.loc[lenght] = individual_data
df

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -
0,Boston Bruins,1990,44,24,,0.55,299,264,35
1,Buffalo Sabres,1990,31,30,,0.388,292,278,14
2,Calgary Flames,1990,46,26,,0.575,344,263,81
3,Chicago Blackhawks,1990,49,23,,0.613,284,211,73
4,Detroit Red Wings,1990,34,38,,0.425,273,298,-25
...,...,...,...,...,...,...,...,...,...
95,Buffalo Sabres,1994,22,19,,0.458,130,119,11
96,Calgary Flames,1994,24,17,,0.5,163,135,28
97,Chicago Blackhawks,1994,24,19,,0.5,156,115,41
98,Dallas Stars,1994,17,23,,0.354,136,135,1


In [12]:
# Extract only data from 1990
df_NHL_1990 = df[df['Year'] == '1990']
df_NHL_1990

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -
0,Boston Bruins,1990,44,24,,0.55,299,264,35
1,Buffalo Sabres,1990,31,30,,0.388,292,278,14
2,Calgary Flames,1990,46,26,,0.575,344,263,81
3,Chicago Blackhawks,1990,49,23,,0.613,284,211,73
4,Detroit Red Wings,1990,34,38,,0.425,273,298,-25
5,Edmonton Oilers,1990,37,37,,0.463,272,272,0
6,Hartford Whalers,1990,31,38,,0.388,238,276,-38
7,Los Angeles Kings,1990,46,24,,0.575,340,254,86
8,Minnesota North Stars,1990,27,39,,0.338,256,266,-10
9,Montreal Canadiens,1990,39,30,,0.487,273,249,24


In [13]:
# Create a Function to extract df from year sort by best team.
def nhl_1990_1993(year=1990):
    df_year = df[df['Year'] == year]
    df_year = df_year.set_index('Team Name').sort_values('Wins', ascending = False)
    return df_year


In [14]:
nhl_1990_1994('1993')


NameError: name 'nhl_1990_1994' is not defined

In [None]:
#Export NHL 1993 df to CSV file
nhl_1990_1994('1993').to_csv(r'C:\Users\Gerardo\Downloads\NHL_1993_standings.csv')