# Web Scraping NHL 1990s 

In [72]:
# Import packages to do web scraping
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [73]:
# Save url from web to scrap in a variable
url = 'https://www.scrapethissite.com/pages/forms/?per_page=100'

In [74]:
# Use of requests package to call the web page
response = requests.get(url)
response

<Response [200]>

In [75]:
# Use of BeautifulSoup package to retrieve raw HTML info 
parse_html = BeautifulSoup(response.text, 'html')
parse_html

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping</title>
<link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components." name="description"/>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
<link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
<meta content="noindex" name="robot

In [76]:
# Use .prettify() method to add hierarchy, which makes it easier to read
print(parse_html.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping
  </title>
  <link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components." name="description"/>
  <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
  <link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
  <meta con

In [77]:
parse_html.find('table')

<table class="table">
<tr>
<th>
                            Team Name
                        </th>
<th>
                            Year
                        </th>
<th>
                            Wins
                        </th>
<th>
                            Losses
                        </th>
<th>
                            OT Losses
                        </th>
<th>
                            Win %
                        </th>
<th>
                            Goals For (GF)
                        </th>
<th>
                            Goals Against (GA)
                        </th>
<th>
                            + / -
                        </th>
</tr>
<tr class="team">
<td class="name">
                            Boston Bruins
                        </td>
<td class="year">
                            1990
                        </td>
<td class="wins">
                            44
                        </td>
<td class="losses">
                            2

In [78]:
# Use of .find_all() method to pull specific info form the web (tag, attribute), will find all responses.
title = parse_html.find_all('h1')
description = parse_html.find_all('p', class_ = 'lead')
print(title)
print(description)

[<h1>
                            Hockey Teams: Forms, Searching and Pagination
                            <small>100 items</small>
</h1>]
[<p class="lead">
                            Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.
                            Take a look at how pagination and search elements change the URL as your browse. Build a web scraper that can conduct searches and paginate through the results.
                        </p>]


In [79]:
# Use of .find() method to extract text with .text, will find the first response.
description_txt = parse_html.find('p', class_ = 'lead').text
print(description_txt)
print(type(description_txt))


                            Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.
                            Take a look at how pagination and search elements change the URL as your browse. Build a web scraper that can conduct searches and paginate through the results.
                        
<class 'str'>


## NHL 1990s Project

In [80]:
# Pull table titles from HTML and clean it with a list comprehension.
table_titles_html = parse_html.find_all('th')
table_titles = [title.text.strip() for title in table_titles_html]
table_titles

['Team Name',
 'Year',
 'Wins',
 'Losses',
 'OT Losses',
 'Win %',
 'Goals For (GF)',
 'Goals Against (GA)',
 '+ / -']

In [81]:
# Create a DataFrame and add the headers.
df = pd.DataFrame(columns = table_titles)
df

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -


In [82]:
# Pull row data from HTML, clean data with a list comprehension and populate the df using .loc[] with loop indexes.
table = parse_html.find_all('tr')
for row in table[1:]:
    row_data = row.find_all('td')
    individual_data = [data.text.strip() for data in row_data]
    lenght = len(df)
    df.loc[lenght] = individual_data
df

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -
0,Boston Bruins,1990,44,24,,0.55,299,264,35
1,Buffalo Sabres,1990,31,30,,0.388,292,278,14
2,Calgary Flames,1990,46,26,,0.575,344,263,81
3,Chicago Blackhawks,1990,49,23,,0.613,284,211,73
4,Detroit Red Wings,1990,34,38,,0.425,273,298,-25
...,...,...,...,...,...,...,...,...,...
95,Buffalo Sabres,1994,22,19,,0.458,130,119,11
96,Calgary Flames,1994,24,17,,0.5,163,135,28
97,Chicago Blackhawks,1994,24,19,,0.5,156,115,41
98,Dallas Stars,1994,17,23,,0.354,136,135,1


In [83]:
# Extract only data from 1990
df_NHL_1990 = df[df['Year'] == '1990']
df_NHL_1990

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -
0,Boston Bruins,1990,44,24,,0.55,299,264,35
1,Buffalo Sabres,1990,31,30,,0.388,292,278,14
2,Calgary Flames,1990,46,26,,0.575,344,263,81
3,Chicago Blackhawks,1990,49,23,,0.613,284,211,73
4,Detroit Red Wings,1990,34,38,,0.425,273,298,-25
5,Edmonton Oilers,1990,37,37,,0.463,272,272,0
6,Hartford Whalers,1990,31,38,,0.388,238,276,-38
7,Los Angeles Kings,1990,46,24,,0.575,340,254,86
8,Minnesota North Stars,1990,27,39,,0.338,256,266,-10
9,Montreal Canadiens,1990,39,30,,0.487,273,249,24


In [93]:
# Create a Function to extract df from year sort by best team.
def nhl_1990_1993(year=1990):
    df_year = df[df['Year'] == year]
    df_year = df_year.set_index('Team Name').sort_values('Wins', ascending = False)
    return df_year


In [94]:
nhl_1990_1994('1993')


Unnamed: 0_level_0,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -
Team Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
New York Rangers,1993,52,24,,0.619,299,231,68
New Jersey Devils,1993,47,25,,0.56,306,220,86
Detroit Red Wings,1993,46,30,,0.548,356,275,81
Pittsburgh Penguins,1993,44,27,,0.524,299,285,14
Toronto Maple Leafs,1993,43,29,,0.512,280,243,37
Buffalo Sabres,1993,43,32,,0.512,282,218,64
Dallas Stars,1993,42,29,,0.5,286,265,21
Calgary Flames,1993,42,29,,0.5,302,256,46
Boston Bruins,1993,42,29,,0.5,289,252,37
Vancouver Canucks,1993,41,40,,0.488,279,276,3


In [95]:
#Export NHL 1993 df to CSV file
nhl_1990_1994('1993').to_csv(r'C:\Users\Gerardo\Downloads\NHL_1993_standings.csv')