<a href="https://colab.research.google.com/github/Justo-sys/About-Me--/blob/main/Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Scraping Project

This project demonstrates web scraping using Python to collect structured data from a live website using Requests, BeautifulSoup, and Pandas


### Importing libraries

In [35]:
#Importing libraries that I will need for webscraping
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Getting the webpage

In [36]:
#set URL of website in a variable
url = 'https://www.scrapethissite.com/pages/forms/'
page = requests.get(url)

# Check if request was successful
page.status_code

200

Parsing the HTML

In [37]:
#Use BeautifulSoup to extract the HTML content
soup = BeautifulSoup(page.text, 'html.parser')
print(soup)


<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping</title>
<link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components." name="description"/>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
<link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
<meta content="noindex" name="robo

### Finding the table

In [38]:
#Extract the table with the Hockey Scores
hockey_table = soup.find('table', class_='table')
print(hockey_table)


<table class="table">
<tr>
<th>
                            Team Name
                        </th>
<th>
                            Year
                        </th>
<th>
                            Wins
                        </th>
<th>
                            Losses
                        </th>
<th>
                            OT Losses
                        </th>
<th>
                            Win %
                        </th>
<th>
                            Goals For (GF)
                        </th>
<th>
                            Goals Against (GA)
                        </th>
<th>
                            + / -
                        </th>
</tr>
<tr class="team">
<td class="name">
                            Boston Bruins
                        </td>
<td class="year">
                            1990
                        </td>
<td class="wins">
                            44
                        </td>
<td class="losses">
                            2

### Extracting column names

In [39]:
#Extract the column headings
table_titles = hockey_table.find_all('th')
hockey_table_title = [title.text.strip() for title in table_titles]
print(hockey_table_title)

['Team Name', 'Year', 'Wins', 'Losses', 'OT Losses', 'Win %', 'Goals For (GF)', 'Goals Against (GA)', '+ / -']


### Creating an empty DataFrame

In [40]:
#Save the column headings onto a Pandas DataFrame
df = pd.DataFrame(columns=hockey_table_title)
df

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -


### Extracting row data and adding them to the DataFrame

In [45]:
#Extract the data row by row. First get all rows, then loop through each while stripping and saving data into the DataFrame
table_data = hockey_table.find_all('tr')
for row in table_data[1:]:
  raw_data = row.find_all('td')
  each_raw_data = [data.text.strip() for data in raw_data]
  print(each_raw_data)
  #saving each row data as it is generated into the pandas data frame
  length = len(df)
  df.loc[length] = each_raw_data

['Boston Bruins', '1990', '44', '24', '', '0.55', '299', '264', '35']
['Buffalo Sabres', '1990', '31', '30', '', '0.388', '292', '278', '14']
['Calgary Flames', '1990', '46', '26', '', '0.575', '344', '263', '81']
['Chicago Blackhawks', '1990', '49', '23', '', '0.613', '284', '211', '73']
['Detroit Red Wings', '1990', '34', '38', '', '0.425', '273', '298', '-25']
['Edmonton Oilers', '1990', '37', '37', '', '0.463', '272', '272', '0']
['Hartford Whalers', '1990', '31', '38', '', '0.388', '238', '276', '-38']
['Los Angeles Kings', '1990', '46', '24', '', '0.575', '340', '254', '86']
['Minnesota North Stars', '1990', '27', '39', '', '0.338', '256', '266', '-10']
['Montreal Canadiens', '1990', '39', '30', '', '0.487', '273', '249', '24']
['New Jersey Devils', '1990', '32', '33', '', '0.4', '272', '264', '8']
['New York Islanders', '1990', '25', '45', '', '0.312', '223', '290', '-67']
['New York Rangers', '1990', '36', '31', '', '0.45', '297', '265', '32']
['Philadelphia Flyers', '1990', '3

### Viewing the DataFrame

In [46]:
#Inspect the resulting DataFrame
df

Unnamed: 0,Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -
0,Chicago Blackhawks,1991,36,29,,0.45,257,236,21
1,Boston Bruins,1990,44,24,,0.55,299,264,35
2,Buffalo Sabres,1990,31,30,,0.388,292,278,14
3,Calgary Flames,1990,46,26,,0.575,344,263,81
4,Chicago Blackhawks,1990,49,23,,0.613,284,211,73
5,Detroit Red Wings,1990,34,38,,0.425,273,298,-25
6,Edmonton Oilers,1990,37,37,,0.463,272,272,0
7,Hartford Whalers,1990,31,38,,0.388,238,276,-38
8,Los Angeles Kings,1990,46,24,,0.575,340,254,86
9,Minnesota North Stars,1990,27,39,,0.338,256,266,-10


### Saving to CSV


In [47]:
#Save to a .csv file in the current folder
df.to_csv(r'./Hockey.csv')