# Software Coaching for Python
# Week 2: Advanced Web Scraping

Instructor: Kang-Pyo Lee 

In [1]:
# Mount Google Drive at /content/gdrive.
from google.colab import drive

drive.mount('/content/gdrive')

# The outcome folder is <path>/<my_folder>/outcome.
path = "/content/gdrive/Shareddrives/Software_Coaching_Fall_2021"
my_folder = "KHU"     # *** REPLACE WITH YOUR FOLDER NAME ***
outcome_folder = "/".join([path, my_folder, "outcome"])

Mounted at /content/gdrive


In [2]:
import requests
from bs4 import BeautifulSoup

## Handle pagination

In [None]:
# Collect the URLs of the Features list pages in the variable `urls`.
# The first page lives at the bare /features/ address.
urls = ["https://fivethirtyeight.com/features/"]

# range(2, 101) yields the integers from 2 to 100. The leading f marks a
# formatted string literal: {page_number} is replaced by the current value.
urls.extend(
    f"https://fivethirtyeight.com/features/page/{page_number}/"
    for page_number in range(2, 101)
)

urls

Collect the URLs of all target web pages before trying to fetch the contents of those pages. The key at this stage is to find the rule by which the URLs are constructed.

In [None]:
# Walk through the collected URLs one by one; printing stands in for
# whatever you want to do with each web page.
for page_url in urls:
    print(page_url)

## Write & read an HTML file

In [5]:
# Download a single article page; `r` holds the HTTP response for `url`.
url = "https://fivethirtyeight.com/features/why-was-the-national-polling-environment-so-off-in-2020/"
# Without a timeout, requests keeps waiting indefinitely on an unresponsive server.
r = requests.get(url, timeout=30)
# Stop on a 4xx/5xx answer so that an error page is never treated as the article.
r.raise_for_status()

In [6]:
# Start right after the prefix shared by every article URL and stop before
# the last character (-1), which drops the trailing "/".
common_prefix = "https://fivethirtyeight.com/features/"
url[len(common_prefix):-1]

'why-was-the-national-polling-environment-so-off-in-2020'

In [7]:
# Name the local copy after the article slug, with an ".html" extension.
slug = url[len("https://fivethirtyeight.com/features/"):-1]
file_name = f"{slug}.html"
file_name

'why-was-the-national-polling-environment-so-off-in-2020.html'

In [8]:
# Save the response body to the outcome folder. r.content is written as-is,
# so the file is opened in binary mode (as is needed for images or HTML
# pages). Write-only "wb" suffices: nothing is read back through this handle.
with open(f"{outcome_folder}/{file_name}", "wb") as fw:
    fw.write(r.content)

In [9]:
# Parse the saved copy of the page.
# Read-only "rb" is sufficient and also works when the file is not writable.
with open(f"{outcome_folder}/{file_name}", "rb") as fr:
    soup = BeautifulSoup(fr.read(), "html.parser")  # read() returns the whole file content at once

    print(soup.title.text)          # Do whatever you want with the saved web page.

Why Was The National Polling Environment So Off In 2020? | FiveThirtyEight


## Automate the process of saving all articles on the Features list

In [None]:
# Features list pages to crawl: the front page plus pages 2 through 9
# (range(2, 10) stops before 10).
first_page = "https://fivethirtyeight.com/features/"
urls = [first_page] + [
    f"https://fivethirtyeight.com/features/page/{page_number}/"
    for page_number in range(2, 10)
]

urls

In [11]:
import os

# Create the HTMLs folder inside the outcome folder.
# makedirs(..., exist_ok=True) does nothing when the folder is already there
# and also creates the outcome folder itself if it is still missing.
os.makedirs(f"{outcome_folder}/HTMLs", exist_ok=True)

In [12]:
import time     # Necessary for the sleep function.

In [None]:
# Save a local copy of every article linked from the list pages. Working from
# saved copies means the information can still be extracted from the pages as
# they were at download time, even if the site changes afterwards.

# Not every article URL lives under /features/, so each known section of the
# site is listed explicitly and checked in this order.
ARTICLE_SECTIONS = ("/features/", "/videos/", "/methodology/")


def _article_file_name(article_url):
    """Return the "<slug>.html" file name for an article URL.

    The slug is whatever follows the first known section marker in the URL,
    without a trailing "/".

    Raises:
        ValueError: If the URL contains none of the ARTICLE_SECTIONS markers.
    """
    for section in ARTICLE_SECTIONS:
        _, marker, slug = article_url.partition(section)
        if marker:
            # rstrip removes the trailing "/" only when it is present, so a
            # URL without one keeps the last character of its slug.
            return slug.rstrip("/") + ".html"

    # Raised explicitly instead of asserted: assert statements are skipped
    # when Python runs with -O, which would let a stale file name be reused.
    raise ValueError("Unknown article url pattern! " + article_url)


for url in urls:
    print(url)  # Show which list page is being processed.

    ####################################################
    # Get the content of a page
    ####################################################
    # The timeout keeps an unresponsive server from blocking the loop forever;
    # raise_for_status stops on a 4xx/5xx answer instead of parsing an error page.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")

    ####################################################
    # Get the list of articles
    ####################################################
    # Every <h2> element whose class is "article-title entry-title".
    h2_list = soup.find_all("h2", {"class": "article-title entry-title"})

    for h2 in h2_list:
        ####################################################
        # Find the anchor tag
        ####################################################
        a = h2.find("a")

        ####################################################
        # Extract the URL of an article
        ####################################################
        article_url = a["href"]

        ####################################################
        # Fetch the content and save it as an HTML file
        ####################################################
        print("- " + article_url + ": processing...")

        # Work out the file name first so that an unknown URL pattern is
        # reported before any request is spent on it.
        file_name = _article_file_name(article_url)

        r2 = requests.get(article_url, timeout=30)
        r2.raise_for_status()  # Never save an error page as if it were the article.

        with open(f"{outcome_folder}/HTMLs/{file_name}", "wb") as fw:
            fw.write(r2.content)

        print("- " + file_name + ": saved.")

        ####################################################
        # Sleep for a second to not overload the web site
        ####################################################
        time.sleep(1)  # Pause for one second before fetching the next article.

    print()

## Extract information from all HTML files & save it in a CSV file

In [14]:
# Show every entry currently stored in the HTMLs folder.
html_folder = f"{outcome_folder}/HTMLs"
os.listdir(html_folder)

['the-second-amendment-didnt-protect-your-right-to-own-a-gun-until-2008.html',
 'opposing-defenses-show-us-just-how-scary-derrick-henry-is.html',
 'college-football-fans-cant-stop-rushing-the-field-but-which-games-are-worth-it.html',
 'kyle-lowry-made-his-toronto-teammates-better-can-he-do-the-same-in-miami.html',
 'ohio-state-turned-its-defense-around-and-looks-like-a-contender-again.html',
 'democrats-worry-a-lot-about-policies-that-win-elections-thats-short-sighted.html',
 'what-do-you-think-abortion-access-should-look-like-in-america-we-want-to-hear-from-you.html',
 'kyler-murray-is-a-human-highlight-reel.html',
 'americans-dont-trust-their-congressional-maps-to-be-drawn-fairly-can-anything-change-that.html',
 'astros-or-braves-flip-a-coin-for-this-world-series.html',
 'can-you-guess-what-americans-think-about-the-democrats-spending-bill.html',
 'at-least-the-lions-tried.html',
 'do-you-buy-that-donald-trump-will-be-the-republican-nominee-in-2024.html',
 'could-manchin-actually-lea

In [15]:
# Keep only the entries of the HTMLs folder whose names end with ".html".
html_files = list(
    filter(
        lambda entry: entry.endswith(".html"),
        os.listdir(f"{outcome_folder}/HTMLs"),
    )
)
html_files

['the-second-amendment-didnt-protect-your-right-to-own-a-gun-until-2008.html',
 'opposing-defenses-show-us-just-how-scary-derrick-henry-is.html',
 'college-football-fans-cant-stop-rushing-the-field-but-which-games-are-worth-it.html',
 'kyle-lowry-made-his-toronto-teammates-better-can-he-do-the-same-in-miami.html',
 'ohio-state-turned-its-defense-around-and-looks-like-a-contender-again.html',
 'democrats-worry-a-lot-about-policies-that-win-elections-thats-short-sighted.html',
 'what-do-you-think-abortion-access-should-look-like-in-america-we-want-to-hear-from-you.html',
 'kyler-murray-is-a-human-highlight-reel.html',
 'americans-dont-trust-their-congressional-maps-to-be-drawn-fairly-can-anything-change-that.html',
 'astros-or-braves-flip-a-coin-for-this-world-series.html',
 'can-you-guess-what-americans-think-about-the-democrats-spending-bill.html',
 'at-least-the-lions-tried.html',
 'do-you-buy-that-donald-trump-will-be-the-republican-nominee-in-2024.html',
 'could-manchin-actually-lea

In [None]:
# Extract the title and author of every saved article and store them in a
# tab-separated file, one row per HTML file.
# The encoding is given explicitly so the output does not depend on the
# platform's default text encoding.
with open(f"{outcome_folder}/html_metadata.csv", "w", encoding="utf-8") as fw:
    ####################################################
    # Column names on the first row
    ####################################################
    fw.write("file_name\tarticle_title\tarticle_author\n")

    for file_name in os.listdir(f"{outcome_folder}/HTMLs"):
        if not file_name.endswith(".html"):
            continue

        ####################################################
        # Column values starting from the second row
        ####################################################
        # Read-only "rb" is enough here; the file is only parsed.
        with open(f"{outcome_folder}/HTMLs/{file_name}", "rb") as fr:
            print(file_name)
            soup = BeautifulSoup(fr.read(), "html.parser")

        ####################################################
        # No title / no author exception handling
        ####################################################
        # find() returns None when the element is absent; fall back to an
        # empty value instead of failing on `.text`.
        title_tag = soup.find("h1", {"class": "article-title article-title-single entry-title"})
        article_title = "" if title_tag is None else title_tag.text.strip()

        author_tag = soup.find("a", {"class": "author url fn"})
        article_author = "" if author_tag is None else author_tag.text

        ####################################################
        # Remove all possible tabs
        ####################################################
        # Tabs are the column separator, so they must not appear inside a value.
        article_title = article_title.replace("\t", "")
        article_author = article_author.replace("\t", "")

        fw.write(f"{file_name}\t{article_title}\t{article_author}\n")

the-second-amendment-didnt-protect-your-right-to-own-a-gun-until-2008.html
opposing-defenses-show-us-just-how-scary-derrick-henry-is.html
college-football-fans-cant-stop-rushing-the-field-but-which-games-are-worth-it.html
kyle-lowry-made-his-toronto-teammates-better-can-he-do-the-same-in-miami.html
ohio-state-turned-its-defense-around-and-looks-like-a-contender-again.html
democrats-worry-a-lot-about-policies-that-win-elections-thats-short-sighted.html
what-do-you-think-abortion-access-should-look-like-in-america-we-want-to-hear-from-you.html
kyler-murray-is-a-human-highlight-reel.html
americans-dont-trust-their-congressional-maps-to-be-drawn-fairly-can-anything-change-that.html
astros-or-braves-flip-a-coin-for-this-world-series.html
can-you-guess-what-americans-think-about-the-democrats-spending-bill.html
at-least-the-lions-tried.html
do-you-buy-that-donald-trump-will-be-the-republican-nominee-in-2024.html
could-manchin-actually-leave-the-democratic-party.html
the-red-sox-seemed-unstop