# Imports 

In [2]:
import pandas as pd
from numpy import random

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import urllib

import os
import re
import time

from typing import List, Tuple, Union, Optional

# Settings and Configuration

In [2]:
# Data Directory Paths
SEARCH_DATA_DIR = '../../data/webpage_data/search_res/'
ART_OUTPUT_DIR = '../../data/webpage_data/articles/'
ART_INFO_OUTPUT = '../../data/webpage_data/article_info.csv' # Article Info from Initial Search Scraping
ART_INFO_OUTPUT2 = '../../data/webpage_data/article_info2.csv' # Article Info from secondary pass for each day

# URL Paths
BASE = 'https://theflipside.io/'
SEARCH_URL = BASE + 'search?query=%s'
SEARCH_POST_URL = r'&USOURCE=&UMEDIUM=&UCAMPAIGN=&UCONTENT=&UTERM=&IREFERRER=https%253A%2F%2Fwww.allsides.com%2F&LREFERRER=direct&ILANDPAGE=https%253A%2F%2Fwww.theflipside.io%2Farchives%2Fabortion-bill&VISITS=8'

In [51]:
# Chrome Driver for Selenium Scraping
driver = webdriver.Chrome('../../drivers/chromedriver_win32_106/chromedriver.exe')

# Archive Page List 

Collect all links from the archive page (Only includes a few months)

In [3]:
with open('./webpage_data/archive_page.html', 'r') as w:
    page = w.read()

In [70]:
bs = BeautifulSoup(page, 'html.parser')

In [32]:
links = bs.find_all('a', {'class': 'link-block-3 w-inline-block'})

In [33]:
links = [(BASE + link.attrs['href']).replace('//', '/') for link in links]

# Scraping Search Links

Scrape articles using the month and year as the search query

In [99]:
# Randomize sleep times for scraping
MAX_SLEEP = 8
MIN_SLEEP = 3

# Creating Search Strings
MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
FIRST_Y_MONS = MONTHS[-4:]
YEARS  = ['2018', '2019', '2020', '2021']
SEARCHES = [f'{month} {year}' for month in MONTHS for year in YEARS]

In [100]:
# Create list of month+year searches based on earliest article
SEARCHES = [search for search in SEARCHES if ((search.split(' ')[0] in FIRST_Y_MONS and search.split(' ')[1] == YEARS[0]) or (search.split(' ')[1] != '2018')) ]

In [3]:
def search_page(search_query: str, save_path: str) -> List[str]:
    '''
        Query theflipside with a given search query, fetch all links, and store the html source
        
        Parameters:
            -search_query
                Query string to search on theflipside
            -save_path
                Directory to store html sources in
                
        Return:
            List of article links found in query results
    '''
    
    url_query = urllib.parse.quote_plus(search_query)
    search_url = (SEARCH_URL % url_query) + SEARCH_POST_URL
    
    driver.get(search_url)
    
    src = driver.page_source.encode('utf8')
    
    save_name = save_path + f'/{search_query}.html'
    
    with open(save_name, 'wb') as sp:
        sp.write(src)
        
    links = get_links(src)
    return links

def get_links(page_src: str) -> List[links]:
    '''
        Collect links from a given search page html
        
        Parameters:
            -page_src
                Source html string for the query results page
                
        Return:
            List of links found on the query results page
    '''
    
    bs = BeautifulSoup(page_src, 'html.parser')
    links = bs.find_all('div', {'class': 'search-result-item'})
    
    links = [s.a.attrs for s in links]
    links = [link['href'] for link in links]
    links = [(BASE + link).replace('//', '/') for link in links]
    return links

In [109]:
# Index to start from in case of failure in loop
start_idx = 21

In [110]:
links = {'query': [], 'link': []}
for search_date in SEARCHES[start_idx:]:
    new_links = search_page(search_date, SEARCH_DATA_DIR)
    links['query'] += [search_date for _ in new_links]
    links['link'] += new_links
    
    print(f'Finished Scraping {search_date} and got {len(new_links)} links')
    
    time.sleep(random.uniform(MIN_SLEEP, MAX_SLEEP))

Finished Scraping Aug 2019 and got 60 links
Finished Scraping Aug 2020 and got 60 links
Finished Scraping Aug 2021 and got 60 links
Finished Scraping Sep 2018 and got 60 links
Finished Scraping Sep 2019 and got 60 links
Finished Scraping Sep 2020 and got 60 links
Finished Scraping Sep 2021 and got 60 links
Finished Scraping Oct 2018 and got 60 links
Finished Scraping Oct 2019 and got 60 links
Finished Scraping Oct 2020 and got 60 links
Finished Scraping Oct 2021 and got 60 links
Finished Scraping Nov 2018 and got 60 links
Finished Scraping Nov 2019 and got 60 links
Finished Scraping Nov 2020 and got 60 links
Finished Scraping Nov 2021 and got 60 links
Finished Scraping Dec 2018 and got 60 links
Finished Scraping Dec 2019 and got 60 links
Finished Scraping Dec 2020 and got 60 links
Finished Scraping Dec 2021 and got 60 links


In [113]:
# Fetch all article source files scraped
page_results = os.listdir(SEARCH_DATA_DIR)

In [115]:
# Refetch list of queries and corresponding links (failure during loop)
links = {'query': [], 'link': []}
for page in page_results:
    with open(SEARCH_DATA_DIR + page, 'r') as w:
        src = w.read()
        
    new_links = get_links(src)
    links['query'] += [search_date for _ in new_links]
    links['link'] += new_links

In [119]:
# Save all search queries and resulting links
search_links = pd.DataFrame(links)
search_links.to_csv('./webpage_data/search_links.csv', index = None)

# Scraping Articles 

Collect necessary information (summaries, news links, etc) from scraped articles 

In [173]:
def get_art_info(art_link: str, article_source: str) -> dict:
    '''
        Extract information from stored article file using Beautiful Soup
        
        Parameters:
            -art_link: str
                Link to the original article
            -article_source: str
                String storing html source of the article
        
        Return:
            Dictionary of extracted information
    '''
    
    bs = BeautifulSoup(article_source, 'html.parser')
    
    section = bs.find('div', {'class': 'section-3 wf-section'})
    date = section.find('h1', {'class': 'heading-16'}).text
    title = section.find('h1', {'class': 'heading-17'}).text
    
    left_sum = section.find('div', {'class': lambda e: ('paragraph-6' in e and 'left' in e) if e else False})
    right_sum = section.find('div', {'class': lambda e: ('paragraph-6' in e and 'right' in e) if e else False})
    
    left_sum = left_sum.p.text if left_sum.p else ''
    right_sum = right_sum.p.text if right_sum.p else ''
    
    news_links = [link.attrs['href'] for link in section.find_all('a') if link.attrs['href'][:8] == 'https://']
    
    return {
        'article_url': art_link,
        'date': date,
        'title': title,
        'left_sum': left_sum,
        'right_sum': right_sum,
        'news_links': str(news_links)
    }

In [175]:
# Iterate over links, extract article information, and store the source data/article information
for i, link in enumerate(search_links['link'].values):
    try:
        driver.get(link)

        src = driver.page_source

        article_info = get_art_info(link, src)

        # Save HTML
        with open(ART_OUTPUT_DIR + article_info['title'] + '.html', 'wb') as w:
            w.write(src.encode('utf8'))

        data = pd.DataFrame(article_info)
        data.to_csv(ART_INFO_OUTPUT, index = None, mode = 'a', header = None if i == 0 else True)
        
        print(f'Finished with article {article_info["title"]}, {i + 1} of {len(search_links)}')
        
    except Exception as e:
        print(f'Error at index {i}:', e)
        continue

Finished with article Taxes, 1 of 2460
Finished with article Mueller Time, 2 of 2460
Finished with article Free College, 3 of 2460
Finished with article Brexit Countdown, 4 of 2460
Finished with article Border Crisis, 5 of 2460
Finished with article Inmate Voting, 6 of 2460
Finished with article Ilhan Omar’s Comments, 7 of 2460
Finished with article NATO Turns 70, 8 of 2460
Finished with article Julian Assange Arrested, 9 of 2460
Finished with article Kirstjen Nielsen Resigns, 10 of 2460
Finished with article Joe Biden 2020, 11 of 2460
Finished with article Buttigieg Criticizes Pence, 12 of 2460
Finished with article Wisconsin and Chicago Elections, 13 of 2460
Finished with article Calls to Impeach Trump, 14 of 2460
Finished with article Bibi Wins Re-election, 15 of 2460
Finished with article Biden's Behavior Under Scrutiny, 16 of 2460
Finished with article Trump Vetoes Yemen Resolution, 17 of 2460
Finished with article Synagogue Shooting, and NYT Cartoon, 18 of 2460
Finished with arti

Finished with article Joe Biden, 151 of 2460
Finished with article Election Update, 152 of 2460
Finished with article Elon Musk, 153 of 2460
Error at index 153: 'NoneType' object has no attribute 'text'
Finished with article Adjourning Congress, 155 of 2460
Finished with article Earth Day, 156 of 2460
Finished with article French Election, 157 of 2460
Finished with article States Reopening, 158 of 2460
Finished with article Biden’s First 100 Days, 159 of 2460
Error at index 159: 'NoneType' object has no attribute 'text'
Finished with article Reopening the Economy, 161 of 2460
Finished with article Amazon Workers Unionize, 162 of 2460
Finished with article Florida and Disney, 163 of 2460
Finished with article China and Coronavirus, 164 of 2460
Error at index 164: 'NoneType' object has no attribute 'text'
Finished with article DeSantis vs. Disney, 166 of 2460
Finished with article Defense Production Act, 167 of 2460
Error at index 167: 'NoneType' object has no attribute 'text'
Finished w

Error at index 296: 'NoneType' object has no attribute 'text'
Finished with article Afghanistan and Biden, 298 of 2460
Finished with article  Vaccine Booster Shots, 299 of 2460
Finished with article Remain in Mexico Policy, 300 of 2460
Finished with article Hungary, 301 of 2460
Error at index 301: 'NoneType' object has no attribute 'text'
Error at index 302: 'NoneType' object has no attribute 'text'
Finished with article IPCC Report, 304 of 2460
Finished with article Taliban Advances, 305 of 2460
Error at index 305: 'NoneType' object has no attribute 'text'
Finished with article Eviction Moratorium, 307 of 2460
Finished with article Vaccine Mandates, 308 of 2460
Error at index 308: 'NoneType' object has no attribute 'text'
Error at index 309: 'NoneType' object has no attribute 'text'
Error at index 310: 'NoneType' object has no attribute 'text'
Finished with article USPS, 312 of 2460
Finished with article Kenosha, 313 of 2460
Finished with article Inflation, 314 of 2460
Error at index 

Finished with article France in Turmoil, 443 of 2460
Finished with article Comey and Cohen, 444 of 2460
Finished with article Looking Ahead to 2020, 445 of 2460
Error at index 445: 'NoneType' object has no attribute 'text'
Finished with article Oval Office Meeting and Looming Shutdown, 447 of 2460
Finished with article PM Theresa May Survives Confidence Vote, 448 of 2460
Finished with article Ukraine, 449 of 2460
Error at index 449: 'NoneType' object has no attribute 'text'
Finished with article Cohen Sentenced to Three Years in Prison, 451 of 2460
Finished with article Barr for AG and Nauert for UN Ambassador, 452 of 2460
Error at index 452: 'NoneType' object has no attribute 'text'
Error at index 453: 'NoneType' object has no attribute 'text'
Finished with article William Barr, 455 of 2460
Error at index 455: 'NoneType' object has no attribute 'text'
Finished with article Hunter Biden, 457 of 2460
Error at index 457: 'NoneType' object has no attribute 'text'
Finished with article Pet

Finished with article  Meng Wanzhou Arrested, 588 of 2460
Finished with article Anti-Semitism EO, 589 of 2460
Finished with article Democratic Primary Polls, 590 of 2460
Finished with article Border Wall Update, 591 of 2460
Finished with article Comey and Cohen, 592 of 2460
Finished with article Biden Fights Back, 593 of 2460
Finished with article France in Turmoil, 594 of 2460
Error at index 594: 'NoneType' object has no attribute 'text'
Error at index 595: 'NoneType' object has no attribute 'text'
Finished with article Kamala Harris Drops Out, 597 of 2460
Finished with article House Votes To Impeach, 598 of 2460
Error at index 598: 'NoneType' object has no attribute 'text'
Finished with article Oval Office Meeting and Looming Shutdown, 600 of 2460
Finished with article Bullet Train, 601 of 2460
Finished with article Andrew McCabe, 602 of 2460
Finished with article Bernie 2020, 603 of 2460
Finished with article Jussie Smollett, 604 of 2460
Finished with article INF Treaty, 605 of 2460

Finished with article Clemency, 736 of 2460
Error at index 736: 'NoneType' object has no attribute 'text'
Error at index 737: 'NoneType' object has no attribute 'text'
Error at index 738: 'NoneType' object has no attribute 'text'
Finished with article Facebook in Australia, 740 of 2460
Finished with article Marjorie Taylor Greene, 741 of 2460
Finished with article Impeachment Trial Begins, 742 of 2460
Error at index 742: 'NoneType' object has no attribute 'text'
Finished with article Freedom Convoy, 744 of 2460
Finished with article Durham Investigation, 745 of 2460
Finished with article Bernie 2020, 746 of 2460
Finished with article Rising Crime, 747 of 2460
Error at index 747: 'NoneType' object has no attribute 'text'
Error at index 748: 'NoneType' object has no attribute 'text'
Finished with article RNC Censure, 750 of 2460
Finished with article Bloomberg’s Candidacy, 751 of 2460
Finished with article Democratic Debate, 752 of 2460
Error at index 752: 'NoneType' object has no attrib

Finished with article Glenn Youngkin, 882 of 2460
Finished with article Bolton’s Book, 883 of 2460
Finished with article Affirmative Action, 884 of 2460
Finished with article Democratic Debate, 885 of 2460
Error at index 885: 'NoneType' object has no attribute 'text'
Finished with article Harry Reid, 887 of 2460
Finished with article Biden’s Speech, 888 of 2460
Finished with article Lev Parnas, 889 of 2460
Finished with article Mideast Peace Plan, 890 of 2460
Error at index 890: 'NoneType' object has no attribute 'text'
Finished with article Jan. 6 Hearings, 892 of 2460
Finished with article Texas Synagogue Attack, 893 of 2460
Finished with article Jan. 6 Commission, 894 of 2460
Finished with article Jan. 6 Committee, 895 of 2460
Finished with article Trump and DeSantis, 896 of 2460
Finished with article Biden’s Immigration Plan, 897 of 2460
Error at index 897: 'NoneType' object has no attribute 'text'
Finished with article Georgia Senate Runoffs, 899 of 2460
Finished with article Impe

Finished with article All Things Healthcare, 1031 of 2460
Finished with article Disrespect in DC, 1032 of 2460
Finished with article Budget Deal Reached, 1033 of 2460
Finished with article Equal Pay in Soccer, 1034 of 2460
Finished with article Iran Seizes British Tanker, 1035 of 2460
Finished with article Raising the Minimum Wage, 1036 of 2460
Error at index 1036: [Errno 22] Invalid argument: './webpage_data/articles/Citizenship Back in the Census?.html'
Finished with article Trump Goes to North Korea, 1038 of 2460
Finished with article Future of the Democratic Party, 1039 of 2460
Finished with article Boris Johnson Becomes UK Prime Minister, 1040 of 2460
Finished with article Supreme Court Rules on Gerrymandering and Census, 1041 of 2460
Error at index 1041: 'NoneType' object has no attribute 'text'
Finished with article Portland, 1043 of 2460
Finished with article Inflation, 1044 of 2460
Finished with article Immigration, 1045 of 2460
Error at index 1045: 'NoneType' object has no at

Error at index 1174: 'NoneType' object has no attribute 'text'
Error at index 1175: 'NoneType' object has no attribute 'text'
Finished with article Billionaires in Space, 1177 of 2460
Finished with article Senate Filibuster, 1178 of 2460
Finished with article Social Media and Misinformation, 1179 of 2460
Finished with article Tech CEOs, 1180 of 2460
Finished with article Jeff Sessions, 1181 of 2460
Finished with article Boris Johnson, 1182 of 2460
Finished with article Manchin-Schumer Deal, 1183 of 2460
Finished with article Russia-Taliban Bounties, 1184 of 2460
Error at index 1184: 'NoneType' object has no attribute 'text'
Finished with article Biden and Climate, 1186 of 2460
Finished with article Biden’s Approval Rating, 1187 of 2460
Finished with article Pelosi and Taiwan, 1188 of 2460
Error at index 1188: 'NoneType' object has no attribute 'text'
Finished with article Chinese Consulate Closed, 1190 of 2460
Error at index 1190: 'NoneType' object has no attribute 'text'
Finished with

Error at index 1318: 'NoneType' object has no attribute 'text'
Finished with article Supreme Court Rules on DACA, 1320 of 2460
Error at index 1320: 'NoneType' object has no attribute 'text'
Finished with article IRS Leak, 1322 of 2460
Error at index 1322: 'NoneType' object has no attribute 'text'
Error at index 1323: 'NoneType' object has no attribute 'text'
Error at index 1324: 'NoneType' object has no attribute 'text'
Error at index 1325: 'NoneType' object has no attribute 'text'
Finished with article Fauci’s Emails, 1327 of 2460
Finished with article Biden’s Budget, 1328 of 2460
Finished with article Juneteenth, 1329 of 2460
Error at index 1329: 'NoneType' object has no attribute 'text'
Error at index 1330: 'NoneType' object has no attribute 'text'
Error at index 1331: 'NoneType' object has no attribute 'text'
Error at index 1332: 'NoneType' object has no attribute 'text'
Finished with article Statues, 1334 of 2460
Finished with article Critical Race Theory, 1335 of 2460
Error at in

Finished with article South Carolina Primary, 1459 of 2460
Finished with article Life After Coronavirus, 1460 of 2460
Error at index 1460: 'NoneType' object has no attribute 'text'
Error at index 1461: 'NoneType' object has no attribute 'text'
Finished with article Dr. Seuss, 1463 of 2460
Error at index 1463: 'NoneType' object has no attribute 'text'
Finished with article Senate Filibuster, 1465 of 2460
Finished with article Gas Prices, 1466 of 2460
Error at index 1466: 'NoneType' object has no attribute 'text'
Finished with article Border Surge, 1468 of 2460
Finished with article Background Checks, 1469 of 2460
Finished with article DC Statehood, 1470 of 2460
Finished with article Biden’s Budget, 1471 of 2460
Error at index 1471: 'NoneType' object has no attribute 'text'
Finished with article Border Surge, 1473 of 2460
Finished with article Hunter Biden, 1474 of 2460
Error at index 1474: 'NoneType' object has no attribute 'text'
Error at index 1475: 'NoneType' object has no attribute 

Finished with article HEROES Act, 1601 of 2460
Error at index 1601: 'NoneType' object has no attribute 'text'
Error at index 1602: 'NoneType' object has no attribute 'text'
Error at index 1603: 'NoneType' object has no attribute 'text'
Error at index 1604: 'NoneType' object has no attribute 'text'
Error at index 1605: 'NoneType' object has no attribute 'text'
Finished with article Campaign Update, 1607 of 2460
Finished with article Michael Flynn, 1608 of 2460
Finished with article Gaza Ceasefire, 1609 of 2460
Finished with article Elon Musk, 1610 of 2460
Finished with article States Reopening, 1611 of 2460
Error at index 1611: 'NoneType' object has no attribute 'text'
Finished with article Primary Elections, 1613 of 2460
Finished with article Primary Elections, 1614 of 2460
Error at index 1614: 'NoneType' object has no attribute 'text'
Error at index 1615: 'NoneType' object has no attribute 'text'
Finished with article Lockdown Lawsuits, 1617 of 2460
Error at index 1617: 'NoneType' obj

Finished with article Midterm Elections, 1747 of 2460
Finished with article Sessions Resigns, 1748 of 2460
Finished with article Iran Sanctions Reinstated, 1749 of 2460
Finished with article GM Cuts Jobs, 1750 of 2460
Finished with article First Step Act, 1751 of 2460
Error at index 1751: 'NoneType' object has no attribute 'text'
Error at index 1752: 'NoneType' object has no attribute 'text'
Finished with article California Wildfires, 1754 of 2460
Finished with article Democratic Debate, 1755 of 2460
Finished with article Israeli Settlements, 1756 of 2460
Finished with article Yovanovitch Testifies, 1757 of 2460
Finished with article Election Results, 1758 of 2460
Finished with article Caravan Arrives at Border, 1759 of 2460
Finished with article Russia-Ukraine Conflict Escalates, 1760 of 2460
Finished with article Bolsonaro Wins in Brazil, 1761 of 2460
Finished with article Pete Buttigieg Rising, 1762 of 2460
Finished with article  Impeachment Hearings Continue, 1763 of 2460
Error at 

Finished with article Biden’s Foreign Policy Team, 1884 of 2460
Finished with article OSHA Vaccine Rule, 1885 of 2460
Finished with article UN Climate Summit, 1886 of 2460
Error at index 1886: 'NoneType' object has no attribute 'text'
Finished with article Defund The Police, 1888 of 2460
Error at index 1888: 'NoneType' object has no attribute 'text'
Error at index 1889: 'NoneType' object has no attribute 'find'
Finished with article Virginia Governor’s Election, 1891 of 2460
Finished with article Virginia Governor’s Race, 1892 of 2460
Finished with article UN Climate Summit, 1893 of 2460
Finished with article Biden Projected To Win Presidency, 1894 of 2460
Error at index 1894: 'NoneType' object has no attribute 'text'
Finished with article Supreme Court Gun Case, 1896 of 2460
Finished with article Supreme Court Blocks NY Religious Restrictions, 1897 of 2460
Finished with article Midterm Watch, 1898 of 2460
Finished with article Midterms Analysis, 1899 of 2460
Finished with article Cali

Finished with article Pipe Bombs Sent to High-Profile Democrats, 2022 of 2460
Finished with article US to Deploy Troops at the Border, 2023 of 2460
Finished with article Brexit Delayed Until After British Vote, 2024 of 2460
Finished with article Chicago Teachers’ Strike and Warren’s Education Plan, 2025 of 2460
Finished with article  Taiwan, 2026 of 2460
Finished with article Inflation, 2027 of 2460
Error at index 2027: 'NoneType' object has no attribute 'text'
Finished with article Inflation, 2029 of 2460
Error at index 2029: 'NoneType' object has no attribute 'text'
Finished with article Dave Chappelle, 2031 of 2460
Finished with article Debt Ceiling, 2032 of 2460
Finished with article Pandora Papers, 2033 of 2460
Error at index 2033: 'NoneType' object has no attribute 'text'
Error at index 2034: 'NoneType' object has no attribute 'text'
Error at index 2035: 'NoneType' object has no attribute 'text'
Error at index 2036: 'NoneType' object has no attribute 'text'
Finished with article 

Finished with article Dave Chappelle, 2163 of 2460
Finished with article Debt Ceiling, 2164 of 2460
Finished with article Pandora Papers, 2165 of 2460
Error at index 2165: 'NoneType' object has no attribute 'text'
Error at index 2166: 'NoneType' object has no attribute 'text'
Error at index 2167: 'NoneType' object has no attribute 'text'
Error at index 2168: 'NoneType' object has no attribute 'text'
Finished with article Inflation, 2170 of 2460
Finished with article Kyrsten Sinema, 2171 of 2460
Finished with article Billionaire Tax, 2172 of 2460
Finished with article Vaccine Mandates, 2173 of 2460
Error at index 2173: 'NoneType' object has no attribute 'text'
Finished with article Biden’s Town Hall, 2175 of 2460
Finished with article China Tests Missile, 2176 of 2460
Finished with article Biden’s New Framework, 2177 of 2460
Finished with article Supreme Court Commission, 2178 of 2460
Finished with article The Facebook Papers, 2179 of 2460
Finished with article Supreme Court Term, 2180 

Finished with article Tech Still Under Fire, 2304 of 2460
Finished with article Kavanaugh’s Senate Hearing Begins, 2305 of 2460
Finished with article New Allegations Against Kavanaugh, 2306 of 2460
Finished with article NYT’s Report About Rosenstein, 2307 of 2460
Finished with article North Korea’s Military Parade, 2308 of 2460
Error at index 2308: [Errno 2] No such file or directory: './webpage_data/articles/Trump Targets ICC/PLO.html'
Finished with article FDA to Ban Flavored E-cigarettes, 2310 of 2460
Finished with article Climate Strike and UN Climate Summit, 2311 of 2460
Error at index 2311: 'NoneType' object has no attribute 'text'
Finished with article Proposal for New Immigration Rule, 2313 of 2460
Finished with article New Tariffs on Chinese Imports, 2314 of 2460
Finished with article UK Supreme Court Rules Against Boris Johnson, 2315 of 2460
Finished with article Midterms in Light of Kavanaugh, 2316 of 2460
Finished with article Obama Back on Campaign Trail, 2317 of 2460
Fini

Finished with article Chile’s Constitution, 2443 of 2460
Finished with article Liz Truss, 2444 of 2460
Error at index 2444: 'NoneType' object has no attribute 'text'
Finished with article Bahrain-Israel Deal, 2446 of 2460
Finished with article UK Economic Plan, 2447 of 2460
Error at index 2447: 'NoneType' object has no attribute 'text'
Error at index 2448: 'NoneType' object has no attribute 'text'
Error at index 2449: 'NoneType' object has no attribute 'text'
Error at index 2450: 'NoneType' object has no attribute 'text'
Finished with article Mail-in Voting, 2452 of 2460
Error at index 2452: 'NoneType' object has no attribute 'text'
Finished with article Amy Coney Barrett, 2454 of 2460
Finished with article The Latino Vote, 2455 of 2460
Error at index 2455: 'NoneType' object has no attribute 'text'
Finished with article  Woodward’s Trump Interviews, 2457 of 2460
Error at index 2457: [Errno 22] Invalid argument: './webpage_data/articles/Is the Pandemic Over?.html'
Error at index 2458: '

In [202]:
# Save parsed data
data = pd.read_csv(ART_INFO_OUTPUT)

In [210]:
# Aggregate data into single rows due to saving format issues
cond_data = {col: [] for col in columns}
for i, group in data.groupby('title'):
    news = group['news_art'].values
    cond_data['news_art'].append(news)
    
    for col in group.columns:
        if col != 'news_art':
            cond_data[col].append(group[col].values[0])

In [217]:
cond_data = pd.DataFrame(cond_data)
cond_data.to_csv('./webpage_data/clean_art_data.csv', index = None)

# Filling in Date Gaps 

After the first round of scraping, some dates were not present in any search results. Searches are performed by specific date as an attempt to fill gaps 

In [46]:
def format_date_search(date) -> str:
    '''
        Format date string for search mimicking FlipSide format
        
        Parameters:
            -date: datetime
                datetime object for date to search
            
        Return:
            Reformatted date string in the form Mon DD, YYYY
    '''
    
    date_str = date.strftime('%b %d, %Y')
    return date_str

In [9]:
MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

In [5]:
cond_data = pd.read_csv('../../data/webpage_data/clean_art_data.csv')

In [7]:
# Infer and convert date strings to datetime
cond_data['date'] = pd.to_datetime(cond_data['date'])

In [13]:
dates_scraped = cond_data.sort_values('date', ascending = True)['date'].values

In [14]:
# Create a list of all dates expected to check for missing queries
all_dates = pd.date_range(dates_scraped[0], dates_scraped[-1], freq = 'd')

In [26]:
missing_dates = [date for date in all_dates if date not in dates_scraped]

In [52]:
# Rescrape for missing dates using the search page
links = {'query': [], 'link': []}
for date in missing_dates:
    search_date = format_date_search(date)
    
    new_links = search_page(search_date, SEARCH_DATA_DIR)
    links['query'] += [search_date for _ in new_links]
    links['link']  += new_links
    
    print(f'Finished Scraping {search_date} and got {len(new_links)} links')

Finished Scraping Sep 07, 2018 and got 60 links
Finished Scraping Sep 08, 2018 and got 60 links
Finished Scraping Sep 09, 2018 and got 60 links
Finished Scraping Sep 15, 2018 and got 60 links
Finished Scraping Sep 16, 2018 and got 60 links
Finished Scraping Sep 20, 2018 and got 60 links
Finished Scraping Sep 22, 2018 and got 60 links
Finished Scraping Sep 23, 2018 and got 60 links
Finished Scraping Sep 29, 2018 and got 60 links
Finished Scraping Sep 30, 2018 and got 60 links
Finished Scraping Oct 02, 2018 and got 60 links
Finished Scraping Oct 06, 2018 and got 60 links
Finished Scraping Oct 07, 2018 and got 60 links
Finished Scraping Oct 09, 2018 and got 60 links
Finished Scraping Oct 10, 2018 and got 60 links
Finished Scraping Oct 11, 2018 and got 60 links
Finished Scraping Oct 12, 2018 and got 60 links
Finished Scraping Oct 13, 2018 and got 60 links
Finished Scraping Oct 14, 2018 and got 60 links
Finished Scraping Oct 20, 2018 and got 60 links
Finished Scraping Oct 21, 2018 and got 6

Finished Scraping Sep 21, 2019 and got 60 links
Finished Scraping Sep 22, 2019 and got 60 links
Finished Scraping Sep 28, 2019 and got 60 links
Finished Scraping Sep 29, 2019 and got 60 links
Finished Scraping Oct 01, 2019 and got 60 links
Finished Scraping Oct 05, 2019 and got 60 links
Finished Scraping Oct 06, 2019 and got 60 links
Finished Scraping Oct 08, 2019 and got 60 links
Finished Scraping Oct 09, 2019 and got 60 links
Finished Scraping Oct 10, 2019 and got 60 links
Finished Scraping Oct 11, 2019 and got 60 links
Finished Scraping Oct 12, 2019 and got 60 links
Finished Scraping Oct 13, 2019 and got 60 links
Finished Scraping Oct 14, 2019 and got 60 links
Finished Scraping Oct 15, 2019 and got 60 links
Finished Scraping Oct 17, 2019 and got 60 links
Finished Scraping Oct 18, 2019 and got 60 links
Finished Scraping Oct 19, 2019 and got 60 links
Finished Scraping Oct 20, 2019 and got 60 links
Finished Scraping Oct 24, 2019 and got 60 links
Finished Scraping Oct 25, 2019 and got 6

Finished Scraping Jul 11, 2020 and got 60 links
Finished Scraping Jul 12, 2020 and got 60 links
Finished Scraping Jul 17, 2020 and got 60 links
Finished Scraping Jul 18, 2020 and got 60 links
Finished Scraping Jul 19, 2020 and got 60 links
Finished Scraping Jul 22, 2020 and got 60 links
Finished Scraping Jul 23, 2020 and got 60 links
Finished Scraping Jul 25, 2020 and got 60 links
Finished Scraping Jul 26, 2020 and got 60 links
Finished Scraping Jul 27, 2020 and got 60 links
Finished Scraping Jul 31, 2020 and got 60 links
Finished Scraping Aug 01, 2020 and got 60 links
Finished Scraping Aug 02, 2020 and got 60 links
Finished Scraping Aug 05, 2020 and got 60 links
Finished Scraping Aug 07, 2020 and got 60 links
Finished Scraping Aug 08, 2020 and got 60 links
Finished Scraping Aug 09, 2020 and got 60 links
Finished Scraping Aug 11, 2020 and got 60 links
Finished Scraping Aug 14, 2020 and got 60 links
Finished Scraping Aug 15, 2020 and got 60 links
Finished Scraping Aug 16, 2020 and got 6

Finished Scraping Apr 27, 2021 and got 60 links
Finished Scraping Apr 30, 2021 and got 60 links
Finished Scraping May 01, 2021 and got 60 links
Finished Scraping May 02, 2021 and got 60 links
Finished Scraping May 05, 2021 and got 60 links
Finished Scraping May 08, 2021 and got 60 links
Finished Scraping May 09, 2021 and got 60 links
Finished Scraping May 10, 2021 and got 60 links
Finished Scraping May 11, 2021 and got 60 links
Finished Scraping May 13, 2021 and got 60 links
Finished Scraping May 14, 2021 and got 60 links
Finished Scraping May 15, 2021 and got 60 links
Finished Scraping May 16, 2021 and got 60 links
Finished Scraping May 18, 2021 and got 60 links
Finished Scraping May 20, 2021 and got 60 links
Finished Scraping May 22, 2021 and got 60 links
Finished Scraping May 23, 2021 and got 60 links
Finished Scraping May 24, 2021 and got 60 links
Finished Scraping May 25, 2021 and got 60 links
Finished Scraping May 27, 2021 and got 60 links
Finished Scraping May 29, 2021 and got 6

Finished Scraping Jan 10, 2022 and got 60 links
Finished Scraping Jan 11, 2022 and got 60 links
Finished Scraping Jan 12, 2022 and got 60 links
Finished Scraping Jan 15, 2022 and got 60 links
Finished Scraping Jan 16, 2022 and got 60 links
Finished Scraping Jan 17, 2022 and got 60 links
Finished Scraping Jan 21, 2022 and got 60 links
Finished Scraping Jan 22, 2022 and got 60 links
Finished Scraping Jan 23, 2022 and got 60 links
Finished Scraping Jan 29, 2022 and got 60 links
Finished Scraping Jan 30, 2022 and got 60 links
Finished Scraping Feb 02, 2022 and got 60 links
Finished Scraping Feb 03, 2022 and got 60 links
Finished Scraping Feb 04, 2022 and got 60 links
Finished Scraping Feb 05, 2022 and got 60 links
Finished Scraping Feb 06, 2022 and got 60 links
Finished Scraping Feb 07, 2022 and got 60 links
Finished Scraping Feb 12, 2022 and got 60 links
Finished Scraping Feb 13, 2022 and got 60 links
Finished Scraping Feb 14, 2022 and got 60 links
Finished Scraping Feb 15, 2022 and got 6

Finished Scraping Sep 20, 2022 and got 60 links
Finished Scraping Sep 21, 2022 and got 60 links
Finished Scraping Sep 22, 2022 and got 60 links
Finished Scraping Sep 24, 2022 and got 60 links
Finished Scraping Sep 25, 2022 and got 60 links
Finished Scraping Sep 26, 2022 and got 60 links
Finished Scraping Oct 01, 2022 and got 60 links
Finished Scraping Oct 02, 2022 and got 60 links
Finished Scraping Oct 03, 2022 and got 60 links
Finished Scraping Oct 08, 2022 and got 60 links
Finished Scraping Oct 09, 2022 and got 60 links
Finished Scraping Oct 10, 2022 and got 60 links
Finished Scraping Oct 11, 2022 and got 60 links
Finished Scraping Oct 12, 2022 and got 60 links
Finished Scraping Oct 14, 2022 and got 60 links
Finished Scraping Oct 15, 2022 and got 60 links
Finished Scraping Oct 16, 2022 and got 60 links
Finished Scraping Oct 18, 2022 and got 60 links
Finished Scraping Oct 19, 2022 and got 60 links
Finished Scraping Oct 20, 2022 and got 60 links
Finished Scraping Oct 22, 2022 and got 6

In [186]:
# Create dataframe of missing articles that were collected
gaps = pd.DataFrame(links)
gaps['html_loc'] = SEARCH_DATA_DIR + gaps['query'] + '.html'
fill_files = gaps[['query', 'html_loc']].drop_duplicates()

# Create list of queries still missing
queries_left = set(fill_files['query'].values.tolist())

# Clean the newly collected files
fill_files = fill_files.drop(0, axis = 0)
fill_files['html_loc'] = fill_files['html_loc'].str.replace(',', '')

In [187]:
# Extract links from new searches
new_arts = {'date': [], 'url': []}
for file in fill_files['html_loc'].values:
    with open(file, 'r', encoding ='utf8') as sp:
        html = sp.read()
    bs = BeautifulSoup(html, 'html.parser')
    miss_links = bs.find_all(lambda tag: tag.name == "a" and tag.text[-12:] in queries_left)
    dates = [link.text[-12:] for link in miss_links]
    urls  = [(BASE + link.attrs['href']).replace('//', '/') for link in miss_links]
    
    new_arts['date'] += dates
    new_arts['url'] += urls
    
    # Update the remaining queries to be run
    queries_left = queries_left.difference(set(dates))
    print(f'Reduced to {len(queries_left)} article dates remaining')
    
    if len(queries_left) == 0:
        break

Reduced to 865 article dates remaining
Reduced to 862 article dates remaining
Reduced to 858 article dates remaining
Reduced to 857 article dates remaining
Reduced to 852 article dates remaining
Reduced to 849 article dates remaining
Reduced to 847 article dates remaining
Reduced to 844 article dates remaining
Reduced to 843 article dates remaining
Reduced to 824 article dates remaining
Reduced to 823 article dates remaining
Reduced to 821 article dates remaining
Reduced to 820 article dates remaining
Reduced to 818 article dates remaining
Reduced to 817 article dates remaining
Reduced to 813 article dates remaining
Reduced to 812 article dates remaining
Reduced to 809 article dates remaining
Reduced to 807 article dates remaining
Reduced to 804 article dates remaining
Reduced to 804 article dates remaining
Reduced to 804 article dates remaining
Reduced to 801 article dates remaining
Reduced to 791 article dates remaining
Reduced to 788 article dates remaining
Reduced to 786 article da

Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 562 article dates remaining
Reduced to 561 article dates remaining
Reduced to 561 article dates remaining
Reduced to 561 article dates remaining
Reduced to 560 article dates remaining
Reduced to 559 article dates remaining
Reduced to 559 article dates remaining
Reduced to 559 article dates remaining
Reduced to 559 article dates remaining
Reduced to 558 article da

Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article dates remaining
Reduced to 529 article da

Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article da

Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article dates remaining
Reduced to 526 article da

In [190]:
new_arts_df = pd.DataFrame(new_arts)

In [211]:
# Extract information from rescraped pages
error_is = []
for i, link in enumerate(new_arts_df['url'].values):
    try:       
        driver.get(link)

        src = driver.page_source

        article_info = get_art_info(link, src)

        # Save HTML
        article_fp = (ART_OUTPUT_DIR + article_info['title'] + '.html').replace('?', '')
        with open(article_fp, 'wb') as w:
            w.write(src.encode('utf8'))

        data = pd.DataFrame(article_info, index = [i])
        data.to_csv(ART_INFO_OUTPUT2, index = None, mode = 'a', header = None if i == 0 else True)

        print(f'Finished with article {article_info["title"]}, {i + 1} of {len(new_arts_df)}')
        
    except Exception as e:
        print(f'Error at index {i}:', e)
        error_is.append(i)
        continue

Finished with article Biden’s Approval Rating, 1 of 351
Finished with article Midterm Watch, 2 of 351
Finished with article Mar-a-Lago Special Master, 3 of 351
Finished with article Dems Debate, 4 of 351
Finished with article US Withdrawing From Syria, 5 of 351
Finished with article Anonymous Op-ed in the NYT, 6 of 351
Finished with article Citizenship Back in the Census?, 7 of 351
Finished with article Ukraine, 8 of 351
Finished with article Taiwan, 9 of 351
Finished with article Pennsylvania, 10 of 351
Finished with article Ukraine, 11 of 351
Finished with article Navalny Poisoning Condemned, 12 of 351
Finished with article Queen Elizabeth II, 13 of 351
Finished with article NBA Kowtows to China, 14 of 351
Finished with article Amazon’s HQ2, 15 of 351
Finished with article Who Investigates the Investigators?, 16 of 351
Finished with article Special Edition: Ask Us Anything, 17 of 351
Finished with article Mikhail Gorbachev, 18 of 351
Finished with article General Milley, 19 of 351
Fi

Finished with article John Lewis, 163 of 351
Finished with article Apple Daily, 164 of 351
Finished with article Will Smith, 165 of 351
Finished with article Ukraine, 166 of 351
Finished with article Immigration, 167 of 351
Finished with article Ukraine, 168 of 351
Finished with article Reopening Schools, 169 of 351
Finished with article Democratic Debate, 170 of 351
Finished with article Alexei Navalny, 171 of 351
Finished with article Andrew Cuomo, 172 of 351
Finished with article Democratic Debate, 173 of 351
Finished with article Beijing Olympics, 174 of 351
Finished with article Governors Under Fire, 175 of 351
Finished with article Afghanistan’s Assets, 176 of 351
Finished with article Romney’s Child Payments Plan, 177 of 351
Finished with article San Francisco School Board, 178 of 351
Finished with article Russia Invades Ukraine, 179 of 351
Finished with article State of the Union, 180 of 351
Finished with article Ukraine, 181 of 351
Finished with article Coronavirus, 182 of 351

Finished with article NYC Schools Closing, 315 of 351
Finished with article Trump and the Protests, 316 of 351
Finished with article Violence at the Capitol, 317 of 351
Finished with article Oscars Diversity Requirements, 318 of 351
Finished with article Bolton's Book, 319 of 351
Finished with article State of the Union, 320 of 351
Finished with article Middle East policy, 321 of 351
Finished with article Wisconsin and Michigan, 322 of 351
Finished with article Trump Campaign Update, 323 of 351
Finished with article AUKUS Alliance, 324 of 351
Finished with article Coronavirus Ruins Everything, 325 of 351
Finished with article Federal Reserve Acts, 326 of 351
Finished with article China and Russia, 327 of 351
Finished with article Afghanistan Troop Withdrawal, 328 of 351
Finished with article Amash Considers Presidential Run, 329 of 351
Finished with article COVID-19 in India, 330 of 351
Finished with article Victory Day, 331 of 351
Finished with article Federal Reserve and Jobs Report,

# Final Dataset 

Concatenating all data collected into a single file and cleaning collected meta data

In [258]:
def correct_df(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    '''
        Correct saving errors in a dataframe, moving first row to data and adding correct columns
        
        Parameters:
            -df: pd.DataFrame
                Incorrectly formatted dataframe (header missing)
            -cols: List[str]
                List of columns names in final dataframe
                
        Return:
            Modified DataFrame moving first row into the data, and setting the header columns
    '''
    
    fr = list(df.columns)
    fr = {col:val for col, val in zip(cols, fr)}
    fr_df = pd.DataFrame(fr, columns = cols, index = [0])
    df.columns = cols
    return pd.concat((fr_df, df), axis = 0)

In [256]:
# First pass articles
old_arts = pd.read_csv('../../data/webpage_data/clean_art_data.csv')
# Second pass articles
new_arts = pd.read_csv(ART_INFO_OUTPUT2)

In [257]:
cols = ['article_url', 'date', 'title', 'left_sum', 'right_sum', 'linked_arts']

In [259]:
new_arts = correct_df(new_arts, cols)

In [264]:
old_arts.columns = cols

In [268]:
all_arts = pd.concat((old_arts, new_arts), axis = 0)

In [13]:
# Filter out duplicate news links
all_arts['linked_arts'] = all_arts['linked_arts'].apply(lambda l: list(set(l)))

In [19]:
# Drop any duplicates if they exist
all_arts = all_arts.drop_duplicates(subset = 'date')

In [21]:
all_arts['title'] = all_arts['title'].str.strip()

In [22]:
all_arts.to_csv('../../data/webpage_data/full_flipside_data.csv', index = None)