# Rappler 2022 Scitech Database Maker

In [66]:
import numpy as np
import pandas as pd
import requests
import bs4
import re
import datetime
import time
import sqlite3
import db_caller
from tqdm.notebook import tqdm

## Create + Connect to SQL Database

In [67]:
# Open the project database through the local `db_caller` helper. The resulting
# `conn` is the connection object handed to `DataFrame.to_sql` further down.
# NOTE(review): presumably this creates "LR2.db" when it does not exist yet — confirm in db_caller.
conn = db_caller.connect("LR2.db")

## 2022 Rappler Section/Article Link Data

### Webscrape Section Links

In [2]:
# Download the Rappler home page (the proxy mapping only has an 'http' entry)
proxies = {'http': 'http://206.189.157.23'}
resp = requests.get('http://www.rappler.com', proxies=proxies)
soup = bs4.BeautifulSoup(resp.text)

# Section links: every <li> of the first depth-1 burger-menu panel is turned
# into a (name, href) pair, then the first and last pairs are discarded
depth1_panels = soup.select('nav.burger-nav > div[id^="submenu-panel-"][data-depth="1"]')
topic_li = depth1_panels[0].select('li')
topic_pairs = [(item.text.casefold(), item.select_one('a')['href']) for item in topic_li]
topics = dict(topic_pairs[1:-1])

# Subsection links: one <li> list per depth-2 panel (the first panel is skipped),
# paired positionally with the topics collected above
depth2_panels = soup.select('nav.burger-nav > div[id^="submenu-panel-"][data-depth="2"]')
subtopic_li = [panel.select('li') for panel in depth2_panels][1:]
topic_tree = {}   # topic name -> list of its subtopic names
subtopics = {}    # subtopic name -> subtopic page URL
for topic, links in zip(topics, subtopic_li):
    branch = []
    for link in links:
        label = link.text.casefold()
        # Entries whose text starts with "see all" are not recorded as subtopics
        if re.match(r'^see all', label):
            continue
        subtopics[label] = link.select_one('a')['href']
        branch.append(label)
    topic_tree[topic] = branch

### Webscrape Article Links

In [51]:
# Get articles
# Walks every subtopic listing page by page and appends one row per article
# published in 2022 to `articles`:
# [page topic, page subtopic, title, article topic, link, publish date, retrieval date]
articles = []

for topic in tqdm(topic_tree):
    print(f'On topic page: {topic}')

    for subtopic in tqdm(topic_tree[topic]):
        print(f'    On subtopic page: {subtopic}')

        # Request subtopic page
        subtopic_page = subtopics[subtopic]
        soup = bs4.BeautifulSoup(requests.get(subtopic_page, proxies=proxies).text)

        while True:
            page_articles = soup.select('article')
            # A page without any <article> has nothing left to scrape. Stopping here
            # also guarantees `stop_2022` is assigned before it is read below, instead
            # of raising NameError or reusing the value left by the previous subtopic.
            if not page_articles:
                break

            # Scrape info per article in subtopic page
            for i in page_articles:
                article_topic = i.select_one('div > a').text.strip().casefold()
                article_title = i.select_one('h2 > a').text.strip().casefold()
                article_link = i.select_one('h2 > a')['href']
                article_date = i.select_one('time')['datetime']
                # The format string hard-codes a "+00:00" offset, so the clock must be
                # read in UTC; a naive local `now()` would be mislabelled as UTC.
                retrieval_date = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%S+00:00')
                # Append article info if published in 2022.
                # The flag is overwritten per article, so after the loop it reflects
                # the LAST article on the page.
                if article_date[:4] == '2022':
                    stop_2022 = False
                    articles.append([topic, subtopic, article_title, article_topic, article_link, article_date, retrieval_date])
                else:
                    stop_2022 = True

            # If articles are no longer published in 2022, move to next subtopic
            if stop_2022:
                break
            # If there is still a next page, move to next page
            elif soup.select_one('div.pagination'):
                next_page = soup.select_one('div.pagination a')['href']
                soup = bs4.BeautifulSoup(requests.get(next_page, proxies=proxies).text)
                # Random 1-9 second pause between page requests
                time.sleep(np.random.choice(np.arange(1,10)))
                print(f'        {next_page}')
            else:
                break

  0%|          | 0/14 [00:00<?, ?it/s]

On topic page: video


  0%|          | 0/12 [00:00<?, ?it/s]

    On subtopic page: hold the line
    On subtopic page: video editorial
        https://www.rappler.com/section/video/editorial/page/2/
        https://www.rappler.com/section/video/editorial/page/3/
        https://www.rappler.com/section/video/editorial/page/4/
    On subtopic page: act one
    On subtopic page: basagan ng trip
    On subtopic page: daily wrap
        https://www.rappler.com/section/video/daily-wrap/page/2/
        https://www.rappler.com/section/video/daily-wrap/page/3/
        https://www.rappler.com/section/video/daily-wrap/page/4/
        https://www.rappler.com/section/video/daily-wrap/page/5/
        https://www.rappler.com/section/video/daily-wrap/page/6/
        https://www.rappler.com/section/video/daily-wrap/page/7/
        https://www.rappler.com/section/video/daily-wrap/page/8/
        https://www.rappler.com/section/video/daily-wrap/page/9/
        https://www.rappler.com/section/video/daily-wrap/page/10/
        https://www.rappler.com/section/video/d

  0%|          | 0/14 [00:00<?, ?it/s]

    On subtopic page: communicart
        https://www.rappler.com/section/brandrap/communicart/page/2/
    On subtopic page: get ready with me
    On subtopic page: announcements
        https://www.rappler.com/section/brandrap/announcements/page/2/
        https://www.rappler.com/section/brandrap/announcements/page/3/
        https://www.rappler.com/section/brandrap/announcements/page/4/
        https://www.rappler.com/section/brandrap/announcements/page/5/
        https://www.rappler.com/section/brandrap/announcements/page/6/
        https://www.rappler.com/section/brandrap/announcements/page/7/
        https://www.rappler.com/section/brandrap/announcements/page/8/
        https://www.rappler.com/section/brandrap/announcements/page/9/
        https://www.rappler.com/section/brandrap/announcements/page/10/
        https://www.rappler.com/section/brandrap/announcements/page/11/
        https://www.rappler.com/section/brandrap/announcements/page/12/
    On subtopic page: beyond the buy


  0%|          | 0/10 [00:00<?, ?it/s]

    On subtopic page: philippine elections
        https://www.rappler.com/section/nation/elections/page/2/
        https://www.rappler.com/section/nation/elections/page/3/
        https://www.rappler.com/section/nation/elections/page/4/
        https://www.rappler.com/section/nation/elections/page/5/
        https://www.rappler.com/section/nation/elections/page/6/
        https://www.rappler.com/section/nation/elections/page/7/
        https://www.rappler.com/section/nation/elections/page/8/
        https://www.rappler.com/section/nation/elections/page/9/
        https://www.rappler.com/section/nation/elections/page/10/
        https://www.rappler.com/section/nation/elections/page/11/
        https://www.rappler.com/section/nation/elections/page/12/
        https://www.rappler.com/section/nation/elections/page/13/
        https://www.rappler.com/section/nation/elections/page/14/
        https://www.rappler.com/section/nation/elections/page/15/
        https://www.rappler.com/section/n

        https://www.rappler.com/section/nation/elections/page/126/
        https://www.rappler.com/section/nation/elections/page/127/
        https://www.rappler.com/section/nation/elections/page/128/
        https://www.rappler.com/section/nation/elections/page/129/
        https://www.rappler.com/section/nation/elections/page/130/
        https://www.rappler.com/section/nation/elections/page/131/
        https://www.rappler.com/section/nation/elections/page/132/
        https://www.rappler.com/section/nation/elections/page/133/
        https://www.rappler.com/section/nation/elections/page/134/
        https://www.rappler.com/section/nation/elections/page/135/
        https://www.rappler.com/section/nation/elections/page/136/
        https://www.rappler.com/section/nation/elections/page/137/
        https://www.rappler.com/section/nation/elections/page/138/
        https://www.rappler.com/section/nation/elections/page/139/
        https://www.rappler.com/section/nation/elections/page/

        https://www.rappler.com/section/nation/mindanao/page/22/
        https://www.rappler.com/section/nation/mindanao/page/23/
        https://www.rappler.com/section/nation/mindanao/page/24/
        https://www.rappler.com/section/nation/mindanao/page/25/
        https://www.rappler.com/section/nation/mindanao/page/26/
        https://www.rappler.com/section/nation/mindanao/page/27/
        https://www.rappler.com/section/nation/mindanao/page/28/
        https://www.rappler.com/section/nation/mindanao/page/29/
        https://www.rappler.com/section/nation/mindanao/page/30/
        https://www.rappler.com/section/nation/mindanao/page/31/
        https://www.rappler.com/section/nation/mindanao/page/32/
        https://www.rappler.com/section/nation/mindanao/page/33/
        https://www.rappler.com/section/nation/mindanao/page/34/
        https://www.rappler.com/section/nation/mindanao/page/35/
        https://www.rappler.com/section/nation/mindanao/page/36/
        https://www.rappl

        https://www.rappler.com/section/nation/national-news/page/71/
        https://www.rappler.com/section/nation/national-news/page/72/
        https://www.rappler.com/section/nation/national-news/page/73/
        https://www.rappler.com/section/nation/national-news/page/74/
        https://www.rappler.com/section/nation/national-news/page/75/
        https://www.rappler.com/section/nation/national-news/page/76/
        https://www.rappler.com/section/nation/national-news/page/77/
        https://www.rappler.com/section/nation/national-news/page/78/
        https://www.rappler.com/section/nation/national-news/page/79/
        https://www.rappler.com/section/nation/national-news/page/80/
        https://www.rappler.com/section/nation/national-news/page/81/
        https://www.rappler.com/section/nation/national-news/page/82/
        https://www.rappler.com/section/nation/national-news/page/83/
        https://www.rappler.com/section/nation/national-news/page/84/
        https://www.

        https://www.rappler.com/section/nation/visayas/page/16/
        https://www.rappler.com/section/nation/visayas/page/17/
        https://www.rappler.com/section/nation/visayas/page/18/
        https://www.rappler.com/section/nation/visayas/page/19/
        https://www.rappler.com/section/nation/visayas/page/20/
        https://www.rappler.com/section/nation/visayas/page/21/
        https://www.rappler.com/section/nation/visayas/page/22/
        https://www.rappler.com/section/nation/visayas/page/23/
        https://www.rappler.com/section/nation/visayas/page/24/
        https://www.rappler.com/section/nation/visayas/page/25/
        https://www.rappler.com/section/nation/visayas/page/26/
        https://www.rappler.com/section/nation/visayas/page/27/
        https://www.rappler.com/section/nation/visayas/page/28/
        https://www.rappler.com/section/nation/visayas/page/29/
        https://www.rappler.com/section/nation/visayas/page/30/
        https://www.rappler.com/section/

  0%|          | 0/8 [00:00<?, ?it/s]

    On subtopic page: data and documents
        https://www.rappler.com/section/newsbreak/data-documents/page/2/
    On subtopic page: explainers
        https://www.rappler.com/section/newsbreak/explainers/page/2/
        https://www.rappler.com/section/newsbreak/explainers/page/3/
        https://www.rappler.com/section/newsbreak/explainers/page/4/
        https://www.rappler.com/section/newsbreak/explainers/page/5/
    On subtopic page: fact check
        https://www.rappler.com/section/newsbreak/fact-check/page/2/
        https://www.rappler.com/section/newsbreak/fact-check/page/3/
        https://www.rappler.com/section/newsbreak/fact-check/page/4/
        https://www.rappler.com/section/newsbreak/fact-check/page/5/
        https://www.rappler.com/section/newsbreak/fact-check/page/6/
        https://www.rappler.com/section/newsbreak/fact-check/page/7/
        https://www.rappler.com/section/newsbreak/fact-check/page/8/
        https://www.rappler.com/section/newsbreak/fact-check/

        https://www.rappler.com/section/newsbreak/iq/page/15/
        https://www.rappler.com/section/newsbreak/iq/page/16/
        https://www.rappler.com/section/newsbreak/iq/page/17/
        https://www.rappler.com/section/newsbreak/iq/page/18/
        https://www.rappler.com/section/newsbreak/iq/page/19/
On topic page: sports


  0%|          | 0/9 [00:00<?, ?it/s]

    On subtopic page: boxing
        https://www.rappler.com/section/sports/boxing/page/2/
        https://www.rappler.com/section/sports/boxing/page/3/
        https://www.rappler.com/section/sports/boxing/page/4/
        https://www.rappler.com/section/sports/boxing/page/5/
        https://www.rappler.com/section/sports/boxing/page/6/
        https://www.rappler.com/section/sports/boxing/page/7/
        https://www.rappler.com/section/sports/boxing/page/8/
        https://www.rappler.com/section/sports/boxing/page/9/
        https://www.rappler.com/section/sports/boxing/page/10/
    On subtopic page: fiba
        https://www.rappler.com/section/sports/fiba/page/2/
        https://www.rappler.com/section/sports/fiba/page/3/
        https://www.rappler.com/section/sports/fiba/page/4/
    On subtopic page: football
        https://www.rappler.com/section/sports/football/page/2/
        https://www.rappler.com/section/sports/football/page/3/
        https://www.rappler.com/section/sports

        https://www.rappler.com/section/sports/nba/page/62/
        https://www.rappler.com/section/sports/nba/page/63/
        https://www.rappler.com/section/sports/nba/page/64/
        https://www.rappler.com/section/sports/nba/page/65/
        https://www.rappler.com/section/sports/nba/page/66/
        https://www.rappler.com/section/sports/nba/page/67/
        https://www.rappler.com/section/sports/nba/page/68/
        https://www.rappler.com/section/sports/nba/page/69/
        https://www.rappler.com/section/sports/nba/page/70/
        https://www.rappler.com/section/sports/nba/page/71/
        https://www.rappler.com/section/sports/nba/page/72/
        https://www.rappler.com/section/sports/nba/page/73/
        https://www.rappler.com/section/sports/nba/page/74/
        https://www.rappler.com/section/sports/nba/page/75/
        https://www.rappler.com/section/sports/nba/page/76/
        https://www.rappler.com/section/sports/nba/page/77/
        https://www.rappler.com/section/

        https://www.rappler.com/section/sports/uaap/page/42/
        https://www.rappler.com/section/sports/uaap/page/43/
        https://www.rappler.com/section/sports/uaap/page/44/
        https://www.rappler.com/section/sports/uaap/page/45/
        https://www.rappler.com/section/sports/uaap/page/46/
        https://www.rappler.com/section/sports/uaap/page/47/
    On subtopic page: volleyball
        https://www.rappler.com/section/sports/volleyball/page/2/
        https://www.rappler.com/section/sports/volleyball/page/3/
        https://www.rappler.com/section/sports/volleyball/page/4/
        https://www.rappler.com/section/sports/volleyball/page/5/
        https://www.rappler.com/section/sports/volleyball/page/6/
        https://www.rappler.com/section/sports/volleyball/page/7/
        https://www.rappler.com/section/sports/volleyball/page/8/
        https://www.rappler.com/section/sports/volleyball/page/9/
        https://www.rappler.com/section/sports/volleyball/page/10/
      

  0%|          | 0/9 [00:00<?, ?it/s]

    On subtopic page: africa
        https://www.rappler.com/section/world/africa/page/2/
        https://www.rappler.com/section/world/africa/page/3/
        https://www.rappler.com/section/world/africa/page/4/
        https://www.rappler.com/section/world/africa/page/5/
        https://www.rappler.com/section/world/africa/page/6/
        https://www.rappler.com/section/world/africa/page/7/
    On subtopic page: asia pacific
        https://www.rappler.com/section/world/asia-pacific/page/2/
        https://www.rappler.com/section/world/asia-pacific/page/3/
        https://www.rappler.com/section/world/asia-pacific/page/4/
        https://www.rappler.com/section/world/asia-pacific/page/5/
        https://www.rappler.com/section/world/asia-pacific/page/6/
        https://www.rappler.com/section/world/asia-pacific/page/7/
        https://www.rappler.com/section/world/asia-pacific/page/8/
        https://www.rappler.com/section/world/asia-pacific/page/9/
        https://www.rappler.com/se

        https://www.rappler.com/section/world/asia-pacific/page/117/
        https://www.rappler.com/section/world/asia-pacific/page/118/
        https://www.rappler.com/section/world/asia-pacific/page/119/
        https://www.rappler.com/section/world/asia-pacific/page/120/
        https://www.rappler.com/section/world/asia-pacific/page/121/
        https://www.rappler.com/section/world/asia-pacific/page/122/
    On subtopic page: bahasa indonesia
    On subtopic page: europe
        https://www.rappler.com/section/world/europe/page/2/
        https://www.rappler.com/section/world/europe/page/3/
        https://www.rappler.com/section/world/europe/page/4/
        https://www.rappler.com/section/world/europe/page/5/
        https://www.rappler.com/section/world/europe/page/6/
        https://www.rappler.com/section/world/europe/page/7/
        https://www.rappler.com/section/world/europe/page/8/
        https://www.rappler.com/section/world/europe/page/9/
        https://www.rappler.co

        https://www.rappler.com/section/world/europe/page/127/
        https://www.rappler.com/section/world/europe/page/128/
        https://www.rappler.com/section/world/europe/page/129/
    On subtopic page: global affairs
        https://www.rappler.com/section/world/global-affairs/page/2/
        https://www.rappler.com/section/world/global-affairs/page/3/
        https://www.rappler.com/section/world/global-affairs/page/4/
        https://www.rappler.com/section/world/global-affairs/page/5/
        https://www.rappler.com/section/world/global-affairs/page/6/
        https://www.rappler.com/section/world/global-affairs/page/7/
        https://www.rappler.com/section/world/global-affairs/page/8/
        https://www.rappler.com/section/world/global-affairs/page/9/
        https://www.rappler.com/section/world/global-affairs/page/10/
        https://www.rappler.com/section/world/global-affairs/page/11/
        https://www.rappler.com/section/world/global-affairs/page/12/
        http

        https://www.rappler.com/section/world/us-canada/page/29/
        https://www.rappler.com/section/world/us-canada/page/30/
        https://www.rappler.com/section/world/us-canada/page/31/
        https://www.rappler.com/section/world/us-canada/page/32/
        https://www.rappler.com/section/world/us-canada/page/33/
        https://www.rappler.com/section/world/us-canada/page/34/
        https://www.rappler.com/section/world/us-canada/page/35/
        https://www.rappler.com/section/world/us-canada/page/36/
        https://www.rappler.com/section/world/us-canada/page/37/
        https://www.rappler.com/section/world/us-canada/page/38/
        https://www.rappler.com/section/world/us-canada/page/39/
        https://www.rappler.com/section/world/us-canada/page/40/
        https://www.rappler.com/section/world/us-canada/page/41/
        https://www.rappler.com/section/world/us-canada/page/42/
        https://www.rappler.com/section/world/us-canada/page/43/
        https://www.rappl

  0%|          | 0/7 [00:00<?, ?it/s]

    On subtopic page: new school
        https://www.rappler.com/section/voices/new-school/page/2/
        https://www.rappler.com/section/voices/new-school/page/3/
        https://www.rappler.com/section/voices/new-school/page/4/
        https://www.rappler.com/section/voices/new-school/page/5/
        https://www.rappler.com/section/voices/new-school/page/6/
    On subtopic page: editorials
        https://www.rappler.com/section/voices/editorials/page/2/
        https://www.rappler.com/section/voices/editorials/page/3/
        https://www.rappler.com/section/voices/editorials/page/4/
        https://www.rappler.com/section/voices/editorials/page/5/
        https://www.rappler.com/section/voices/editorials/page/6/
        https://www.rappler.com/section/voices/editorials/page/7/
        https://www.rappler.com/section/voices/editorials/page/8/
        https://www.rappler.com/section/voices/editorials/page/9/
    On subtopic page: imho
        https://www.rappler.com/section/voices/im

  0%|          | 0/7 [00:00<?, ?it/s]

    On subtopic page: celebrities
        https://www.rappler.com/section/entertainment/celebrities/page/2/
        https://www.rappler.com/section/entertainment/celebrities/page/3/
        https://www.rappler.com/section/entertainment/celebrities/page/4/
        https://www.rappler.com/section/entertainment/celebrities/page/5/
        https://www.rappler.com/section/entertainment/celebrities/page/6/
        https://www.rappler.com/section/entertainment/celebrities/page/7/
        https://www.rappler.com/section/entertainment/celebrities/page/8/
        https://www.rappler.com/section/entertainment/celebrities/page/9/
        https://www.rappler.com/section/entertainment/celebrities/page/10/
        https://www.rappler.com/section/entertainment/celebrities/page/11/
        https://www.rappler.com/section/entertainment/celebrities/page/12/
        https://www.rappler.com/section/entertainment/celebrities/page/13/
        https://www.rappler.com/section/entertainment/celebrities/page/14/

        https://www.rappler.com/section/entertainment/movies/page/16/
        https://www.rappler.com/section/entertainment/movies/page/17/
        https://www.rappler.com/section/entertainment/movies/page/18/
        https://www.rappler.com/section/entertainment/movies/page/19/
        https://www.rappler.com/section/entertainment/movies/page/20/
        https://www.rappler.com/section/entertainment/movies/page/21/
        https://www.rappler.com/section/entertainment/movies/page/22/
        https://www.rappler.com/section/entertainment/movies/page/23/
        https://www.rappler.com/section/entertainment/movies/page/24/
        https://www.rappler.com/section/entertainment/movies/page/25/
        https://www.rappler.com/section/entertainment/movies/page/26/
        https://www.rappler.com/section/entertainment/movies/page/27/
        https://www.rappler.com/section/entertainment/movies/page/28/
        https://www.rappler.com/section/entertainment/movies/page/29/
        https://www.

        https://www.rappler.com/section/entertainment/series/page/8/
        https://www.rappler.com/section/entertainment/series/page/9/
        https://www.rappler.com/section/entertainment/series/page/10/
        https://www.rappler.com/section/entertainment/series/page/11/
        https://www.rappler.com/section/entertainment/series/page/12/
        https://www.rappler.com/section/entertainment/series/page/13/
        https://www.rappler.com/section/entertainment/series/page/14/
        https://www.rappler.com/section/entertainment/series/page/15/
        https://www.rappler.com/section/entertainment/series/page/16/
        https://www.rappler.com/section/entertainment/series/page/17/
        https://www.rappler.com/section/entertainment/series/page/18/
        https://www.rappler.com/section/entertainment/series/page/19/
        https://www.rappler.com/section/entertainment/series/page/20/
        https://www.rappler.com/section/entertainment/series/page/21/
        https://www.ra

  0%|          | 0/7 [00:00<?, ?it/s]

    On subtopic page: consumer issues
        https://www.rappler.com/section/business/consumer-issues/page/2/
        https://www.rappler.com/section/business/consumer-issues/page/3/
        https://www.rappler.com/section/business/consumer-issues/page/4/
        https://www.rappler.com/section/business/consumer-issues/page/5/
        https://www.rappler.com/section/business/consumer-issues/page/6/
        https://www.rappler.com/section/business/consumer-issues/page/7/
        https://www.rappler.com/section/business/consumer-issues/page/8/
        https://www.rappler.com/section/business/consumer-issues/page/9/
        https://www.rappler.com/section/business/consumer-issues/page/10/
        https://www.rappler.com/section/business/consumer-issues/page/11/
        https://www.rappler.com/section/business/consumer-issues/page/12/
        https://www.rappler.com/section/business/consumer-issues/page/13/
        https://www.rappler.com/section/business/consumer-issues/page/14/
        

        https://www.rappler.com/section/business/economy/page/21/
        https://www.rappler.com/section/business/economy/page/22/
        https://www.rappler.com/section/business/economy/page/23/
        https://www.rappler.com/section/business/economy/page/24/
        https://www.rappler.com/section/business/economy/page/25/
        https://www.rappler.com/section/business/economy/page/26/
        https://www.rappler.com/section/business/economy/page/27/
        https://www.rappler.com/section/business/economy/page/28/
        https://www.rappler.com/section/business/economy/page/29/
        https://www.rappler.com/section/business/economy/page/30/
        https://www.rappler.com/section/business/economy/page/31/
        https://www.rappler.com/section/business/economy/page/32/
        https://www.rappler.com/section/business/economy/page/33/
        https://www.rappler.com/section/business/economy/page/34/
        https://www.rappler.com/section/business/economy/page/35/
        ht

        https://www.rappler.com/section/business/industries/page/8/
        https://www.rappler.com/section/business/industries/page/9/
        https://www.rappler.com/section/business/industries/page/10/
        https://www.rappler.com/section/business/industries/page/11/
        https://www.rappler.com/section/business/industries/page/12/
        https://www.rappler.com/section/business/industries/page/13/
        https://www.rappler.com/section/business/industries/page/14/
        https://www.rappler.com/section/business/industries/page/15/
        https://www.rappler.com/section/business/industries/page/16/
        https://www.rappler.com/section/business/industries/page/17/
        https://www.rappler.com/section/business/industries/page/18/
        https://www.rappler.com/section/business/industries/page/19/
        https://www.rappler.com/section/business/industries/page/20/
        https://www.rappler.com/section/business/industries/page/21/
        https://www.rappler.com/sect

  0%|          | 0/8 [00:00<?, ?it/s]

    On subtopic page: arts & culture
        https://www.rappler.com/section/life-and-style/arts-culture/page/2/
        https://www.rappler.com/section/life-and-style/arts-culture/page/3/
        https://www.rappler.com/section/life-and-style/arts-culture/page/4/
        https://www.rappler.com/section/life-and-style/arts-culture/page/5/
        https://www.rappler.com/section/life-and-style/arts-culture/page/6/
        https://www.rappler.com/section/life-and-style/arts-culture/page/7/
        https://www.rappler.com/section/life-and-style/arts-culture/page/8/
        https://www.rappler.com/section/life-and-style/arts-culture/page/9/
        https://www.rappler.com/section/life-and-style/arts-culture/page/10/
        https://www.rappler.com/section/life-and-style/arts-culture/page/11/
        https://www.rappler.com/section/life-and-style/arts-culture/page/12/
        https://www.rappler.com/section/life-and-style/arts-culture/page/13/
        https://www.rappler.com/section/life-an

  0%|          | 0/8 [00:00<?, ?it/s]

    On subtopic page: apps
        https://www.rappler.com/section/technology/apps/page/2/
    On subtopic page: gadgets
        https://www.rappler.com/section/technology/gadgets/page/2/
        https://www.rappler.com/section/technology/gadgets/page/3/
        https://www.rappler.com/section/technology/gadgets/page/4/
        https://www.rappler.com/section/technology/gadgets/page/5/
    On subtopic page: gaming
        https://www.rappler.com/section/technology/gaming/page/2/
        https://www.rappler.com/section/technology/gaming/page/3/
        https://www.rappler.com/section/technology/gaming/page/4/
        https://www.rappler.com/section/technology/gaming/page/5/
        https://www.rappler.com/section/technology/gaming/page/6/
        https://www.rappler.com/section/technology/gaming/page/7/
        https://www.rappler.com/section/technology/gaming/page/8/
        https://www.rappler.com/section/technology/gaming/page/9/
    On subtopic page: innovations
        https://www.

  0%|          | 0/3 [00:00<?, ?it/s]

    On subtopic page: advocacies
        https://www.rappler.com/section/moveph/advocacies/page/2/
        https://www.rappler.com/section/moveph/advocacies/page/3/
        https://www.rappler.com/section/moveph/advocacies/page/4/
        https://www.rappler.com/section/moveph/advocacies/page/5/
        https://www.rappler.com/section/moveph/advocacies/page/6/
        https://www.rappler.com/section/moveph/advocacies/page/7/
    On subtopic page: agos
    On subtopic page: partners
On topic page: science


  0%|          | 0/4 [00:00<?, ?it/s]

    On subtopic page: discoveries & inventions
    On subtopic page: earth & space
        https://www.rappler.com/section/science/earth-space/page/2/
        https://www.rappler.com/section/science/earth-space/page/3/
        https://www.rappler.com/section/science/earth-space/page/4/
        https://www.rappler.com/section/science/earth-space/page/5/
        https://www.rappler.com/section/science/earth-space/page/6/
    On subtopic page: life & health
        https://www.rappler.com/section/science/life-health/page/2/
        https://www.rappler.com/section/science/life-health/page/3/
        https://www.rappler.com/section/science/life-health/page/4/
        https://www.rappler.com/section/science/life-health/page/5/
        https://www.rappler.com/section/science/life-health/page/6/
        https://www.rappler.com/section/science/life-health/page/7/
        https://www.rappler.com/section/science/life-health/page/8/
        https://www.rappler.com/section/science/life-health/page/

  0%|          | 0/4 [00:00<?, ?it/s]

    On subtopic page: climate change
        https://www.rappler.com/section/environment/climate-change/page/2/
        https://www.rappler.com/section/environment/climate-change/page/3/
        https://www.rappler.com/section/environment/climate-change/page/4/
        https://www.rappler.com/section/environment/climate-change/page/5/
        https://www.rappler.com/section/environment/climate-change/page/6/
        https://www.rappler.com/section/environment/climate-change/page/7/
        https://www.rappler.com/section/environment/climate-change/page/8/
        https://www.rappler.com/section/environment/climate-change/page/9/
        https://www.rappler.com/section/environment/climate-change/page/10/
        https://www.rappler.com/section/environment/climate-change/page/11/
        https://www.rappler.com/section/environment/climate-change/page/12/
    On subtopic page: disasters in the philippines
    On subtopic page: nature
        https://www.rappler.com/section/environment/nat

### Load Article Links Table

In [74]:
cols = [
    'Page Topic', 'Page Subtopic', 'Article Title', 'Article Topic',
    'Article Link', 'Article Date', 'Retrieval Date',
]
# Build the table from the scraped rows, keep only the first row seen for each
# article title, then remove rows whose article topic is 'rappler+ exclusives'
articles_df = (
    pd.DataFrame(articles, columns=cols)
    .drop_duplicates(subset=['Article Title'])
    .loc[lambda frame: frame['Article Topic'] != 'rappler+ exclusives']
)
print(articles_df.shape)
articles_df.head()

(19942, 7)


Unnamed: 0,Page Topic,Page Subtopic,Article Title,Article Topic,Article Link,Article Date,Retrieval Date
1,video,hold the line,#holdtheline: maria ressa talks to senator ris...,hold the line,https://www.rappler.com/video/hold-the-line-ma...,2022-06-07T02:00:00+00:00,2022-11-21T07:00:46+00:00
2,video,hold the line,full video: maria ressa talks to frances haugen,exclusive events,https://www.rappler.com/plus-membership-progra...,2022-03-25T05:52:25+00:00,2022-11-21T07:00:46+00:00
4,video,video editorial,[video editorial] huwag kalimutan si percy lapid,editorials,https://www.rappler.com/voices/editorials/vide...,2022-10-29T04:16:55+00:00,2022-11-21T07:00:46+00:00
5,video,video editorial,[video editorial] sandaang araw ni marcos: ‘de...,editorials,https://www.rappler.com/voices/editorials/vide...,2022-10-14T09:40:43+00:00,2022-11-21T07:00:46+00:00
6,video,video editorial,[video editorial] bakit ‘fail’ kapag influence...,editorials,https://www.rappler.com/voices/editorials/vide...,2022-10-07T11:02:40+00:00,2022-11-21T07:00:46+00:00


In [75]:
# Store the article-link table as 'rappler_2022'. With if_exists='fail' nothing is
# overwritten: the ValueError raised by to_sql is caught and reported instead.
try:
    articles_df.to_sql(name='rappler_2022', con=conn, index=False, if_exists='fail')
except ValueError:
    print('Already in database.')

Already in database.


## 2022 Rappler Scitech Article Data

In [130]:
# Rows scraped from the technology and science sections
is_scitech = articles_df['Page Topic'].isin(['technology', 'science'])
scitech_df = articles_df[is_scitech]
# Grab the article URLs before the link column is dropped from the frame
scitech_links = scitech_df['Article Link'].to_numpy()
scitech_df = scitech_df[['Page Topic', 'Article Topic', 'Article Title']]
print(scitech_df.shape)
scitech_df.head()

(465, 3)


Unnamed: 0,Page Topic,Article Topic,Article Title
22732,technology,social media,google approves trump’s truth social for play ...
22733,technology,technology,"password manager lastpass reports breach, says..."
22734,technology,technology,tiktok refutes researcher’s claims that in-app...
22735,technology,apps,messaging app telegram to launch paid subscrip...
22736,technology,technology,grabmaps seeks to provide more accurate mappin...


### Webscrape Scitech Article Contents

In [132]:
# Get science and technology article summary and contents
def _node_text(node):
    """Return the normalised text of a parsed HTML node.

    The text is stripped, casefolded and every whitespace character is replaced
    by a single space (runs are not collapsed). Returns None when `node` is None,
    i.e. when the selector matched nothing on the page.
    """
    if node is None:
        return None
    return re.sub(r'\s', r' ', node.text.strip().casefold())


scitech_summary = []
scitech_content = []

for link in tqdm(scitech_links):
    soup = bs4.BeautifulSoup(requests.get(link, proxies=proxies).text)

    # A missing summary/content block is stored as None rather than raising
    # AttributeError mid-scrape; exactly one entry is appended per link, so both
    # lists stay the same length as scitech_links.
    summary = _node_text(soup.select_one('div.post-single__summary'))
    scitech_summary.append(summary)

    content = _node_text(soup.select_one('div.post-single__content'))
    scitech_content.append(content)

    # Random 1-7 second pause between article requests
    time.sleep(np.random.choice(np.arange(1, 8)))

  0%|          | 0/465 [00:00<?, ?it/s]

### Load Scitech Article Contents

In [138]:
# Attach the scraped text as two new columns, 'Summary' first and 'Content' second
scitech_df = scitech_df.assign(Summary=scitech_summary, Content=scitech_content)
print(scitech_df.shape)
scitech_df.head()

(465, 5)


Unnamed: 0,Page Topic,Article Topic,Article Title,Summary,Content
22732,technology,social media,google approves trump’s truth social for play ...,"trump media & technology group, which operates...",alphabet inc’s google has approved former us p...
22733,technology,technology,"password manager lastpass reports breach, says...",lastpass ceo karim toubba says an investigatio...,"washington, dc, usa – popular digital password..."
22734,technology,technology,tiktok refutes researcher’s claims that in-app...,"security researcher felix krause says tiktok, ...",play video a new analysis revealed that ...
22735,technology,apps,messaging app telegram to launch paid subscrip...,'the only way to let our most demanding fans g...,messaging app telegram will launch a paid subs...
22736,technology,technology,grabmaps seeks to provide more accurate mappin...,grab is completing its move to use its own map...,"manila, philippines – grab on wednesday, june ..."


In [140]:
# Store the scitech table as 'rappler_scitech'. With if_exists='fail' nothing is
# overwritten: the ValueError raised by to_sql is caught and reported instead.
try:
    scitech_df.to_sql(name='rappler_scitech', con=conn, index=False, if_exists='fail')
except ValueError:
    print('Already in database.')

Already in database.
