In [1]:
import pandas as pd

from bs4 import BeautifulSoup

from tqdm import tqdm
tqdm.pandas()

from typing import List, Tuple, Union, Optional

In [21]:
# Token placed between individual opinions when they are joined into one string
OP_SEPARATOR = '||'

# Directory holding the saved article HTML files, and the dataset CSV itself
ARTICLE_SOURCE = '../../data/webpage_data/articles/'
SRC_DATA = '../../data/webpage_data/full_flipside_data.csv'

In [12]:
# Load the dataset that the remaining cells extend with expert opinions
data = pd.read_csv(filepath_or_buffer = SRC_DATA)

# Rescraping Expert Opinions

In [13]:
# Build each article's file path from its title: every '?' is removed from the
# title, then the articles directory is prepended and '.html' appended
title_stems = data['title'].str.replace('?', '', regex = False)
data['article_file'] = ARTICLE_SOURCE + title_stems + '.html'

In [14]:
def _strip_attribution(opinion: str) -> str:
    '''
        Reduce an expert opinion paragraph to the quoted text only
        
        Parameters:
            -opinion
                Text of one opinion paragraph
                
        Return:
            The text up to and including the last closing curly quote, with all
            curly quote characters removed. Text without a closing curly quote
            is kept whole.
    '''
    
    last_quote = opinion.rfind('”')
    # rfind returns -1 when there is no closing quote; keep the full text then.
    # A closing quote in final position also keeps the full text, because the
    # slice end is computed from the front of the string, not as a negative offset.
    quoted = opinion if last_quote == -1 else opinion[:last_quote + 1]
    
    return quoted.replace('”', '').replace('“', '')


def _find_side_opinions(bs, side: str) -> List[str]:
    '''
        Collect the cleaned opinions for one side ('left' or 'right') of a parsed article
        
        Parameters:
            -bs
                Parsed article (BeautifulSoup object)
            -side
                Substring that must appear in the div's class value
                
        Return:
            List of cleaned opinion texts, in document order
    '''
    
    def is_side_bullet(css_class):
        # Divs without a class value never match
        if not css_class:
            return False
        return ('paragraph-6' in css_class and side in css_class and 'bullet' in css_class
                and not ('w-dyn-bind-empty' in css_class))
    
    blocks = bs.find_all('div', {'class': is_side_bullet})
    
    # Only divs that contain a <p> carry opinion text
    return [_strip_attribution(block.p.text) for block in blocks if block.p]


def get_expert_opinions(article_src: str) -> Tuple[List[str], List[str]]:
    '''
        Scrape expert opinions from an article source html file
        
        Parameters:
            -article_src
                Path to the saved html file of the article. If the path contains
                a ':', everything from the first ':' onward is dropped before the
                file is opened.
                
        Return:
            Tuple (left_opinions, right_opinions) of lists with the expert
            opinions found in the article, stripped of quote marks and of any
            text following the last closing quote
    '''
    
    # NOTE(review): truncating at ':' also drops the '.html' extension; presumably
    # the saved files for such titles are named that way - confirm against the scraper.
    article_src = article_src[:article_src.find(':')] if ':' in article_src else article_src
    with open(article_src, 'r', encoding = 'utf8') as f:
        src_html = f.read()
        
    # No parser is named here, so bs4 chooses one from those installed; pin one
    # explicitly if results must be identical across environments.
    bs = BeautifulSoup(src_html)
    
    right_experts = _find_side_opinions(bs, 'right')
    left_experts = _find_side_opinions(bs, 'left')
    
    return left_experts, right_experts

In [15]:
# Scrape every article file; each result is a (left opinions, right opinions) pair
article_paths = data['article_file']
expert_ops = article_paths.progress_apply(get_expert_opinions)

100%|████████████████████████████████████████████████████████████████████████████████| 977/977 [00:17<00:00, 54.29it/s]


In [17]:
# Collapse each article's opinion lists into single OP_SEPARATOR-delimited strings
left_ops, right_ops = [], []
for left_quotes, right_quotes in expert_ops:
    left_ops.append(OP_SEPARATOR.join(left_quotes))
    right_ops.append(OP_SEPARATOR.join(right_quotes))

In [18]:
# Store the joined opinion strings as columns of the dataset
for column, joined_ops in (('left_op', left_ops), ('right_op', right_ops)):
    data[column] = joined_ops

In [19]:
# The file path column was only needed for scraping; remove it before saving
data = data.drop(columns = ['article_file'])

In [22]:
# Write the result back to SRC_DATA without the row index.
# NOTE: this overwrites the same CSV path the data is read from.
data.to_csv(SRC_DATA, index = False)