# Fake News Web Scraping

In [None]:
! nvidia-smi

In [None]:
! pip install requests
! pip install easyocr
! pip install opencv-python-headless==4.1.2.30

In [2]:
# Imports
from typing import Any, Dict, List

import cv2
import easyocr
import numpy as np
import pandas as pd
import requests

In [None]:
def get_verifications(page: int) -> List[Dict[str, Any]]:
    """Gets news verifications from fakehunter.pap.pl.

    Args:
        page (int): Page number of fakehunter.pap.pl to process.

    Returns:
        verifications (List[Dict[str, Any]]): The ``results`` entry of the
            API's JSON response. Presumably one dict per verified article;
            callers in this file read the 'screenshot_url', 'expert_opinion',
            'title' and 'url' keys of each item.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        KeyError: If the JSON response has no 'results' entry.
    """
    url = 'https://panel-api.fakehunter.pap.pl/news/published/news?category=koronawirus&domains%5B%5D=koronawirus&page=' + str(page)
    print(f'Processing page {page}: {url}')
    # Without a timeout a stalled connection would block the scrape forever.
    res = requests.get(url, timeout=30)
    # Fail here with a clear HTTP error instead of later while parsing an
    # error page as JSON.
    res.raise_for_status()
    # Force UTF-8 so Polish diacritics are decoded correctly.
    res.encoding = 'utf-8'
    data = res.json()
    return data['results']


# Shared EasyOCR reader configured with the language list ['pl'] (Polish).
# It is created once here and reused by every `reader.readtext` call below.
reader = easyocr.Reader(['pl'])  # Load the OCR Reader object


def extract_text_from_image(url: str) -> str:
    """Performs OCR on the screenshot of an article.

    Args:
        url (str): URL of the screenshot.

    Returns:
        text (str): Extracted text; the recognised paragraphs joined with
            single spaces.

    Raises:
        requests.RequestException: If the download fails, times out or the
            server answers with an HTTP error status.
        ValueError: If the downloaded data is empty or cannot be decoded as
            an image.
    """
    # Without a timeout a stalled download would block the scrape forever.
    res = requests.get(url, timeout=30)
    res.raise_for_status()

    # OpenCV decodes images from a flat uint8 buffer.
    arr = np.asarray(bytearray(res.content), dtype=np.uint8)
    if arr.size == 0:
        raise ValueError(f'Empty response body for image: {url}')

    img = cv2.imdecode(arr, cv2.IMREAD_UNCHANGED)
    # imdecode signals undecodable data by returning None rather than raising.
    if img is None:
        raise ValueError(f'Could not decode image: {url}')

    # detail=0 makes readtext return plain strings; paragraph=True merges
    # neighbouring text boxes into paragraphs.
    result = reader.readtext(img, decoder='beamsearch', detail=0,
                             paragraph=True, y_ths=4, min_size=200, width_ths=1,
                             allowlist='#0123456789ABCDEFGHIJKLŁMNOPRSŚTUVWXYZŻŹaąbcćdeęfghijklłmnńoóprstuwyzżź .,-?:-!"()')
    return ' '.join(result)

In [None]:
# Scrape every listing page, OCR each article's screenshot and save the
# collected data to an Excel file.
articles_counter = 0
pages_to_scrap = 144

# Rows are buffered in a list and converted to a DataFrame once at the end.
# This avoids the slow row-by-row growth of a DataFrame and does not depend on
# every page holding exactly 20 articles (the old computed row label would
# overwrite rows otherwise).
rows = []

for page in range(1, pages_to_scrap + 1):
    verifications = get_verifications(page)

    for ver in verifications:
        articles_counter += 1
        text = extract_text_from_image(ver['screenshot_url'])
        rows.append([ver['expert_opinion']['verdict'],
                     ver['title'], text, ver['url']])

df = pd.DataFrame(rows, columns=['Verdict', 'Title', 'Text', 'Url'])

# `encoding` is intentionally not passed: the argument was deprecated in
# pandas 1.5 and removed in 2.0, where passing it raises TypeError.
df.to_excel('fakehunter_dataset.xlsx', index=False)
display(df.head())
print(f'Scraped articles in total: {len(df)}')

# OCR example

In [None]:
import cv2
from google.colab.patches import cv2_imshow

# Download one sample screenshot, run the OCR on it and draw a box around
# every detected paragraph.
res = requests.get('https://sfnf-collector-prod.s3.amazonaws.com/c6dee03b-1547-4ef4-af42-3fc27aa69c0b.jpg', timeout=30)
res.raise_for_status()
arr = np.asarray(bytearray(res.content), dtype=np.uint8)
img = cv2.imdecode(arr, -1)  # -1 == cv2.IMREAD_UNCHANGED
# imdecode signals undecodable data by returning None rather than raising.
if img is None:
    raise ValueError('Could not decode the example image')
results = reader.readtext(img, decoder='beamsearch', y_ths=4, paragraph=True, min_size=200, width_ths=1)

# With paragraph=True each result is a (bbox, text) pair; no confidence score
# is returned, so only the text is printed.
for bbox, text in results:
    print(text)
    # bbox lists the corners as top-left, top-right, bottom-right, bottom-left;
    # the two opposite corners are enough to draw the rectangle.
    tl, _, br, _ = bbox
    tl = (int(tl[0]), int(tl[1]))
    br = (int(br[0]), int(br[1]))
    cv2.rectangle(img, tl, br, (0, 0, 255), 2)  # red in OpenCV's BGR order
cv2_imshow(img)