# Fake News Web Scraping

In [None]:
# Print the NVIDIA driver and GPU status of the current runtime.
! nvidia-smi

Tue Dec 21 02:04:48 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    58W / 149W |    758MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
! pip install requests
! apt-get update
! pip install easyocr

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu bionic-security InRelease
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:12 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:13 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:14 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:15 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic 

In [None]:
# Imports
import requests
import numpy as np
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import easyocr
import cv2

# Build the EasyOCR reader for Polish ('pl') once at module level so that
# every call to extract_text_from_image reuses the same loaded OCR model.
reader = easyocr.Reader(['pl'])

In [None]:
def get_verifications(page, timeout=30):
    '''Gets news verifications from fakehunter.pap.pl.

    Args:
        page (int): Page number of fakehunter.pap.pl to process.
        timeout (float): Seconds to wait for the API before giving up.

    Returns:
        verifications: Value of the 'results' field of the JSON response
            (the scraping loop iterates it as a sequence of per-article
            dicts).

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within `timeout`.
        KeyError: If the JSON response has no 'results' field.
    '''

    url = 'https://panel-api.fakehunter.pap.pl/news/published/news?category=koronawirus&domains%5B%5D=koronawirus&page=' + str(page)
    print(f'Processing page {page}: {url}')

    # Without a timeout a single stalled connection would hang the whole scrape.
    req = requests.get(url, timeout=timeout)
    # Surface HTTP errors here instead of as a confusing JSON/KeyError below.
    req.raise_for_status()
    req.encoding = 'utf-8'
    data = req.json()
    verifications = data['results']

    return verifications

def extract_text_from_image(url):
    '''Performs OCR on the screenshot of an article.

    Downloads the image, decodes it with OpenCV and runs the module-level
    EasyOCR `reader` on it.

    Args:
        url (str): URL of the screenshot.

    Returns:
        text (str): Extracted text (recognized fragments joined by spaces).

    Raises:
        ValueError: If the downloaded bytes cannot be decoded as an image.
        urllib.error.URLError: If the screenshot cannot be downloaded.
    '''

    # Use a context manager so the HTTP response is always closed.
    with urllib.request.urlopen(url) as response:
        raw_bytes = response.read()

    arr = np.asarray(bytearray(raw_bytes), dtype=np.uint8)
    img = cv2.imdecode(arr, -1)
    # imdecode signals undecodable data by returning None; fail with a clear
    # message instead of passing None on to the OCR reader.
    if img is None:
        raise ValueError(f'Could not decode an image from {url}')

    result = reader.readtext(img, decoder='beamsearch', detail=0, 
                            paragraph=True, y_ths=4, min_size=200, width_ths=1, 
                            allowlist='#0123456789ABCDEFGHIJKLŁMNOPRSŚTUVWXYZŻŹaąbcćdeęfghijklłmnńoóprstuwyzżź .,-?:-!"()')
    # With detail=0 the reader returns plain strings, which are joined here.
    text = ' '.join(result)
    
    return text

In [None]:
articles_counter = 0
pages_to_scrap = 59

# Collect plain records and build the DataFrame once at the end. This avoids
# growing the frame row by row and does not depend on a fixed page size
# (the previous row label 20 * (page-1) + idx would overwrite rows if a page
# ever returned more than 20 entries).
records = []

for page in range(1, pages_to_scrap + 1):
    verifications = get_verifications(page)

    for ver in verifications:
        articles_counter += 1
        # Entries whose URL mentions twitter are not OCR'd; they get empty text.
        if 'twitter' not in ver['url']: 
            text = extract_text_from_image(ver['screenshot_url'])
        else:
            text = ''

        records.append({
            'Verdict': ver['expert_opinion']['verdict'],
            'Title': ver['title'],
            'Text': text,
            'Url': ver['url'],
        })

# Passing `columns` keeps the column order and yields the right (empty) frame
# even when nothing was scraped.
df = pd.DataFrame(records, columns=['Verdict', 'Title', 'Text', 'Url'])

# DataFrame.to_excel no longer accepts an `encoding` argument in current
# pandas, so it is not passed.
df.to_excel('fakehunter_dataset.xlsx', index=False)
display(df.head())
print(f'Scraped articles in total: {len(df)}')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Load the dataset written by the scraping cell. read_excel has no `encoding`
# or `index` parameters, so none are passed.
df = pd.read_excel('fakehunter_dataset.xlsx')

# Empty cells are read back as NaN; replace them with '' first so they count
# as length 0 instead of len('nan') == 3.
text_lengths = df['Text'].fillna('').astype(str).str.len()

bin_width = 400
max_length = int(text_lengths.max()) if len(text_lengths) else 0
# Extend the edges past the maximum so the longest texts fall inside the last
# bin; range(0, max_length, bin_width) would stop short and drop them.
bin_edges = list(range(0, max_length + bin_width + 1, bin_width))

fig, ax = plt.subplots()
ax.hist(text_lengths, bins=bin_edges)
ax.set_xticks(bin_edges)
ax.set_title('Distribution of OCR text length')
ax.set_xlabel('Text length (characters)')
ax.set_ylabel('Number of articles')
plt.show()
