In [1]:
import aiohttp
import asyncio
import pandas as pd
from bs4 import BeautifulSoup
import re 
from datetime import datetime
import csv
import hopsworks

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
async def fetch_page(session, url):
    """Download ``url`` through ``session`` and return the response body as text.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``text()`` method.
        url: Address to request.

    Returns:
        The value produced by awaiting ``response.text()``.
    """
    print(f"Fetching URL: {url}")
    async with session.get(url) as resp:
        body = await resp.text()
    return body

async def parse_main_page(url):
    """Scrape one listing page and every fact-check page it links to.

    Opens a dedicated ``aiohttp.ClientSession``, downloads ``url``, collects
    each ``<li class="o-listicle__item">`` element and hands them to
    ``parse_inner_page`` so they are processed concurrently.

    Args:
        url: Address of the listing page.

    Returns:
        The list produced by ``asyncio.gather``: one ``parse_inner_page``
        result per list item, in document order.
    """
    async with aiohttp.ClientSession() as session:
        print(f"Processing main page: {url}")
        listing_html = await fetch_page(session, url)
        listing_soup = BeautifulSoup(listing_html, 'html.parser')
        items = listing_soup.find_all('li', attrs={'class': 'o-listicle__item'})
        print(f"Found {len(items)} links to process")

        # Build every coroutine first so gather can run them concurrently.
        pending = []
        for item in items:
            pending.append(parse_inner_page(session, item))
        return await asyncio.gather(*pending)

def _parse_statement_date(text):
    """Return the first 'Month D, YYYY' date found in ``text`` as 'YYYY-MM-DD', or 'N/A'."""
    date_match = re.search(r"(\w+\s\d{1,2},\s\d{4})", text)
    if not date_match:
        return 'N/A'
    try:
        # The pattern accepts any word before the day number, so strptime can
        # still reject the match (e.g. an abbreviated or non-month word).
        return datetime.strptime(date_match.group(1), '%B %d, %Y').strftime('%Y-%m-%d')
    except ValueError:
        return 'N/A'


async def parse_inner_page(session, link_element):
    """Scrape a single fact-check page referenced by a listing item.

    Args:
        session: HTTP session passed through to ``fetch_page``.
        link_element: Parsed listing item expected to contain
            ``<div class="m-statement__quote"><a href="...">``.

    Returns:
        A ``(statement, inner_link, date, source, label)`` tuple of strings.
        Any field that cannot be extracted is ``'N/A'``. If the download or
        parsing fails outright, every field except ``inner_link`` is ``'N/A'``.
        If the listing item has no usable link, all five fields are ``'N/A'``.
        Errors are reported with ``print`` and are not re-raised, so one bad
        item cannot abort the ``asyncio.gather`` batch it belongs to.
    """
    base_link = "https://www.politifact.com"

    # Resolve the link defensively: find() returns None when nothing matches.
    quote_div = link_element.find("div", attrs={'class': 'm-statement__quote'})
    anchor = quote_div.find('a') if quote_div is not None else None
    href = anchor.get('href') if anchor is not None else None
    if not href:
        print("Skipping list item without a fact-check link")
        return 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'

    inner_link = base_link + href.strip()
    print(f"Processing inner link: {inner_link}")

    try:
        # The fetch is inside the try so a network error only costs this row.
        inner_page_content = await fetch_page(session, inner_link)
        inner_soup = BeautifulSoup(inner_page_content, 'html.parser')

        statement_tag = inner_soup.find("div", attrs={'class': 'm-statement__quote'})
        print("Statement tag found:", statement_tag is not None)
        statement = statement_tag.text.strip() if statement_tag is not None else 'N/A'

        date_element = inner_soup.find('div', attrs={'class': 'm-statement__desc'})
        print("Date element found:", date_element is not None)
        date = _parse_statement_date(date_element.text) if date_element is not None else 'N/A'
        print(date)

        meta_div = inner_soup.find('div', attrs={'class': 'm-statement__meta'})
        source_tag = meta_div.find('a') if meta_div is not None else None
        print("Source tag found:", source_tag is not None)
        source = source_tag.text.strip() if source_tag is not None else 'N/A'

        content_div = inner_soup.find('div', attrs={'class': 'm-statement__content'})
        label_tag = (
            content_div.find('img', attrs={'class': 'c-image__original'})
            if content_div is not None else None
        )
        print("Label tag found:", label_tag is not None)
        # The rating is read from the image's alt text; the attribute may be absent.
        alt_text = label_tag.get('alt') if label_tag is not None else None
        label = alt_text.strip() if alt_text is not None else 'N/A'
    except Exception as e:
        print(f"Error processing inner link {inner_link}: {e}")
        return 'N/A', inner_link, 'N/A', 'N/A', 'N/A'

    return (statement, inner_link, date, source, label)

async def main(pages_to_get=20):
    """Scrape the fact-check listing pages and collect the results in a DataFrame.

    Pages are processed one after another. A page that raises is reported with
    ``print`` and skipped, so the remaining pages are still collected.

    Args:
        pages_to_get: Number of listing pages to scrape, starting at page 1.
            Defaults to 20. Zero or a negative value scrapes nothing.

    Returns:
        ``pandas.DataFrame`` with columns ``statement``, ``inner_link``,
        ``date``, ``source`` and ``label``, one row per scraped item.
    """
    all_data = []
    for page_number in range(1, pages_to_get + 1):
        print('Processing page:', page_number)
        url = f'https://www.politifact.com/factchecks/list/?page={page_number}'

        try:
            print(f"Starting data collection for the {page_number} page")
            data = await parse_main_page(url)
            all_data.extend(data)

        except Exception as e:
            # Best effort: lose this page rather than the whole run.
            print('Error occurred:', e)

    return pd.DataFrame(all_data, columns = ['statement', 'inner_link', 'date', 'source', 'label'])

In [6]:
if asyncio.get_event_loop().is_running():
    # Run in the existing loop
    result = await main()  # This should be in an async context
else:
    # Create a new event loop
    result = asyncio.run(main())


Processing page: 1
Starting data collection for the 1 page
Processing main page: https://www.politifact.com/factchecks/list/?page=1
Fetching URL: https://www.politifact.com/factchecks/list/?page=1
Found 30 links to process
Processing inner link: https://www.politifact.com/factchecks/2024/jan/01/nick-lalota/nys-clean-slate-law-doesnt-keep-convictions-sealed/
Fetching URL: https://www.politifact.com/factchecks/2024/jan/01/nick-lalota/nys-clean-slate-law-doesnt-keep-convictions-sealed/
Processing inner link: https://www.politifact.com/factchecks/2023/dec/29/americans-prosperity/wisconsin-public-service-commission-has-approved-r/
Fetching URL: https://www.politifact.com/factchecks/2023/dec/29/americans-prosperity/wisconsin-public-service-commission-has-approved-r/
Processing inner link: https://www.politifact.com/factchecks/2023/dec/22/robin-vos/did-wisconsins-governor-reject-iowa-modeled-redist/
Fetching URL: https://www.politifact.com/factchecks/2023/dec/22/robin-vos/did-wisconsins-gover

In [7]:
# Preview the first rows of the scraped DataFrame to sanity-check the columns.
result.head()

Unnamed: 0,statement,inner_link,date,source,label
0,Under New York’s Clean Slate Act “violent crim...,https://www.politifact.com/factchecks/2024/jan...,2023-11-18,Nick LaLota,barely-true
1,“Wisconsin utilities have charged ratepayers m...,https://www.politifact.com/factchecks/2023/dec...,2023-12-04,Americans for Prosperity,false
2,"Gov. Tony Evers and Democrats ""rejected our (I...",https://www.politifact.com/factchecks/2023/dec...,2023-09-15,Robin Vos,mostly-true
3,Elon Musk invented an energy-saving device tha...,https://www.politifact.com/factchecks/2023/dec...,2023-12-17,Facebook posts,pants-fire
4,Hunter Biden’s “prison term announced.”,https://www.politifact.com/factchecks/2023/dec...,2023-12-19,Facebook posts,false


In [8]:
# Log in to Hopsworks and fetch (or create) version 1 of the "finalproj"
# feature group, keyed on the statement text.
project = hopsworks.login()
fs = project.get_feature_store()
fg = fs.get_or_create_feature_group(
    name="finalproj",
    version=1,
    primary_key=["statement"],
    description="final project dataset")

# `statement` is the primary key, so rows whose scrape failed (placeholder
# 'N/A') and repeated statements would collide on the same key. Remove them
# explicitly instead of uploading them.
result_to_upload = (
    result[result["statement"] != "N/A"]
    .drop_duplicates(subset=["statement"])
    .reset_index(drop=True)
)
fg.insert(result_to_upload)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/221335
Connected. Call `.close()` to terminate connection gracefully.


Uploading Dataframe: 100.00% |██████████| Rows 600/600 | Elapsed Time: 00:09 | Remaining Time: 00:00


Launching job: finalproj_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/221335/jobs/named/finalproj_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x2893ad5d0>, None)