### Imports

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import re

In [2]:
scrape_page_num = 20
pause_sec = 3

### Scrape Latest Fact-checks page (https://www.politifact.com/factchecks/?page=1)

In [3]:
# create df for later use when adding to chroma
politifact_df = pd.DataFrame(columns = ['ID', 'label', 'statement', 'subject', 'speaker', 'speaker_title', 'state', 'party_affliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context', 'extracted_justification'])
politifact_df

Unnamed: 0,ID,label,statement,subject,speaker,speaker_title,state,party_affliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,extracted_justification


In [4]:
# loop through pages of the politifact website
for i in range(1,scrape_page_num):
    # create temp df for each page
    temp_politifact_df = pd.DataFrame(columns = ['ID', 'label', 'statement', 'subject', 'speaker', 'speaker_title', 'state', 'party_affliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context', 'extracted_justification'])

    # request and load text
    politifac_request = requests.get("https://www.politifact.com/factchecks/?page={}".format(i))
    text = BeautifulSoup(politifac_request.text)

    # context
    context = text.find_all('a', attrs={'class': 'm-statement__name'})
    all_context = []
    for c in context:
        c = c.text
        c = c.lower()
        c = re.search("[a-z]+.{1}[a-z]+", c)
        c = c.group()
        all_context.append(c)
    
    # Statement
    statement = text.find_all('div', attrs={'class':'m-statement__quote'})
    all_statement = []
    for s in statement:
        s = s.text
        s = s.lower()
        s = re.sub(r"\s\s+","",s)
        s = re.sub(r"[^a-z^\d^+^ ^.]","", s)
        if s:
            all_statement.append(s)
        else:
            all_statement.append("none")
    
    # label
    label = text.find_all('div', attrs={'class':'m-statement__meter'})
    all_label = []
    for l in label:
        l = l.find('img').get('alt')
        all_label.append(l)
    
    
    # author
    author = text.find_all('footer', attrs={'class':'m-statement__footer'})
    all_author = []
    for a in author:
        a = a.text
        a = a.lower()
        a = re.sub("by", "", a)
        a = re.search("[a-z]+.{1}[a-z']+", a)
        a = a.group()
        all_author.append(a)

    # id
    all_id = list(range(len(all_author)))
    
    # create df
    temp_politifact_df['ID'] = all_id
    temp_politifact_df['speaker'] = all_author
    temp_politifact_df['label'] = all_label
    temp_politifact_df['statement'] = all_statement
    temp_politifact_df['context'] = all_context
    temp_politifact_df['extracted_justification'] = all_statement
    
    politifact_df = pd.concat([politifact_df, temp_politifact_df], sort=False)
    # # sleep
    time.sleep(pause_sec)


In [5]:
politifact_df.head(3)

Unnamed: 0,ID,label,statement,subject,speaker,speaker_title,state,party_affliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,extracted_justification
0,0,False,elon musk sues the view and whoopi goldberg fo...,,ciara o'rourke,,,,,,,,,facebook posts,elon musk sues the view and whoopi goldberg fo...
1,1,False,video shows floods in spain and france.,,marta campabadal,,,,,,,,,facebook posts,video shows floods in spain and france.
2,2,False,elon musk thinks that posting what you eat is ...,,mia penner,,,,,,,,,threads posts,elon musk thinks that posting what you eat is ...


### scrape Latest Promises Page (https://www.politifact.com/truth-o-meter/promises/list/?page=1&)

In [6]:
# create df for later use when adding to chroma
truth_o_meter_df = pd.DataFrame(columns = ['ID', 'label', 'statement', 'subject', 'speaker', 'speaker_title', 'state', 'party_affliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context', 'extracted_justification'])
truth_o_meter_df

Unnamed: 0,ID,label,statement,subject,speaker,speaker_title,state,party_affliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,extracted_justification


In [8]:
# loop through pages of the politifact website
for i in range(1,scrape_page_num):
    # create temp df for each page
    temp_truth_o_meter_df = pd.DataFrame(columns = ['ID', 'label', 'statement', 'subject', 'speaker', 'speaker_title', 'state', 'party_affliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context', 'extracted_justification'])

    truth_o_meter_request = requests.get("https://www.politifact.com/truth-o-meter/promises/list/?page={}&".format(i))
    truth_o_meter_text = BeautifulSoup(truth_o_meter_request.text)

    all_truthfulness = []
    all_statement = []
    all_author = []
    all_justification = []
    all_id = []
    
    each_page = truth_o_meter_text.find_all('div', attrs={'class': 'o-listing__item'})

    for each_block in each_page:
        # get truthfulness
        # false -> spectr -> half-true -> true
        truthfulness = each_block.find('article').attrs['class'][-1]
        truthfulness = truthfulness.replace("m-statement--","")
        all_truthfulness.append(truthfulness)
        
        # get statement
        statement = each_block.find('a', attrs={'class': 'm-statement__name'})
        statement = statement['title']
        all_statement.append(statement)
        
        # get author
        author = each_block.find('a', attrs={'class': 'm-statement__name'})
        author = author.text
        author = author.lower()
        author = re.search("[a-z]+.{1}[a-z]+", author)
        author = author.group()
        all_author.append(author)
        
        # get justification
        justification = each_block.find('div', attrs = {'class': 'm-statement__quote'})
        justification = justification.find('a').text
        justification = justification.lower()
        justification = re.search("[a-z]+.+", justification)
        justification = justification.group()
        all_justification.append(justification)
    
    # get id
    id_num = list(range(len(all_author)))
    all_id.append(id_num)

    # create df
    temp_truth_o_meter_df['ID'] = all_id[0]
    temp_truth_o_meter_df['speaker'] = all_author
    temp_truth_o_meter_df['label'] = all_truthfulness
    temp_truth_o_meter_df['statement'] = all_statement
    temp_truth_o_meter_df['extracted_justification'] = all_justification
    
    truth_o_meter_df = pd.concat([truth_o_meter_df, temp_truth_o_meter_df], sort=False)
    # sleep
    time.sleep(pause_sec)

In [9]:
truth_o_meter_df.head(3)

Unnamed: 0,ID,label,statement,subject,speaker,speaker_title,state,party_affliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,extracted_justification
0,0,true,Promise Kept: Planned Parenthood regained fede...,,joe biden,,,,,,,,,,restore federal funding for planned parenthood
1,1,true,"Biden increases funding, announces strategic p...",,joe biden,,,,,,,,,,expand services for people during and after in...
2,2,half-true,"Affordable Connectivity Program loses funding,...",,joe biden,,,,,,,,,,"expand broadband, or wireless broadband via 5g..."


### clean up both data frames to fill in missing values

In [374]:
# politifact_df.head(1)

In [375]:
# politifact_df.speaker.unique()

In [376]:
# temp = politifact_df.groupby(['speaker','label']).count().reset_index()
# test = temp[temp['speaker'] == 'aaron sharockman']
# test
# temp

### Add both dataframe to chormadb

In [10]:
import chromadb

In [13]:
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [14]:
# data preprocessing
train_data = politifact_df
train_data.columns = ['ID', 'label', 'statement', 'subject', 'speaker', 'speaker_title', 'state', 'party_affliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context', 'extracted_justification']
train_data = train_data[(train_data['speaker'].notna()) & (train_data['label'].notna())].reset_index(drop=True)
train_data.head()

Unnamed: 0,ID,label,statement,subject,speaker,speaker_title,state,party_affliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,extracted_justification
0,0,False,elon musk sues the view and whoopi goldberg fo...,,ciara o'rourke,,,,,,,,,facebook posts,elon musk sues the view and whoopi goldberg fo...
1,1,False,video shows floods in spain and france.,,marta campabadal,,,,,,,,,facebook posts,video shows floods in spain and france.
2,2,False,elon musk thinks that posting what you eat is ...,,mia penner,,,,,,,,,threads posts,elon musk thinks that posting what you eat is ...
3,3,False,mike tyson posed with the palestinian flag bef...,,kwasi gyamfi,,,,,,,,,facebook posts,mike tyson posed with the palestinian flag bef...
4,4,False,says sen. bob casey dpa. is trying to change t...,,caleb mccullough,,,,,,,,,elon musk,says sen. bob casey dpa. is trying to change t...


In [16]:
collection = chroma_client.get_or_create_collection(name="Misinformation")

In [28]:
documents = []
metadatas = []
ids = []
prev_df_size = collection.count()
prev_df_plus_politifact_df_size = train_data.shape[0] + prev_df_size
for i in range(prev_df_size, prev_df_plus_politifact_df_size):
    documents.append(train_data.loc[i - prev_df_size, 'statement'])
    metadatas.append({"label": train_data.loc[i - prev_df_size, 'label'], "speaker": train_data.loc[i - prev_df_size, "speaker"], "party_affliation": train_data.loc[i - prev_df_size, "party_affliation"], "justification": train_data.loc[i - prev_df_size, "extracted_justification"], })
    ids.append("id" + str(i))

In [29]:
collection.add(documents = documents, 
               metadatas=metadatas, 
               ids=ids)

In [None]:
# data preprocessing for truth_o_meter_df
train_data = truth_o_meter_df
train_data.columns = ['ID', 'label', 'statement', 'subject', 'speaker', 'speaker_title', 'state', 'party_affliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context', 'extracted_justification']
train_data = train_data[(train_data['speaker'].notna()) & (train_data['label'].notna())].reset_index(drop=True)

# if else statement to prevent adding same doc to docker if distances == 0
documents = []
metadatas = []
ids = []
prev_df_size = collection.count()
prev_df_plus_politifact_df_size = train_data.shape[0] + prev_df_size
for i in range(prev_df_size, prev_df_plus_politifact_df_size):
    documents.append(train_data.loc[i - prev_df_size, 'statement'])
    metadatas.append({"label": train_data.loc[i - prev_df_size, 'label'], "speaker": train_data.loc[i - prev_df_size, "speaker"], "party_affliation": train_data.loc[i - prev_df_size, "party_affliation"], "justification": train_data.loc[i - prev_df_size, "extracted_justification"], })
    ids.append("id" + str(i))

collection = chroma_client.get_or_create_collection(name="Misinformation")

collection.add(documents = documents, 
               metadatas=metadatas, 
               ids=ids)

In [35]:
# Test query to ensure it works!
results = collection.query(query_texts=["Promise Kept: Planned Parenthood regained federal funding under Biden administration"], 
                 n_results=3,
               #   where=
               #   {
               #      "label": "true"
               #   })
)
print(results['documents'])
print(results["distances"])

[['Promise Kept: Planned Parenthood regained federal funding under Biden administration', 'Promise Kept: Planned Parenthood regained federal funding under Biden administration', 'Trump rule change prods Planned Parenthood to forgo federal funds']]
[[0.0, 0.0, 0.6132704019546509]]


### Citation:

https://www.w3schools.com/python/python_regex.asp

https://github.com/srcole/politifact-analysis/blob/master/Politifact%20-%201a%20-%20scrape%20fact%20check%20data.ipynb

https://www.geeksforgeeks.org/extracting-an-attribute-value-with-beautifulsoup-in-python/