### Imports

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import re

In [None]:
# Stop value passed to range(1, scrape_page_num) by the scraping loops below,
# so pages 1 .. scrape_page_num - 1 are requested.
scrape_page_num = 20
# Seconds to sleep after each page request.
pause_sec = 3

### Scrape Latest Fact-checks page (https://www.politifact.com/factchecks/?page=1)

In [None]:
# Empty frame for the scraped fact-checks; it is filled page by page and
# later loaded into Chroma.
politifact_df = pd.DataFrame(
    columns=[
        "ID",
        "label",
        "statement",
        "subject",
        "speaker",
        "speaker_title",
        "state",
        "party_affliation",
        "barely_true_counts",
        "false_counts",
        "half_true_counts",
        "mostly_true_counts",
        "pants_on_fire_counts",
        "context",
        "extracted_justification",
    ]
)
politifact_df

In [None]:
# Scrape the PolitiFact "Latest Fact-checks" listing and append one row per
# fact-check card to politifact_df.
#
# Pages 1 .. scrape_page_num - 1 are requested (range() excludes its stop
# value), with a pause of pause_sec seconds after every request.

FACTCHECK_URL = "https://www.politifact.com/factchecks/?page={}"
REQUEST_TIMEOUT_SEC = 30

# First two lowercase "words" separated by any single character.
NAME_PATTERN = re.compile("[a-z]+.{1}[a-z]+")
# Same shape, but the second word may contain apostrophes.
AUTHOR_PATTERN = re.compile("[a-z]+.{1}[a-z']+")
# Only the leading "by" of a byline. Removing every "by" would also eat the
# letters inside names such as "bobby" or "abby".
LEADING_BY_PATTERN = re.compile(r"^\s*by\b")


def _first_match(pattern, text):
    """Return the first match of *pattern* in *text*, or None if there is none."""
    found = pattern.search(text)
    return found.group() if found else None


def _scrape_factcheck_page(page_number, columns):
    """Return a DataFrame with one row per card on a listing page, or None.

    None is returned (after printing the reason) when the request fails, when
    the page has no cards, or when the page yields different numbers of names,
    quotes, labels and bylines -- those lists are combined by position, so a
    mismatch would pair fields from different cards.
    """
    url = FACTCHECK_URL.format(page_number)
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SEC)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Skipping {}: {}".format(url, err))
        return None

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")

    # context: text of the m-statement__name links.
    all_context = [
        _first_match(NAME_PATTERN, tag.text.lower())
        for tag in soup.find_all('a', attrs={'class': 'm-statement__name'})
    ]

    # statement: quote text reduced to lowercase letters, digits, '+', ' ', '.'.
    # The extra '^' characters inside the class are literal, so carets are
    # kept as well; the pattern is left unchanged.
    all_statement = []
    for tag in soup.find_all('div', attrs={'class': 'm-statement__quote'}):
        cleaned = re.sub(r"\s\s+", "", tag.text.lower())
        cleaned = re.sub(r"[^a-z^\d^+^ ^.]", "", cleaned)
        all_statement.append(cleaned or "none")

    # label: alt text of the meter image; None when the image is missing.
    all_label = []
    for tag in soup.find_all('div', attrs={'class': 'm-statement__meter'}):
        meter_img = tag.find('img')
        all_label.append(meter_img.get('alt') if meter_img is not None else None)

    # author: footer byline without its leading "by".
    all_author = []
    for tag in soup.find_all('footer', attrs={'class': 'm-statement__footer'}):
        byline = LEADING_BY_PATTERN.sub("", tag.text.lower(), count=1)
        all_author.append(_first_match(AUTHOR_PATTERN, byline))

    field_lengths = {
        'context': len(all_context),
        'statement': len(all_statement),
        'label': len(all_label),
        'speaker': len(all_author),
    }
    if len(set(field_lengths.values())) != 1:
        print("Skipping {}: field counts differ {}".format(url, field_lengths))
        return None
    if not all_author:
        return None

    # NOTE(review): the footer byline is stored under 'speaker' and the
    # m-statement__name text under 'context'; 'extracted_justification'
    # repeats the statement. Kept as in the original -- confirm the mapping.
    return pd.DataFrame(
        {
            'ID': list(range(len(all_author))),
            'speaker': all_author,
            'label': all_label,
            'statement': all_statement,
            'context': all_context,
            'extracted_justification': all_statement,
        },
        columns=columns,
    )


page_frames = []
for page_number in range(1, scrape_page_num):
    page_frame = _scrape_factcheck_page(page_number, list(politifact_df.columns))
    if page_frame is not None:
        page_frames.append(page_frame)
    # Be polite to the server between requests.
    time.sleep(pause_sec)

# Concatenate once instead of re-copying the growing frame on every page.
if page_frames:
    politifact_df = pd.concat([politifact_df, *page_frames], sort=False)


In [None]:
# Preview the first three scraped fact-check rows.
politifact_df.head(3)

### Scrape Latest Promises page (https://www.politifact.com/truth-o-meter/promises/list/?page=1&)

In [None]:
# Empty frame for the scraped promises; it is filled page by page and later
# loaded into Chroma.
truth_o_meter_df = pd.DataFrame(
    columns=[
        "ID",
        "label",
        "statement",
        "subject",
        "speaker",
        "speaker_title",
        "state",
        "party_affliation",
        "barely_true_counts",
        "false_counts",
        "half_true_counts",
        "mostly_true_counts",
        "pants_on_fire_counts",
        "context",
        "extracted_justification",
    ]
)
truth_o_meter_df

In [None]:
# Scrape the PolitiFact "Latest Promises" listing and append one row per
# listing item to truth_o_meter_df.
#
# Pages 1 .. scrape_page_num - 1 are requested (range() excludes its stop
# value), with a pause of pause_sec seconds after every request.

PROMISES_URL = "https://www.politifact.com/truth-o-meter/promises/list/?page={}&"
REQUEST_TIMEOUT_SEC = 30

# First two lowercase "words" separated by any single character.
NAME_PATTERN = re.compile("[a-z]+.{1}[a-z]+")
# From the first lowercase letter to the end of that line.
JUSTIFICATION_PATTERN = re.compile("[a-z]+.+")


def _first_match(pattern, text):
    """Return the first match of *pattern* in *text*, or None if there is none."""
    found = pattern.search(text)
    return found.group() if found else None


def _parse_promise_block(block):
    """Return the row fields for one listing item, or None if it is unusable.

    An item is unusable when it lacks the <article> element (source of the
    label), the m-statement__name link, or that link's title attribute
    (source of the statement).
    """
    article = block.find('article')
    name_link = block.find('a', attrs={'class': 'm-statement__name'})
    if article is None or name_link is None:
        return None

    # The label is the last CSS class of the article with its prefix removed.
    # Original author's note on observed values:
    # false -> spectr -> half-true -> true
    css_classes = article.get('class')
    statement = name_link.get('title')
    if not css_classes or statement is None:
        return None
    truthfulness = css_classes[-1].replace("m-statement--", "")

    # A speaker that does not match stays None instead of raising.
    speaker = _first_match(NAME_PATTERN, name_link.text.lower())

    # Justification: text of the link inside the quote block.
    justification = None
    quote = block.find('div', attrs={'class': 'm-statement__quote'})
    quote_link = quote.find('a') if quote is not None else None
    if quote_link is not None:
        justification = _first_match(JUSTIFICATION_PATTERN, quote_link.text.lower())

    return {
        'speaker': speaker,
        'label': truthfulness,
        'statement': statement,
        # "none" placeholder keeps the value a plain string for later use.
        'extracted_justification': justification if justification else "none",
    }


def _scrape_promises_page(page_number, columns):
    """Return a DataFrame with one row per parsed item on a page, or None.

    None is returned (after printing the reason) when the request fails, and
    silently when the page has no usable items.
    """
    url = PROMISES_URL.format(page_number)
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SEC)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Skipping {}: {}".format(url, err))
        return None

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")

    rows = []
    for block in soup.find_all('div', attrs={'class': 'o-listing__item'}):
        row = _parse_promise_block(block)
        if row is not None:
            # ID is the position of the row within its page.
            row['ID'] = len(rows)
            rows.append(row)

    if not rows:
        return None
    return pd.DataFrame(rows, columns=columns)


promise_frames = []
for page_number in range(1, scrape_page_num):
    promise_frame = _scrape_promises_page(page_number, list(truth_o_meter_df.columns))
    if promise_frame is not None:
        promise_frames.append(promise_frame)
    # Be polite to the server between requests.
    time.sleep(pause_sec)

# Concatenate once instead of re-copying the growing frame on every page.
if promise_frames:
    truth_o_meter_df = pd.concat([truth_o_meter_df, *promise_frames], sort=False)

In [None]:
# Preview the first three scraped promise rows.
truth_o_meter_df.head(3)

### Merge the two DataFrames

In [None]:
# Stack the fact-check rows on top of the promise rows with a fresh 0..n-1
# index, then overwrite the per-page IDs with that index.
scraped_frames = [politifact_df, truth_o_meter_df]
final_df = pd.concat(scraped_frames, ignore_index=True)
final_df = final_df.assign(ID=final_df.index)
final_df.tail()

### Add both DataFrames to ChromaDB

In [None]:
import chromadb

In [None]:
# Connect to a Chroma server over HTTP; expects one listening on localhost:8000.
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [None]:
# data preprocessing
train_data = final_df
train_data.columns = ['ID', 'label', 'statement', 'subject', 'speaker', 'speaker_title', 'state', 'party_affliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context', 'extracted_justification']
train_data = train_data[(train_data['speaker'].notna()) & (train_data['label'].notna())].reset_index(drop=True)

In [None]:
# Fetch the "Misinformation" collection from the Chroma server, creating it
# if it does not exist yet.
collection = chroma_client.get_or_create_collection(name="Misinformation")

In [None]:
# Drop every row whose statement is already stored in the collection, so
# re-running the notebook does not add the same document twice.
#
# A statement counts as already stored when its nearest neighbour in the
# collection is at (practically) zero distance. A small tolerance is used
# instead of `== 0` because distances are floating-point values.
DUPLICATE_DISTANCE_TOL = 1e-6

all_statements = train_data['statement']
duplicate_rows = []

# An empty collection has no neighbours to compare against; querying it would
# leave nothing to index into, so the check is skipped entirely.
if collection.count() > 0:
    for row_label, statement_text in all_statements.items():
        nearest = collection.query(query_texts=[statement_text], n_results=1)
        distances = nearest["distances"]
        if distances and distances[0] and abs(distances[0][0]) <= DUPLICATE_DISTANCE_TOL:
            duplicate_rows.append(row_label)

# Re-number the surviving rows 0..n-1: the following cell looks rows up by
# consecutive integer labels, which gaps left by drop() would break.
train_data = train_data.drop(index=duplicate_rows).reset_index(drop=True)

In [None]:
# Put the non-duplicate rows into the documents / metadatas / ids lists that
# collection.add() expects.
#
# Rows are walked by position rather than by index label, so this works even
# when train_data's index has gaps (e.g. after rows were dropped).
documents = []
metadatas = []
ids = []

# New ids continue after the number of items already in the collection.
# NOTE(review): ids derived from count() can collide with existing ids if
# items were ever deleted from the collection -- confirm that never happens.
prev_df_size = collection.count()
prev_df_plus_politifact_df_size = train_data.shape[0] + prev_df_size

row_fields = zip(
    train_data['statement'],
    train_data['label'],
    train_data['speaker'],
    train_data['party_affliation'],
    train_data['extracted_justification'],
)
for offset, (statement, label, speaker, party, justification) in enumerate(row_fields):
    documents.append(statement)
    # NOTE(review): party_affliation is never filled by the scraping cells, so
    # it is presumably NaN here -- confirm Chroma stores that as intended.
    metadatas.append({
        "label": label,
        "speaker": speaker,
        "party_affliation": party,
        "justification": justification,
    })
    ids.append("id" + str(prev_df_size + offset))

In [None]:
# Add the new documents to the collection.
# When every scraped statement was already stored (e.g. on a re-run) the lists
# are empty; the add() call is skipped in that case instead of being made
# with empty input.
if documents:
    collection.add(
        documents=documents,
        metadatas=metadatas,
        ids=ids,
    )
else:
    print("No new statements to add to the collection.")

In [None]:
# Sanity check: query the collection with a sample text and print the three
# closest stored statements together with their distances.
# (The original cell carried a commented-out `where={"label": "true"}`
# filter as an optional way to restrict matches by label.)
sample_query = "Promise Kept: Planned Parenthood regained fede"
results = collection.query(query_texts=[sample_query], n_results=3)
for field in ("documents", "distances"):
    print(results[field])

### Citation:

https://www.w3schools.com/python/python_regex.asp

https://github.com/srcole/politifact-analysis/blob/master/Politifact%20-%201a%20-%20scrape%20fact%20check%20data.ipynb

https://www.geeksforgeeks.org/extracting-an-attribute-value-with-beautifulsoup-in-python/