In [88]:
import pandas as pd
import numpy as np
import pickle

from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow, Flow
from google.auth.transport.requests import Request
import json

import re
import string
import datetime

from joblib import dump, load, parallel_backend
import os

RANDOM = 42

In [56]:
with open('data/googlefactcheck.key', 'r') as key_file:
    key = key_file.read()

In [57]:
key

'AIzaSyA5cBLSCeRjt0aq_phXm2UGyJ3HKO-iVIY'

In [35]:
creds = None
SCOPE = ["https://www.googleapis.com/auth/userinfo.profile", "https://www.googleapis.com/auth/userinfo.email", "openid"]

# The file token.pickle stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
    
if os.path.exists('data/token.pickle'):
    with open('data/token.pickle', 'rb') as token:
        creds = pickle.load(token)
        
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file('data/googlefactcheck_program.json', SCOPE)
        creds = flow.run_local_server(port=0)
    # Save the credentials for the next run
    with open('data/token.pickle', 'wb') as token:
        pickle.dump(creds, token)

In [60]:
service = build('factchecktools', 'v1alpha1', credentials=creds)

In [70]:
first_request = {'languageCode': 'en',
                 'reviewPublisherSiteFilter': 'politifact.com', 
                 'pageSize': 100,
                 'key': key }

In [72]:
import requests

r = requests.get(url = "https://factchecktools.googleapis.com/v1alpha1/claims:search", params = first_request)

In [107]:
next_request = {'languageCode': 'en',
                 'reviewPublisherSiteFilter': 'politifact.com', 
                 'pageSize': 100,
                 'pageToken': dict(r.json())['nextPageToken'],
                 'key': key }

r = requests.get(url = "https://factchecktools.googleapis.com/v1alpha1/claims:search", params = next_request)

In [108]:
dict(r.json())

{'claims': [{'text': "Speaking about border apprehensions, said “We have some of the best numbers we've ever had. Southern border, the best numbers we've ever had.”",
   'claimant': 'Donald Trump',
   'claimDate': '2020-05-14T00:00:00Z',
   'claimReview': [{'publisher': {'name': 'PolitiFact',
      'site': 'politifact.com'},
     'url': 'https://www.politifact.com/factchecks/2020/may/20/donald-trump/fact-checking-donald-trumps-claim-best-numbers-bor/',
     'title': "Fact-checking Donald Trump's claim of 'best numbers' at the border",
     'textualRating': 'Half True',
     'languageCode': 'en'}]},
  {'text': '“If you look at the one (hydroxychloroquine) survey, the only bad survey, they were giving it to people that were in very bad shape. They were very old. Almost dead.”',
   'claimant': 'Donald Trump',
   'claimDate': '2020-05-19T00:00:00Z',
   'claimReview': [{'publisher': {'name': 'PolitiFact',
      'site': 'politifact.com'},
     'url': 'https://www.politifact.com/factchecks/20

In [98]:
datetime.datetime.strptime(dict(r.json())['claims'][0]['claimDate'], '%Y-%m-%dT%H:%M:%SZ')

datetime.datetime(2020, 5, 21, 0, 0)

In [178]:
politifact_dictlist = []

for claim in dict(r.json())['claims']:
    politifact_dictentry = {
        'statement': claim['text'],
        'label':     claim['claimReview'][0]['textualRating'],
        'date':      datetime.datetime.strptime(claim.get('claimDate'), '%Y-%m-%dT%H:%M:%SZ'),
        'speaker':   claim.get('claimant')
    }
    politifact_dictlist.append(politifact_dictentry)

pd.DataFrame(politifact_dictlist)

Unnamed: 0,statement,label,date,speaker
0,"Speaking about border apprehensions, said “We ...",Half True,2020-05-14,Donald Trump
1,“If you look at the one (hydroxychloroquine) s...,False,2020-05-19,Donald Trump
2,"If you paid rent in March or April, “they got ...",False,2020-04-22,Facebook posts
3,Says GOP Wisconsin Sen. Patrick Testin is resp...,Mostly False,2020-05-08,Wisconsin State Senate
4,“Due to the large number of people who will re...,False,2020-05-13,Facebook posts
...,...,...,...,...
95,"“The flu killed 80,000 people in the U.S. last...",Mostly False,2020-04-18,Facebook
96,"On potential deaths: ""Those models that you're...",Mitigation included,2020-05-05,Donald Trump
97,"""Although intuitively I think it probably seem...",False,2020-05-04,Laura Ingraham
98,"“At the end of the year, we put into that fede...",Half True,2020-04-23,Andrew Cuomo


In [82]:
dict(r.json())['nextPageToken']

'CGQ'

In [129]:
from time import sleep

In [156]:
df=pd.DataFrame(columns=['statement', 'label', 'date', 'speaker'])

In [202]:
def process_claimentries(json_input):
    
    claims_list = []
    if not dict(json_input).get('claims'):
        return []
    
    for claim in dict(json_input)['claims']:
        dictentry = {
            'statement': claim['text'],
            'label':     claim['claimReview'][0]['textualRating'],
            'date':      datetime.datetime.strptime(claim.get('claimDate'), 
                                                    '%Y-%m-%dT%H:%M:%SZ') if claim.get('claimDate') else None,
            'speaker':   claim.get('claimant')
        }
        claims_list.append(dictentry)
    
    return claims_list
    
def factchecks_bysite(factchecksite, num_entries, app_key, chunk_size=100, language='en', verbose=True):
    
    entries_left = num_entries
    
    first_request = {'languageCode': language,
                     'reviewPublisherSiteFilter': factchecksite, 
                     'pageSize': min(chunk_size, num_entries),
                     'key': app_key }
    entries_left -= chunk_size
    
    if verbose:
        print(f'Retrieving first {chunk_size} claims out of {num_entries}, {entries_left} claims left.')
    
    r = requests.get(url = "https://factchecktools.googleapis.com/v1alpha1/claims:search", params = first_request)
    
    df_list = [pd.DataFrame(process_claimentries(r.json()))]
        
    while (entries_left > 0) and dict(r.json()).get('nextPageToken'):
        print('Done. Pausing for 5 seconds.')
        sleep(5)
        
        next_request = {'languageCode': 'en',
                        'reviewPublisherSiteFilter': 'politifact.com', 
                        'pageSize': min(chunk_size, entries_left),
                        'pageToken': dict(r.json())['nextPageToken'],
                        'key': key }
        
        entries_left -= chunk_size
        print(f'Retrieving next {chunk_size} claims out of {num_entries}, {entries_left} claims left.')
        
        r = requests.get(url = "https://factchecktools.googleapis.com/v1alpha1/claims:search", params = next_request)
        
        df_list.append(pd.DataFrame(process_claimentries(r.json())))

    print('All done.')
    return pd.concat(df_list, ignore_index=True)

In [203]:
politifact_df = factchecks_bysite('politifact.com', 10, key, chunk_size=5)

Retrieving first 5 claims out of 10, 5 claims left.
Done. Pausing for 5 seconds.
Retrieving next 5 claims out of 10, 0 claims left.
All done.


In [204]:
politifact_df

Unnamed: 0,statement,label,date,speaker
0,“Research illustrates a clear correlation betw...,True,2020-05-21,Glenn Grothman
1,“There were not chemical irritants” used to cl...,Pants on Fire,2020-06-07,William Barr
2,A photo shows “the Lincoln Memorial. His head ...,False,2020-06-06,Viral image
3,“These ‘peaceful’ protesters burned these hors...,False,2020-06-01,Facebook posts
4,“The 9/11 Memorial in NYC was just defaced wit...,False,2020-06-05,Facebook posts
5,“She’s just casually carrying a body with 1 ha...,False,2020-06-03,Viral image
6,“Trump’s star on the Hollywood Walk of Fame ha...,False,2020-06-03,Viral image
7,"""Last year, there were nine unarmed black peop...",Mostly False,2020-06-02,Larry Elder
8,The man pictured in a mugshot is not Derek Cha...,False,2020-06-04,Viral image
9,“This man who was kicked and stomped” was kill...,False,2020-06-03,Facebook post
