In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/MyDrive/factchecker_bias

/content/drive/MyDrive/factchecker_bias


In [5]:
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import time
import pickle
import pandas as pd
import json

tqdm.pandas()

## Organization-based: Google fact check tools feed data (ClaimReview template)


In [33]:
o = json.load(open('data.json', 'r'))

In [61]:
feed_list = []
for i in tqdm(o['dataFeedElement']):
    try:
        if 'nytimes.com' in i['url'] or 'usatoday.com' in i['url'] or 'washingtonpost.com' in i['url'] or 'dispatch.com' in i['url'] or 'factcheck.org' in i['url']:
            feed_list.append(i)
    except:
        pass

  0%|          | 0/36552 [00:00<?, ?it/s]

In [62]:
def use_first(x):
    try:
        return pd.Series(x[0])
    except:
        return pd.Series(dtype='object')

df = pd.DataFrame(feed_list).drop(['@type'], axis=1)
df_item = df['item'].progress_apply(use_first)
df_reviewRating = df_item['reviewRating'].progress_apply(pd.Series).add_prefix('reviewRating_')

df_itemReviewed = df_item['itemReviewed'].progress_apply(pd.Series).add_prefix('itemReviewed_')
df_itemReviewed_appearance = df_itemReviewed['itemReviewed_appearance'].progress_apply(pd.Series).add_prefix('itemReviewed_appearance_')
df_itemReviewed_author = df_itemReviewed['itemReviewed_author'].progress_apply(pd.Series).add_prefix('itemReviewed_author_')
df_itemReviewed_firstAppearance = df_itemReviewed['itemReviewed_firstAppearance'].progress_apply(pd.Series).add_prefix('itemReviewed_firstAppearance_')
df_itemReviewed = df_itemReviewed.drop(['itemReviewed_appearance', 'itemReviewed_author', 'itemReviewed_firstAppearance'], axis=1)

df_itemReviewed = pd.concat([df_itemReviewed, df_itemReviewed_appearance, df_itemReviewed_author, df_itemReviewed_firstAppearance], axis=1)
df_sdPublisher = df_item['sdPublisher'].progress_apply(pd.Series).add_prefix('sdPublisher_')
df_author = df_item['author'].progress_apply(pd.Series).add_prefix('author_')
df_item = df_item.add_prefix('item_')

df_concat = pd.concat([df.drop(['item'], axis=1), 
                       df_item.drop(['item_reviewRating', 'item_itemReviewed', 'item_sdPublisher', 'item_author'], axis=1), 
                       df_reviewRating, df_itemReviewed, df_sdPublisher, df_author], axis=1)

  0%|          | 0/1935 [00:00<?, ?it/s]

  0%|          | 0/1935 [00:00<?, ?it/s]

  0%|          | 0/1935 [00:00<?, ?it/s]

  0%|          | 0/1935 [00:00<?, ?it/s]

  0%|          | 0/1935 [00:00<?, ?it/s]

  0%|          | 0/1935 [00:00<?, ?it/s]

  0%|          | 0/1935 [00:00<?, ?it/s]

  0%|          | 0/1935 [00:00<?, ?it/s]

In [51]:
df_concat.keys()

Index(['dateCreated', 'url', 'dateModified', 'item_@context', 'item_@type',
       'item_claimReviewed', 'item_url', 'item_datePublished',
       'reviewRating_@type', 'reviewRating_alternateName',
       'reviewRating_bestRating', 'reviewRating_image',
       'reviewRating_ratingExplanation', 'reviewRating_ratingValue',
       'reviewRating_worstRating', 'itemReviewed_0', 'itemReviewed_@type',
       'itemReviewed_datePublished', 'itemReviewed_name',
       'itemReviewed_appearance_0', 'itemReviewed_appearance_1',
       'itemReviewed_appearance_2', 'itemReviewed_appearance_3',
       'itemReviewed_author_0', 'itemReviewed_author_@type',
       'itemReviewed_author_image', 'itemReviewed_author_jobTitle',
       'itemReviewed_author_name', 'itemReviewed_firstAppearance_0',
       'itemReviewed_firstAppearance_@type',
       'itemReviewed_firstAppearance_url', 'sdPublisher_@type',
       'sdPublisher_name', 'sdPublisher_url', 'author_@type', 'author_image',
       'author_name', 'author

In [52]:
df_concat_final = df_concat[['dateCreated', 'url', 'dateModified', 'item_claimReviewed', 'item_datePublished', 
           'item_url', 'reviewRating_alternateName', 'reviewRating_bestRating', 'reviewRating_image',
       'reviewRating_ratingExplanation', 'reviewRating_ratingValue',
       'reviewRating_worstRating', 'author_name', 'author_url', 
           'itemReviewed_firstAppearance_url', 'itemReviewed_datePublished', 'itemReviewed_name', 
           'itemReviewed_author_@type', 'itemReviewed_author_jobTitle',
           'itemReviewed_author_name']]

In [None]:
df_concat_final['datePublished'] = df_concat_final['dateCreated']
df_concat_final.loc[pd.notnull(df_concat_final.dateModified), 'datePublished'] = df_concat_final.loc[pd.notnull(df_concat_final.dateModified), 'dateModified']
df_concat_final.loc[pd.notnull(df_concat_final.dateModified), 'datePublished'] = df_concat_final.loc[pd.notnull(df_concat_final.dateModified), 'item_datePublished']

In [55]:
df_concat_final = df_concat_final[['datePublished', 'url', 'item_claimReviewed', 
           'reviewRating_alternateName', 'reviewRating_bestRating', 'reviewRating_image',
       'reviewRating_ratingExplanation', 'reviewRating_ratingValue',
       'reviewRating_worstRating', 'author_name', 'author_url', 
           'itemReviewed_firstAppearance_url', 'itemReviewed_datePublished', 'itemReviewed_name', 
           'itemReviewed_author_jobTitle', 'itemReviewed_author_name']]

In [None]:
def get_page(url):
    header = {"User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/65.0.3325.181 Safari/537.36"}
    content = requests.get(url, headers=header).content
    return content

executor = ThreadPoolExecutor(25)
htmls = list(tqdm(executor.map(get_page, df_concat_final['url']), total=len(df_concat_final['url'])))
pickle.dump(htmls, open('newsmedia_combined_htmls2.pkl', 'wb'))

  0%|          | 0/1935 [00:00<?, ?it/s]

In [None]:
htmls = pickle.load(open('newsmedia_combined_htmls2.pkl', 'rb'))

In [None]:
fc_col = []
for html in tqdm(htmls):
    soup = BeautifulSoup(html, 'lxml')
    fc_list = []
    bin = False
    for i in soup.find_all(type="application/ld+json"):
        if '@type' in json.loads(i.get_text()):
            if json.loads(i.get_text())['@type'] == 'NewsArticle':
                author = json.loads(i.get_text())['author']
                if type(author) == dict:
                    name = author['name']
                    if ', USA TODAY' in name:
                        name = name.replace(', USA TODAY', '')
                    if ' and ' in name:
                        name = name.replace(' and ', ',')
                    if ', ' in name:
                        name = name.replace(', ', ',')
                    fc_col.append(name)
                elif type(author) == list:
                    for j in author:
                        fc_list.append(j['name'])
                    if fc_list[0] == 'The New York Times':
                        fc_col.append(None)
                    else:
                        fc_col.append(','.join(fc_list))
                else:
                    print('error')
                
                bin = True
                break
        else:
            for j in json.loads(i.get_text())['@graph']:
                if j['@type'] == ['Person']:
                    yeah = True
                    author = j
                    if type(author) == dict:
                        name = author['name']
                        if ', USA TODAY' in name:
                            name = name.replace(', USA TODAY', '')
                        if ' and ' in name:
                            name = name.replace(' and ', ',')
                        if ', ' in name:
                            name = name.replace(', ', ',')
                        fc_col.append(name)
                    elif type(author) == list:
                        for j in author:
                            fc_list.append(j['name'])
                        if fc_list[0] == 'The New York Times':
                            fc_col.append(None)
                        else:
                            fc_col.append(','.join(fc_list))
                    else:
                        print('error')
                    bin = True
                    break


    if not bin:
        fc_col.append(None)

  0%|          | 0/1935 [00:00<?, ?it/s]

In [None]:
df_concat_final['factchecker'] = fc_col
df_concat_final.to_csv('df_concat_final.csv', index=False)

In [31]:
df_concat_final = pd.read_csv('df_concat_final.csv')

In [32]:
df_concat_final.head()

Unnamed: 0,datePublished,url,item_claimReviewed,reviewRating_alternateName,reviewRating_bestRating,reviewRating_image,reviewRating_ratingExplanation,reviewRating_ratingValue,reviewRating_worstRating,author_name,author_url,itemReviewed_firstAppearance_url,itemReviewed_datePublished,itemReviewed_name,itemReviewed_author_jobTitle,itemReviewed_author_name,factchecker
0,2022-05-06T15:36:19.682857+00:00,https://www.factcheck.org/2022/05/biden-hasnt-...,"President Joe Biden has ""[s]topped abortion.""",False,10.0,https://www.factstream.co/rails/active_storage...,A leaked draft opinion suggests that the Supre...,6.0,0.0,FactCheck.org,https://www.factcheck.org/,,2022-05-03,,-,Meme,Saranac Hale Spencer
1,2022-05-06T11:56:02Z,https://www.washingtonpost.com/politics/2022/0...,"""In Texas, Republicans passed a law allowing r...",Four Pinocchios,5.0,https://www.factstream.co/rails/active_storage...,The law does not permit rapists to sue their v...,0.0,0.0,Washington Post,https://www.washingtonpost.com/,https://twitter.com/AOC/status/152010807015153...,2022-04-29,in a tweet,Member of the House (D-N.Y.),Alexandria Ocasio-Cortez (,Glenn Kessler
2,2022-05-05T19:57:38.365097+00:00,https://www.factcheck.org/2022/05/desantis-vs-...,“The bonds will be paid by Disney. They will b...,Unclear What Impact Will Be,,,A state Senate analysis of the legislation sai...,,,FactCheck.org,https://www.factcheck.org/,https://rumble.com/v12v53k-florida-and-the-ame...,2022-04-28,Fox News,Florida Governor,Ron DeSantis,D'Angelo Gore
3,2022-05-05T11:33:09.269118+00:00,https://www.washingtonpost.com/politics/2022/0...,“You know what happens to these individuals? T...,Three Pinocchios,5.0,https://www.factstream.co/rails/active_storage...,The numbers disprove his claim,1.0,0.0,Washington Post,https://www.washingtonpost.com/,https://www.washingtonpost.com/politics/2022/0...,2022-05-01,in an interview on Fox News Sunday,Homeland Secretary Secretary,Alejandro Mayorkas,Glenn Kessler
4,2022-05-04T21:50:09.905186+00:00,https://www.factcheck.org/2022/05/unfounded-cl...,"A rash of ""mysterious"" fires at food-processin...",Fires Not Unusual,,,The number of food-processing plant fires in 2...,,,FactCheck.org,https://www.factcheck.org/,,2022-04-21,,-,Social media posts,Saranac Hale Spencer


# Organization-based: Web crawling

## PolitiFact

In [102]:
url_list = []
def get_list(i):
    while True:
        header = {"User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/65.0.3325.181 Safari/537.36"}
        prefix = 'https://www.politifact.com'
        url = f'https://www.politifact.com/factchecks/list/?page={i}'
        url_list = []
        listpage = requests.get(url, headers=header)
        listsoup = BeautifulSoup(listpage.content, 'lxml')
        
        for link in listsoup.find_all(class_="m-statement__quote"): 
            url_list.append(prefix + link.select('a')[0]['href'])
        return url_list

executor = ThreadPoolExecutor(10)

url_list += list(tqdm(executor.map(get_list, range(1, 709)), total=708))

  0%|          | 0/708 [00:00<?, ?it/s]

In [104]:
urls = [item for sublist in url_list for item in sublist]

In [None]:
def get_page(url):
    header = {"User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/65.0.3325.181 Safari/537.36"}
    content = requests.get(url, headers=header).content
    return content

executor = ThreadPoolExecutor(25)
htmls = list(tqdm(executor.map(get_page, urls), total=len(urls)))

  0%|          | 0/21150 [00:00<?, ?it/s]

In [None]:
pickle.dump(urls, open('politifact_urls.pkl', 'wb'))
pickle.dump(htmls, open('politifact_htmls.pkl', 'wb'))

In [None]:
urls = pickle.load(open('politifact_urls.pkl', 'rb'))
htmls = pickle.load(open('politifact_htmls.pkl', 'rb'))

In [None]:
from concurrent.futures import ProcessPoolExecutor

def process(input):
    url, html = input
    soup_sub = BeautifulSoup(html, 'lxml')
    date = url[url.index('/factchecks/') + len('/factchecks/'):]
    year = date[:date.index('/')]
    date = date[date.index('/')+1:]
    month = date[:date.index('/')]
    month_convert = {'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04',
                     'may': '05', 'jun': '06', 'jul': '07', 'aug': '08',
                     'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12'}
    month = month_convert[month]
    date = date[date.index('/')+1:]
    day = date[:date.index('/')]
    datePublished = f'{year}-{month}-{day}'

    context = soup_sub.find(class_='m-statement__desc').get_text().strip()
    context = context[context.index('on ')+len('on '):]
    itemReviewed_datePublished = context[:context.index(', ')+6]
    itemReviewed_name = context[context.index(', ')+6:]
    if ':' in itemReviewed_name:
        itemReviewed_name = itemReviewed_name.replace(':', '')
    try:
        factchecker_list = []
        for factchecker_element in soup_sub.find_all(class_='m-author__content'):
            for factchecker_link in factchecker_element.select('a'):
                factchecker_list.append(factchecker_link.get_text().strip())
    except AttributeError:
        print(url)
        factchecker = None
    
    month_convert = {'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05', 'June': '06', 'July': '07', 'August':'08', 'September': '09', 'October': '10', 'November': '11', 'December': '12'}
    month = itemReviewed_datePublished[:itemReviewed_datePublished.index(' ')]
    month = month_convert[month]
    day = itemReviewed_datePublished[itemReviewed_datePublished.index(' ')+1:itemReviewed_datePublished.index(', ')]
    year = itemReviewed_datePublished[itemReviewed_datePublished.index(', ')+2:]
    day = format(int(day), '02')
    itemReviewed_datePublished = f'{year}-{month}-{day}'

    rating_dict = {'https://static.politifact.com/politifact/rulings/meter-false.jpg': 'False',
                    'https://static.politifact.com/politifact/rulings/meter-full-flop.jpg': "Full Flop",
                    'https://static.politifact.com/politifact/rulings/meter-half-flip.jpg': 'Half Flip',
                    'https://static.politifact.com/politifact/rulings/meter-half-true.jpg': 'Half True',
                    'https://static.politifact.com/politifact/rulings/meter-mostly-false.jpg': 'Mostly False',
                    'https://static.politifact.com/politifact/rulings/meter-mostly-true.jpg': 'Mostly True',
                    'https://static.politifact.com/politifact/rulings/meter-no-flip.jpg': 'No Flip',
                    'https://static.politifact.com/politifact/rulings/meter-true.jpg': 'True',
                    'https://static.politifact.com/politifact/rulings/tom_ruling_pof.png': 'Pants on Fire'}


    return {'datePublished': datePublished, 
                    'url': url, 
                    'item_claimReviewed': soup_sub.find(class_='m-statement__quote').get_text().strip(),
                    'reviewRating_alternateName': rating_dict[soup_sub.find(property="og:image")['content']],
                    'reviewRating_image': soup_sub.find(property="og:image")['content'],
                    'author_name': 'PolitiFact', 
                    'author_url': 'https://www.politifact.com/', 
                    'itemReviewed_datePublished': itemReviewed_datePublished,
                    'itemReviewed_name': itemReviewed_name.strip(),
                    'itemReviewed_author_name': soup_sub.find(class_='m-statement__name').get_text().strip(),
                    'factchecker': ','.join(factchecker_list)}

executor = ProcessPoolExecutor(8)
results = list(tqdm(executor.map(process, list(zip(urls, htmls))), total=len(urls)))

  0%|          | 0/21150 [00:00<?, ?it/s]

In [None]:
df_politifact = pd.DataFrame(results)

In [None]:
rating_dict = {'https://static.politifact.com/politifact/rulings/meter-false.jpg': 'False',
                'https://static.politifact.com/politifact/rulings/meter-full-flop.jpg': "Full Flop",
                'https://static.politifact.com/politifact/rulings/meter-half-flip.jpg': 'Half Flip',
                'https://static.politifact.com/politifact/rulings/meter-half-true.jpg': 'Half True',
                'https://static.politifact.com/politifact/rulings/meter-mostly-false.jpg': 'Mostly False',
                'https://static.politifact.com/politifact/rulings/meter-mostly-true.jpg': 'Mostly True',
                'https://static.politifact.com/politifact/rulings/meter-no-flip.jpg': 'No Flip',
                'https://static.politifact.com/politifact/rulings/meter-true.jpg': 'True',
                'https://static.politifact.com/politifact/rulings/tom_ruling_pof.png': 'Pants on Fire'}

df_politifact['reviewRating_alternateName'] = df_politifact.reviewRating_image.progress_apply(lambda x: rating_dict[x])

  0%|          | 0/21150 [00:00<?, ?it/s]

In [None]:
df_politifact

Unnamed: 0,datePublished,url,item_claimReviewed,reviewRating_alternateName,reviewRating_image,author_name,author_url,itemReviewed_datePublished,itemReviewed_name,itemReviewed_author_name,factchecker
0,2022-05-07,https://www.politifact.com/factchecks/2022/may...,When the New York State Senate voted to legali...,True,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-05-03,in a news conference,Andrea Stewart-Cousins,Jill Terreri Ramos
1,2022-05-06,https://www.politifact.com/factchecks/2022/may...,When there is “a noose on a college dorm of a ...,False,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-04-26,in a radio show segment,Dennis Prager,Bill McCarthy
2,2022-05-06,https://www.politifact.com/factchecks/2022/may...,“If you earn $100 and pay $33 income tax you’r...,Mostly False,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-04-09,in a photo on Facebook,Facebook posts,Sara Swann
3,2022-05-06,https://www.politifact.com/factchecks/2022/may...,“Military Arrests Biden’s Sec. of Agriculture ...,Pants on Fire,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-04-29,in a story,Bloggers,Samantha Putterman
4,2022-05-06,https://www.politifact.com/factchecks/2022/may...,“New York leads the U.S. in population loss.”,True,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-04-12,in a tweet,Lee Zeldin,"Elizabeth Egan,Marnique O. Panepento"
...,...,...,...,...,...,...,...,...,...,...,...
21145,2007-11-10,https://www.politifact.com/factchecks/2007/nov...,"""All of the records, as far as I know, about w...",Mostly False,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2007-10-30,in Philadelphia,Hillary Clinton,Angie Drobnic Holan
21146,2007-11-10,https://www.politifact.com/factchecks/2007/nov...,"""My husband has not withheld a single document.""",Half True,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2007-11-04,"in Clinton, Iowa",Hillary Clinton,Angie Drobnic Holan
21147,2007-11-09,https://www.politifact.com/factchecks/2007/nov...,"""Our children's safety is potentially at risk ...",Mostly True,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2007-10-29,"in a speech in Manchester, N.H.",John Edwards,Tom Tobin
21148,2007-11-08,https://www.politifact.com/factchecks/2007/nov...,"Obama ""refused to not only put his hand on his...",False,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2007-11-08,in an e-mail circulated by many people.,Chain email,Bill Adair


In [None]:
rating_dict = {'Pants on Fire': 0, 'False': 1, 'Mostly False': 2, 'Half True': 3, 'Mostly True': 4, 'True': 5, 'No Flip': None, 'Half Flip': None, 'Full Flop': None}
df_politifact['reviewRating_ratingValue'] = df_politifact['reviewRating_alternateName'].apply(lambda x: rating_dict[x])
df_politifact['reviewRating_bestRating'] = 5
df_politifact['reviewRating_worstRating'] = 0

In [None]:
df_politifact

Unnamed: 0,datePublished,url,item_claimReviewed,reviewRating_alternateName,reviewRating_image,author_name,author_url,itemReviewed_datePublished,itemReviewed_name,itemReviewed_author_name,factchecker,reviewRating_ratingValue,reviewRating_bestRating,reviewRating_worstRating
0,2022-05-07,https://www.politifact.com/factchecks/2022/may...,When the New York State Senate voted to legali...,True,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-05-03,in a news conference,Andrea Stewart-Cousins,Jill Terreri Ramos,5.0,5,0
1,2022-05-06,https://www.politifact.com/factchecks/2022/may...,When there is “a noose on a college dorm of a ...,False,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-04-26,in a radio show segment,Dennis Prager,Bill McCarthy,1.0,5,0
2,2022-05-06,https://www.politifact.com/factchecks/2022/may...,“If you earn $100 and pay $33 income tax you’r...,Mostly False,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-04-09,in a photo on Facebook,Facebook posts,Sara Swann,2.0,5,0
3,2022-05-06,https://www.politifact.com/factchecks/2022/may...,“Military Arrests Biden’s Sec. of Agriculture ...,Pants on Fire,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-04-29,in a story,Bloggers,Samantha Putterman,0.0,5,0
4,2022-05-06,https://www.politifact.com/factchecks/2022/may...,“New York leads the U.S. in population loss.”,True,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-04-12,in a tweet,Lee Zeldin,"Elizabeth Egan,Marnique O. Panepento",5.0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21145,2007-11-10,https://www.politifact.com/factchecks/2007/nov...,"""All of the records, as far as I know, about w...",Mostly False,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2007-10-30,in Philadelphia,Hillary Clinton,Angie Drobnic Holan,2.0,5,0
21146,2007-11-10,https://www.politifact.com/factchecks/2007/nov...,"""My husband has not withheld a single document.""",Half True,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2007-11-04,"in Clinton, Iowa",Hillary Clinton,Angie Drobnic Holan,3.0,5,0
21147,2007-11-09,https://www.politifact.com/factchecks/2007/nov...,"""Our children's safety is potentially at risk ...",Mostly True,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2007-10-29,"in a speech in Manchester, N.H.",John Edwards,Tom Tobin,4.0,5,0
21148,2007-11-08,https://www.politifact.com/factchecks/2007/nov...,"Obama ""refused to not only put his hand on his...",False,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2007-11-08,in an e-mail circulated by many people.,Chain email,Bill Adair,1.0,5,0


In [190]:
df_politifact.to_csv('df_politifact.csv', index=False)

### Merge with Google data

In [125]:
df_politifact = pd.read_csv('df_politifact.csv')

In [63]:
feed_list = []
for i in tqdm(o['dataFeedElement']):
    try:
        if 'politifact.com' in i['url']:
            feed_list.append(i)
    except:
        pass

  0%|          | 0/36552 [00:00<?, ?it/s]

In [64]:
def use_first(x):
    try:
        return pd.Series(x[0])
    except:
        return pd.Series(dtype='object')

df = pd.DataFrame(feed_list).drop(['@type'], axis=1)
df_item = df['item'].progress_apply(use_first)
df_reviewRating = df_item['reviewRating'].progress_apply(pd.Series).add_prefix('reviewRating_')

df_itemReviewed = df_item['itemReviewed'].progress_apply(pd.Series).add_prefix('itemReviewed_')
df_itemReviewed_appearance = df_itemReviewed['itemReviewed_appearance'].progress_apply(pd.Series).add_prefix('itemReviewed_appearance_')
df_itemReviewed_author = df_itemReviewed['itemReviewed_author'].progress_apply(pd.Series).add_prefix('itemReviewed_author_')
df_itemReviewed_firstAppearance = df_itemReviewed['itemReviewed_firstAppearance'].progress_apply(pd.Series).add_prefix('itemReviewed_firstAppearance_')
df_itemReviewed = df_itemReviewed.drop(['itemReviewed_appearance', 'itemReviewed_author', 'itemReviewed_firstAppearance'], axis=1)

df_itemReviewed = pd.concat([df_itemReviewed, df_itemReviewed_appearance, df_itemReviewed_author, df_itemReviewed_firstAppearance], axis=1)
df_sdPublisher = df_item['sdPublisher'].progress_apply(pd.Series).add_prefix('sdPublisher_')
df_author = df_item['author'].progress_apply(pd.Series).add_prefix('author_')
df_item = df_item.add_prefix('item_')

df_concat = pd.concat([df.drop(['item'], axis=1), 
                       df_item.drop(['item_reviewRating', 'item_itemReviewed', 'item_sdPublisher', 'item_author'], axis=1), 
                       df_reviewRating, df_itemReviewed, df_sdPublisher, df_author], axis=1)

  0%|          | 0/4067 [00:00<?, ?it/s]

  0%|          | 0/4067 [00:00<?, ?it/s]

  0%|          | 0/4067 [00:00<?, ?it/s]

  0%|          | 0/4067 [00:00<?, ?it/s]

  0%|          | 0/4067 [00:00<?, ?it/s]

  0%|          | 0/4067 [00:00<?, ?it/s]

  0%|          | 0/4067 [00:00<?, ?it/s]

  0%|          | 0/4067 [00:00<?, ?it/s]

In [65]:
df_concat_final_politifact = df_concat[['dateCreated', 'url', 'dateModified', 'item_claimReviewed', 'item_datePublished', 
           'item_url', 'reviewRating_alternateName', 'reviewRating_bestRating', 'reviewRating_image',
       'reviewRating_ratingExplanation', 'reviewRating_ratingValue',
       'reviewRating_worstRating', 'author_name', 'author_url', 
           'itemReviewed_firstAppearance_url', 'itemReviewed_datePublished', 'itemReviewed_name', 
           'itemReviewed_author_@type', 'itemReviewed_author_jobTitle',
           'itemReviewed_author_name']]

In [66]:
df_concat_final_politifact['datePublished'] = df_concat_final_politifact['dateCreated']
df_concat_final_politifact.loc[pd.notnull(df_concat_final_politifact.dateModified), 'datePublished'] = df_concat_final_politifact.loc[pd.notnull(df_concat_final_politifact.dateModified), 'dateModified']
df_concat_final_politifact.loc[pd.notnull(df_concat_final_politifact.dateModified), 'datePublished'] = df_concat_final_politifact.loc[pd.notnull(df_concat_final_politifact.dateModified), 'item_datePublished']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [68]:
df_concat_final_politifact = df_concat_final_politifact[['datePublished', 'url', 'item_claimReviewed', 
           'reviewRating_alternateName', 'reviewRating_bestRating', 'reviewRating_image',
       'reviewRating_ratingExplanation', 'reviewRating_ratingValue',
       'reviewRating_worstRating', 'author_name', 'author_url', 
           'itemReviewed_firstAppearance_url', 'itemReviewed_datePublished', 'itemReviewed_name', 
           'itemReviewed_author_jobTitle', 'itemReviewed_author_name']]

In [126]:
to_be_merged = df_concat_final_politifact[['url', 'reviewRating_ratingExplanation', 'itemReviewed_firstAppearance_url', 'itemReviewed_author_jobTitle']]

In [128]:
df_politifact_merged = df_politifact.merge(to_be_merged, on='url', how='left')

In [129]:
df_politifact_merged.to_csv('df_politifact_merged.csv', index=False)

In [130]:
df_politifact_merged

Unnamed: 0,datePublished,url,item_claimReviewed,reviewRating_alternateName,reviewRating_image,author_name,author_url,itemReviewed_datePublished,itemReviewed_name,itemReviewed_author_name,factchecker,reviewRating_ratingValue,reviewRating_bestRating,reviewRating_worstRating,reviewRating_ratingExplanation,itemReviewed_firstAppearance_url,itemReviewed_author_jobTitle
0,2022-05-07,https://www.politifact.com/factchecks/2022/may...,When the New York State Senate voted to legali...,True,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-05-03,in a news conference,Andrea Stewart-Cousins,Jill Terreri Ramos,5.0,5,0,"In 1970, when New York state legalized abortio...",https://www.facebook.com/watch/?v=326031746270...,New York State Senate Majority Leader
1,2022-05-06,https://www.politifact.com/factchecks/2022/may...,When there is “a noose on a college dorm of a ...,False,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-04-26,in a radio show segment,Dennis Prager,Bill McCarthy,1.0,5,0,,,Radio host
2,2022-05-06,https://www.politifact.com/factchecks/2022/may...,“If you earn $100 and pay $33 income tax you’r...,Mostly False,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-04-09,in a photo on Facebook,Facebook posts,Sara Swann,2.0,5,0,,,
3,2022-05-06,https://www.politifact.com/factchecks/2022/may...,“Military Arrests Biden’s Sec. of Agriculture ...,Pants on Fire,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-04-29,in a story,Bloggers,Samantha Putterman,0.0,5,0,,,
4,2022-05-06,https://www.politifact.com/factchecks/2022/may...,“New York leads the U.S. in population loss.”,True,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2022-04-12,in a tweet,Lee Zeldin,"Elizabeth Egan,Marnique O. Panepento",5.0,5,0,Census data shows that New York’s population d...,https://twitter.com/leezeldin/status/151385351...,"Gubernatorial candidate, R-N.Y."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21145,2007-11-10,https://www.politifact.com/factchecks/2007/nov...,"""All of the records, as far as I know, about w...",Mostly False,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2007-10-30,in Philadelphia,Hillary Clinton,Angie Drobnic Holan,2.0,5,0,,,
21146,2007-11-10,https://www.politifact.com/factchecks/2007/nov...,"""My husband has not withheld a single document.""",Half True,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2007-11-04,"in Clinton, Iowa",Hillary Clinton,Angie Drobnic Holan,3.0,5,0,,,
21147,2007-11-09,https://www.politifact.com/factchecks/2007/nov...,"""Our children's safety is potentially at risk ...",Mostly True,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2007-10-29,"in a speech in Manchester, N.H.",John Edwards,Tom Tobin,4.0,5,0,,,
21148,2007-11-08,https://www.politifact.com/factchecks/2007/nov...,"Obama ""refused to not only put his hand on his...",False,https://static.politifact.com/politifact/rulin...,PolitiFact,https://www.politifact.com/,2007-11-08,in an e-mail circulated by many people.,Chain email,Bill Adair,1.0,5,0,,,


## Lead stories

In [None]:
url_list = []
def get_list(i):
    while True:
        header = {"User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/65.0.3325.181 Safari/537.36"}
        prefix = 'https://leadstories.com/hoax-alert/'
        url_list = []
        url = prefix + i
        listpage = requests.get(url, headers=header)
        listsoup = BeautifulSoup(listpage.content, 'lxml')
        for link in listsoup.select('a')[5:]:
            url_list.append(prefix + i + link['href'])
        return url_list

executor = ThreadPoolExecutor(10)

ym = []
for y in range(2018, 2023):
    if y == 2022:
        for m in range(1,6):
            ym.append(str(y)+'/'+str(m)+'/')
    else:
        for m in range(1,13):
            ym.append(str(y)+'/'+str(m)+'/')
url_list += list(tqdm(executor.map(get_list, ym), total=len(ym)))

In [None]:
urls = [item for sublist in url_list for item in sublist]

In [None]:
def get_page(url):
    header = {"User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/65.0.3325.181 Safari/537.36"}
    content = requests.get(url, headers=header).content
    return content

executor = ThreadPoolExecutor(25)
htmls = list(tqdm(executor.map(get_page, urls), total=len(urls)))

  0%|          | 0/8843 [00:00<?, ?it/s]

In [None]:
pickle.dump(urls, open('leadstories_urls.pkl', 'wb'))
pickle.dump(htmls, open('leadstories_htmls.pkl', 'wb'))

In [6]:
urls = pickle.load(open('leadstories_urls.pkl', 'rb'))
htmls = pickle.load(open('leadstories_htmls.pkl', 'rb'))

In [20]:
from concurrent.futures import ProcessPoolExecutor

feed_list = []
for url, html in tqdm(list(zip(urls, htmls))):
    soup_sub = BeautifulSoup(html, 'lxml')
    for i in soup_sub.find_all(type='application/ld+json'):
        if 'ClaimReview' in str(i):
            try:
                data = json.loads(i.get_text())
            except:
                data = {}
    try:
        feed_list.append(data)
    except:
        feed_list.append({})

  0%|          | 0/8843 [00:00<?, ?it/s]

In [22]:
feed_list[0]

{'@context': 'http://schema.org',
 '@type': 'ClaimReview',
 'author': {'@type': 'Organization',
  'name': 'Lead Stories LLC',
  'url': 'https://leadstories.com'},
 'claimReviewed': 'Ukrainian troops Attack Russia armored vehicle Convoy with Dozens of UK sent Tanks',
 'datePublished': '2022-05-04T13:40:00-07:00',
 'description': 'Did Ukrainian troops attack a Russian armored vehicle convoy with dozens of "U.K.-sent tanks"? No, that\'s not true: The video posted, which is filled with footage of tanks, has absolutely no footage of Ukrainian troops attacking a Russian convoy. The...',
 'itemReviewed': {'@type': 'CreativeWork',
  'author': {'@type': 'Organization',
   'name': 'facebook.com',
   'sameAs': ['https://www.facebook.com/watch/?v=1062136197982702']},
  'datePublished': '2022-05-04T13:40:00-07:00',
  'keywords': ''},
 'reviewRating': {'@type': 'Rating',
  'alternateName': 'No Such Attack',
  'bestRating': '5',
  'ratingValue': '1',
  'worstRating': '1'},
 'url': 'https://leadstorie

In [29]:
df_item = pd.DataFrame(feed_list).drop(['@type'], axis=1)
df_reviewRating = df_item['reviewRating'].progress_apply(pd.Series).add_prefix('reviewRating_')

df_itemReviewed = df_item['itemReviewed'].progress_apply(pd.Series).add_prefix('itemReviewed_')
df_itemReviewed_author = df_itemReviewed['itemReviewed_author'].progress_apply(pd.Series).add_prefix('itemReviewed_author_')
df_itemReviewed = df_itemReviewed.drop('itemReviewed_author', axis=1)

df_itemReviewed = pd.concat([df_itemReviewed, df_itemReviewed_author], axis=1)
df_author = df_item['author'].progress_apply(pd.Series).add_prefix('author_')
df_item = df_item.add_prefix('item_')

df_concat_lead_stories = pd.concat([df_item.drop(['item_reviewRating', 'item_itemReviewed', 'item_author'], axis=1), 
                       df_reviewRating, df_itemReviewed, df_author], axis=1)

  0%|          | 0/8843 [00:00<?, ?it/s]

  0%|          | 0/8843 [00:00<?, ?it/s]

  0%|          | 0/8843 [00:00<?, ?it/s]

  0%|          | 0/8843 [00:00<?, ?it/s]

In [40]:
fc_col = []
for html in tqdm(htmls):
    soup = BeautifulSoup(html, 'lxml')
    fc_list = []
    bin = False
    for i in soup.find_all(type="application/ld+json"):
        if '@type' in json.loads(i.get_text()):
            if json.loads(i.get_text())['@type'] == 'NewsArticle':
                author = json.loads(i.get_text())['author']
                if type(author) == dict:
                    name = author['name']
                    if ', USA TODAY' in name:
                        name = name.replace(', USA TODAY', '')
                    if ' and ' in name:
                        name = name.replace(' and ', ',')
                    if ', ' in name:
                        name = name.replace(', ', ',')
                    fc_col.append(name)
                elif type(author) == list:
                    for j in author:
                        fc_list.append(j['name'])
                    if fc_list[0] == 'The New York Times':
                        fc_col.append(None)
                    else:
                        fc_col.append(','.join(fc_list))
                else:
                    print('error')
                
                bin = True
                break
        else:
            for j in json.loads(i.get_text())['@graph']:
                if j['@type'] == ['Person']:
                    yeah = True
                    author = j
                    if type(author) == dict:
                        name = author['name']
                        if ', USA TODAY' in name:
                            name = name.replace(', USA TODAY', '')
                        if ' and ' in name:
                            name = name.replace(' and ', ',')
                        if ', ' in name:
                            name = name.replace(', ', ',')
                        fc_col.append(name)
                    elif type(author) == list:
                        for j in author:
                            fc_list.append(j['name'])
                        if fc_list[0] == 'The New York Times':
                            fc_col.append(None)
                        else:
                            fc_col.append(','.join(fc_list))
                    else:
                        print('error')
                    bin = True
                    break


    if not bin:
        fc_col.append(None)

  0%|          | 0/8843 [00:00<?, ?it/s]

In [41]:
df_concat_lead_stories['factchecker'] = fc_col
df_concat_lead_stories.to_csv('df_concat_lead_stories.csv', index=False)

In [42]:
df_concat_lead_stories

Unnamed: 0,item_@context,item_datePublished,item_url,item_description,item_claimReviewed,reviewRating_0,reviewRating_@type,reviewRating_alternateName,reviewRating_bestRating,reviewRating_ratingValue,...,itemReviewed_author_0,itemReviewed_author_@type,itemReviewed_author_name,itemReviewed_author_sameAs,author_0,author_@type,author_Name,author_name,author_url,factchecker
0,http://schema.org,2022-05-04T13:40:00-07:00,https://leadstories.com/hoax-alert/2022/05/fac...,Did Ukrainian troops attack a Russian armored ...,Ukrainian troops Attack Russia armored vehicle...,,Rating,No Such Attack,5,1,...,,Organization,facebook.com,[https://www.facebook.com/watch/?v=10621361979...,,Organization,,Lead Stories LLC,https://leadstories.com,Maarten Schenk
1,http://schema.org,2022-05-04T13:40:00-07:00,https://leadstories.com/hoax-alert/2022/05/fac...,Did Ukrainian troops attack a Russian armored ...,Ukrainian troops Attack Russia armored vehicle...,,Rating,No Such Attack,5,1,...,,Organization,facebook.com,[https://www.facebook.com/watch/?v=10621361979...,,Organization,,Lead Stories LLC,https://leadstories.com,Maarten Schenk
2,http://schema.org,2018-03-18T12:26:26-07:00,https://leadstories.com/hoax-alert/2018/03/fak...,"Did 21-year-old Alexandra Fox from Cincinnati,...",21-year-old nymphomaniac stalked hundreds for ...,,Rating,False,5,1,...,,Organization,worldnewsdailyreport.com,[http://worldnewsdailyreport.com/21-year-old-n...,,Organization,,Lead Stories LLC,https://leadstories.com,Maarten Schenk
3,http://schema.org,2018-03-25T10:27:12-07:00,https://leadstories.com/hoax-alert/2018/03/fak...,Bad doggie! No 101-year-old woman gave birth t...,101-Year-Old Woman Gives Birth To Her 17th Bab...,,Rating,False,5,1,...,,Organization,puppieswoof.com,[https://www.puppieswoof.com/video.php?id=1016],,Organization,,Lead Stories LLC,https://leadstories.com,Maarten Schenk
4,http://schema.org,2018-03-27T14:49:31-07:00,https://leadstories.com/hoax-alert/2018/03/fak...,Are 500 million red bulls killed each year to ...,500 Million Red Bulls Slaughtered Annually To ...,,Rating,False,5,1,...,,Organization,waterfordwhispersnews.com,[http://waterfordwhispersnews.com/2018/03/27/5...,,Organization,,Lead Stories LLC,https://leadstories.com,Maarten Schenk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8838,http://schema.org,2022-05-04T10:26:00-07:00,https://leadstories.com/hoax-alert/2022/05/fac...,Did a demonstration show that washing fresh st...,After a quick soak in tap water the strawberry...,,Rating,Misrepresented,5,1,...,,Organization,i.imgur.com,[https://i.imgur.com/qiXe23J.jpg],,Organization,,Lead Stories LLC,https://leadstories.com,Sarah Thompson
8839,http://schema.org,2022-05-04T10:42:00-07:00,https://leadstories.com/hoax-alert/2022/05/fac...,Did a Ukrainian drone drop dozens of bombs and...,Ukraine Dropped 'Dozens' Of Bombs From Drone O...,,Rating,Didn't Happen,5,1,...,,Organization,facebook.com,[https://www.facebook.com/102439232348511/vide...,,Organization,,Lead Stories LLC,https://leadstories.com,Christiana Dillard
8840,http://schema.org,2022-05-03T15:50:00-07:00,https://leadstories.com/hoax-alert/2022/05/fac...,Did the U.S. military destroy a Moderna COVID-...,Military Destroys Vaccine Warehouse | Real Raw...,,Rating,Phony Story,5,1,...,,Organization,realrawnews.com,[https://realrawnews.com/2022/05/military-dest...,,Organization,,Lead Stories LLC,https://leadstories.com,Alexis Tereszcuk
8841,http://schema.org,2022-05-03T11:37:00-07:00,https://leadstories.com/hoax-alert/2022/05/fac...,Are the ingredients put in vaccines toxic to p...,The ingredients put in vaccines are toxic to p...,,Rating,Ingredients OK,5,1,...,,Organization,facebook.com,[https://www.facebook.com/watch/?v=70107335072...,,Organization,,Lead Stories LLC,https://leadstories.com,Ed Payne


In [46]:
feed_list[0]

{'@context': 'https://schema.org',
 '@type': 'ClaimReview',
 'author': {'@type': 'Organization',
  'name': 'Newsweek',
  'url': 'https://www.newsweek.com'},
 'claimReviewed': "Asset Manager Blackstone Bought Americans' DNA",
 'datePublished': '2022-05-06T11:12:12-04:00',
 'itemReviewed': {'@type': 'Claim',
  'appearance': {'url': 'https://www.newsweek.com/fact-check-did-asset-manager-blackstone-buy-americans-dna-1704285'},
  'author': {'@type': 'Person', 'name': 'Reddit post'},
  'datePublished': '2022-05-06T11:12:12-04:00',
  'firstAppearance': 'https://www.newsweek.com/fact-check-did-asset-manager-blackstone-buy-americans-dna-1704285'},
 'reviewRating': {'@type': 'Rating', 'alternateName': 'False'},
 'url': 'https://www.newsweek.com/fact-check-did-asset-manager-blackstone-buy-americans-dna-1704285'}

## Check Your Fact

In [394]:
url_list = []
def get_list(i):
    while True:
        header = {"User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/65.0.3325.181 Safari/537.36"}
        url = f'https://checkyourfact.com/page/{i}/'
        url_list = []
        prefix = 'https://checkyourfact.com'
        listpage = requests.get(url, headers=header)
        listsoup = BeautifulSoup(listpage.content, 'lxml')
        for link in listsoup.select('articles')[0].select('a'):
            url_list.append(prefix + link['href'])
        return url_list

executor = ThreadPoolExecutor(25)

url_list += list(tqdm(executor.map(get_list, range(1, 165)), total=164))

  0%|          | 0/164 [00:00<?, ?it/s]

In [395]:
urls = [item for sublist in url_list for item in sublist]

In [396]:
len(urls)

3277

In [397]:
def get_page(url):
    header = {"User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/65.0.3325.181 Safari/537.36"}
    content = requests.get(url, headers=header).content
    return content

executor = ThreadPoolExecutor(25)
htmls = list(tqdm(executor.map(get_page, urls), total=len(urls)))

  0%|          | 0/3277 [00:00<?, ?it/s]

In [398]:
pickle.dump(urls, open('checkyourfact_urls.pkl', 'wb'))
pickle.dump(htmls, open('checkyourfact_htmls.pkl', 'wb'))

In [399]:
urls = pickle.load(open('checkyourfact_urls.pkl', 'rb'))
htmls = pickle.load(open('checkyourfact_htmls.pkl', 'rb'))

In [400]:
from concurrent.futures import ProcessPoolExecutor

feed_list = []
for url, html in tqdm(list(zip(urls, htmls))):
    soup_sub = BeautifulSoup(html, 'lxml')
    for i in soup_sub.find_all(type='application/ld+json'):
        if 'ClaimReview' in str(i):
            try:
                data = json.loads(i.get_text())
            except:
                data = {}
    try:
        feed_list.append(data)
    except:
        feed_list.append({})

  0%|          | 0/3277 [00:00<?, ?it/s]

In [401]:
feed_list[0]

{'@context': 'https://schema.org',
 '@type': 'ClaimReview',
 'author': {'@type': 'Organization',
  'name': 'Check Your Fact',
  'url': 'https://checkyourfact.com'},
 'claimReviewed': 'claims Plan B has been banned in Missouri.',
 'datePublished': '2022-05-09T22:00:14.000Z',
 'itemReviewed': {'@type': 'Claim',
  'appearance': 'https://www.facebook.com/1taysavfdb/posts/1038569063763230',
  'author': {'@type': 'Organization',
   'name': 'Viral Post',
   'sameAs': 'https://www.facebook.com/1taysavfdb/posts/1038569063763230'},
  'datePublished': '2022-05-08'},
 'reviewRating': {'@type': 'Rating',
  'alternateName': 'False',
  'bestRating': '3',
  'ratingValue': '1',
  'worstRating': '1'},
 'url': 'http://checkyourfact.com/2022/05/09/fact-check-plan-b-ban-missouri/'}

In [402]:
df_item = pd.DataFrame(feed_list).drop(['@type'], axis=1)
df_reviewRating = df_item['reviewRating'].progress_apply(pd.Series).add_prefix('reviewRating_')

df_itemReviewed = df_item['itemReviewed'].progress_apply(pd.Series).add_prefix('itemReviewed_')
df_itemReviewed_author = df_itemReviewed['itemReviewed_author'].progress_apply(pd.Series).add_prefix('itemReviewed_author_')
df_itemReviewed = df_itemReviewed.drop('itemReviewed_author', axis=1)

df_itemReviewed = pd.concat([df_itemReviewed, df_itemReviewed_author], axis=1)
df_author = df_item['author'].progress_apply(pd.Series).add_prefix('author_')
df_item = df_item.add_prefix('item_')

df_concat_checkyourfact = pd.concat([df_item.drop(['item_reviewRating', 'item_itemReviewed', 'item_author'], axis=1), 
                       df_reviewRating, df_itemReviewed, df_author], axis=1).drop('itemReviewed_appearance', axis=1)

  0%|          | 0/3277 [00:00<?, ?it/s]

  0%|          | 0/3277 [00:00<?, ?it/s]

  0%|          | 0/3277 [00:00<?, ?it/s]

  0%|          | 0/3277 [00:00<?, ?it/s]

In [405]:
fc_col = []
for html in tqdm(htmls):
    soup = BeautifulSoup(html, 'lxml')
    fc_list = []
    bin = False
    for i in soup.find_all(type="application/ld+json"):
        try:
            if '@type' in json.loads(i.get_text()):
                if json.loads(i.get_text())['@type'] == 'NewsArticle':
                    author = json.loads(i.get_text())['author']
                    if type(author) == dict:
                        name = author['name']
                        if ', USA TODAY' in name:
                            name = name.replace(', USA TODAY', '')
                        if ' and ' in name:
                            name = name.replace(' and ', ',')
                        if ', ' in name:
                            name = name.replace(', ', ',')
                        fc_col.append(name)
                    elif type(author) == list:
                        for j in author:
                            fc_list.append(j['name'])
                        if fc_list[0] == 'The New York Times':
                            fc_col.append(None)
                        else:
                            fc_col.append(','.join(fc_list))
                    else:
                        print('error')
                    
                    bin = True
                    break
            else:
                for j in json.loads(i.get_text())['@graph']:
                    if j['@type'] == ['Person']:
                        yeah = True
                        author = j
                        if type(author) == dict:
                            name = author['name']
                            if ', USA TODAY' in name:
                                name = name.replace(', USA TODAY', '')
                            if ' and ' in name:
                                name = name.replace(' and ', ',')
                            if ', ' in name:
                                name = name.replace(', ', ',')
                            fc_col.append(name)
                        elif type(author) == list:
                            for j in author:
                                fc_list.append(j['name'])
                            if fc_list[0] == 'The New York Times':
                                fc_col.append(None)
                            else:
                                fc_col.append(','.join(fc_list))
                        else:
                            print('error')
                        bin = True
                        break
        except:
            pass

    if not bin:
        fc_col.append(None)

  0%|          | 0/3277 [00:00<?, ?it/s]

In [407]:
df_concat_checkyourfact['factchecker'] = fc_col

In [409]:
df_concat_checkyourfact.to_csv('df_concat_checkyourfact.csv', index=False)

## Newsweek

In [410]:
url_list = []
def get_list(i):
    while True:
        header = {"User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/65.0.3325.181 Safari/537.36"}
        url = f'https://www.newsweek.com/topic/fact-check?page={i}'
        url_list = []
        prefix = 'https://www.newsweek.com'
        listpage = requests.get(url, headers=header)
        listsoup = BeautifulSoup(listpage.content, 'lxml')
        for link in listsoup.select('article'):
            url_list.append(prefix + link.select('a')[0]['href'])
        return url_list

executor = ThreadPoolExecutor(25)

url_list += list(tqdm(executor.map(get_list, range(1, 14)), total=13))

  0%|          | 0/13 [00:00<?, ?it/s]

In [411]:
urls = [item for sublist in url_list for item in sublist]

In [412]:
len(urls)

361

In [413]:
def get_page(url):
    header = {"User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/65.0.3325.181 Safari/537.36"}
    content = requests.get(url, headers=header).content
    return content

executor = ThreadPoolExecutor(25)
htmls = list(tqdm(executor.map(get_page, urls), total=len(urls)))

  0%|          | 0/361 [00:00<?, ?it/s]

In [414]:
pickle.dump(urls, open('newsweek_urls.pkl', 'wb'))
pickle.dump(htmls, open('newsweek_htmls.pkl', 'wb'))

In [415]:
urls = pickle.load(open('newsweek_urls.pkl', 'rb'))
htmls = pickle.load(open('newsweek_htmls.pkl', 'rb'))

In [416]:
from concurrent.futures import ProcessPoolExecutor

feed_list = []
for url, html in tqdm(list(zip(urls, htmls))):
    soup_sub = BeautifulSoup(html, 'lxml')
    for i in soup_sub.find_all(type='application/ld+json'):
        if 'ClaimReview' in str(i):
            try:
                data = json.loads(i.get_text())
            except:
                data = {}
    try:
        feed_list.append(data)
    except:
        feed_list.append({})

  0%|          | 0/361 [00:00<?, ?it/s]

In [417]:
feed_list[50]

{'@context': 'https://schema.org',
 '@type': 'ClaimReview',
 'author': {'@type': 'Organization',
  'name': 'Newsweek',
  'url': 'https://www.newsweek.com'},
 'claimReviewed': 'Kindergarteners in Portland Forced to Eat Outside Due to COVID Measures',
 'datePublished': '2021-12-13T14:23:26-05:00',
 'itemReviewed': {'@type': 'Claim',
  'appearance': {'url': 'https://www.newsweek.com/fact-check-were-kindergarteners-portland-forced-eat-outside-due-covid-measures-1658773'},
  'author': {'@type': 'Person', 'name': 'Katie Daviscourt'},
  'datePublished': '2021-12-13T14:23:26-05:00',
  'firstAppearance': 'https://www.newsweek.com/fact-check-were-kindergarteners-portland-forced-eat-outside-due-covid-measures-1658773'},
 'reviewRating': {'@type': 'Rating', 'alternateName': 'Mostly true'},
 'url': 'https://www.newsweek.com/fact-check-were-kindergarteners-portland-forced-eat-outside-due-covid-measures-1658773'}

In [418]:
df_item = pd.DataFrame(feed_list).drop(['@type'], axis=1)
df_reviewRating = df_item['reviewRating'].progress_apply(pd.Series).add_prefix('reviewRating_')

df_itemReviewed = df_item['itemReviewed'].progress_apply(pd.Series).add_prefix('itemReviewed_')
df_itemReviewed_author = df_itemReviewed['itemReviewed_author'].progress_apply(pd.Series).add_prefix('itemReviewed_author_')
df_itemReviewed = df_itemReviewed.drop('itemReviewed_author', axis=1)

df_itemReviewed = pd.concat([df_itemReviewed, df_itemReviewed_author], axis=1)
df_author = df_item['author'].progress_apply(pd.Series).add_prefix('author_')
df_item = df_item.add_prefix('item_')

df_concat_newsweek = pd.concat([df_item.drop(['item_reviewRating', 'item_itemReviewed', 'item_author'], axis=1), 
                       df_reviewRating, df_itemReviewed, df_author], axis=1).drop('itemReviewed_appearance', axis=1)

  0%|          | 0/361 [00:00<?, ?it/s]

  0%|          | 0/361 [00:00<?, ?it/s]

  0%|          | 0/361 [00:00<?, ?it/s]

  0%|          | 0/361 [00:00<?, ?it/s]

In [419]:
df_concat_newsweek = df_concat_newsweek.drop(['itemReviewed_firstAppearance'], axis=1)

In [420]:
fc_col = []
for html in tqdm(htmls):
    soup = BeautifulSoup(html, 'lxml')
    fc_list = []
    bin = False
    for i in soup.find_all(type="application/ld+json"):
        try:
            if '@type' in json.loads(i.get_text()):
                if json.loads(i.get_text())['@type'] == 'NewsArticle':
                    author = json.loads(i.get_text())['author']
                    if type(author) == dict:
                        name = author['name']
                        if ', USA TODAY' in name:
                            name = name.replace(', USA TODAY', '')
                        if ' and ' in name:
                            name = name.replace(' and ', ',')
                        if ', ' in name:
                            name = name.replace(', ', ',')
                        fc_col.append(name)
                    elif type(author) == list:
                        for j in author:
                            fc_list.append(j['name'])
                        if fc_list[0] == 'The New York Times':
                            fc_col.append(None)
                        else:
                            fc_col.append(','.join(fc_list))
                    else:
                        print('error')
                    
                    bin = True
                    break
            else:
                for j in json.loads(i.get_text())['@graph']:
                    if j['@type'] == ['Person']:
                        yeah = True
                        author = j
                        if type(author) == dict:
                            name = author['name']
                            if ', USA TODAY' in name:
                                name = name.replace(', USA TODAY', '')
                            if ' and ' in name:
                                name = name.replace(' and ', ',')
                            if ', ' in name:
                                name = name.replace(', ', ',')
                            fc_col.append(name)
                        elif type(author) == list:
                            for j in author:
                                fc_list.append(j['name'])
                            if fc_list[0] == 'The New York Times':
                                fc_col.append(None)
                            else:
                                fc_col.append(','.join(fc_list))
                        else:
                            print('error')
                        bin = True
                        break
        except:
            pass

    if not bin:
        fc_col.append(None)

  0%|          | 0/361 [00:00<?, ?it/s]

In [423]:
df_concat_newsweek['factchecker'] = fc_col

In [424]:
df_concat_newsweek.to_csv('df_concat_newsweek.csv', index=False)

## Snopes (not used)

In [None]:
url_list = []
def get_list(i):
    header = {"User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/65.0.3325.181 Safari/537.36"}
    url_list = []
    url = f'https://www.snopes.com/fact-check/page/{i}/'
    listpage = requests.get(url, headers=header)
    listsoup = BeautifulSoup(listpage.content, 'lxml')
    for link in listsoup.select('div.card.list-archive')[0].find_all(class_='stretched-link'):
        url_list.append(link['href'])
    return url_list

executor = ThreadPoolExecutor(25)
url_list += list(tqdm(executor.map(get_list, range(1,1550)), total=1549)) #1550

In [None]:
urls = [item for sublist in url_list for item in sublist]

In [None]:
def get_page(url):
    header = {"User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/65.0.3325.181 Safari/537.36"}
    time.sleep(5)
    
    while True:
        content = requests.get(url, headers=header).content
        if b'Request blocked.' not in content:
            return content
        time.sleep(610)

executor = ThreadPoolExecutor(25)
htmls = list(tqdm(executor.map(get_page, urls), total=len(urls)))

  0%|          | 0/18587 [00:00<?, ?it/s]

In [None]:
# pickle.dump(htmls, open('snope_htmls.pkl', 'wb'))

In [None]:
urls = pickle.load(open('snope_urls.pkl', 'rb'))
htmls = pickle.load(open('snope_htmls.pkl', 'rb'))

In [None]:
soup = BeautifulSoup(htmls[1], 'lxml') 

#Organization-based: Aggregate data

In [437]:
label_dict = {' U.S. Groups Donated Supplies': 'Other',
 '1/8 Inch PER YEAR': 'Other',
 '10-Year Average Tax Cut': 'Other',
 '6% is too low': 'Other',
 'A Matter of Dispute': 'Other',
 'A flawed survey is yielding misleading news stories. ': 'Other',
 'Accurate': 'Other',
 'Among Loan Holders Only': 'Other',
 'Ample Evidence for Probe': 'Other',
 'Answer: Yes, Clinton': 'Other',
 'Baseless Claim': 'Other',
 'Baseless claim': 'Other',
 'Both Have Similar Policies': 'Other',
 'CBO Disagrees': 'Other',
 "CDC Didn't Say That": 'Other',
 'CDC Endorses Four Doses': 'Other',
 'CDC: Get Flu Shot': 'Other',
 'CHIP is funded through 2023': 'Other',
 'COVID-19 Origins Remain Unknown': 'Other',
 "Can't for Domestic Group": 'Other',
 'Cases Are Rising': 'Other',
 'Cherry Picks': 'False',
 'Cherry-picked number': 'False',
 'Conspiracy theory': 'Other',
 'Contradicts Past Remarks': 'Other',
 'Correct': 'Other',
 'Counter To Climate Science': 'Other',
 'DOJ: Killed, Not Murdered': 'Other',
 'Deforma los hechos': 'False',
 'Dems Want More Funding': 'Other',
 "Depends How It's Measured": 'Other',
 "Depends on Who's Counting": 'Other',
 'Despite anecdotal evidence, seasonal flu always peaks around December and January, and the flu shot is more effective than last year. Zhao is spouting propaganda. ': 'Other',
 'Devin Nunes was lying about his earlier statements.': 'Other',
 "Didn't Fund That Project": 'Other',
 'Disputed': 'False',
 'Disputed Job Figure': 'Other',
 'Disputed by Experts': 'Other',
 'Distorsiona los hechos': 'False',
 'Distorsiona los hechos ': 'False',
 'Distorts the Facts': 'False',
 'DoD Certified Per Law': 'Other',
 'Donald Trump repeatedly downplayed the threat of coronavirus. He is lying. ': 'Other',
 'Doubtful': 'False',
 "Echoes Russia's Claims": 'Other',
 'Economy Far From V-shaped': 'Other',
 'Editing Distorts Meaning': 'Other',
 'Engañoso': 'False',
 'Engañoso ': 'False',
 'Estimate Known Since March 16': 'Other',
 'Estimates vary.': 'Other',
 'Estudio No Dice Eso': 'Other',
 'Events Proved Him Wrong': 'Other',
 'Evidence Is Slim': 'Other',
 'Evidence Unclear ': 'Other',
 'Evidence exists': 'Other',
 'Evidence gets stronger ': 'Other',
 'Exaggerated ': 'Other',
 'Exaggerates': 'False',
 'Experts Agree': 'Other',
 'Experts Are Skeptical': 'Other',
 'Experts Disagree': 'False',
 'Experts Divided': 'Other',
 'Experts Have Issued Warnings': 'Other',
 'Experts Question That': 'Other',
 'Experts Say Bad Advice': 'Other',
 'Experts are skeptical': 'Other',
 'Experts: Paper Is Faulty': 'Other',
 'FALSE': 'Other',
 'Facile comparison': 'Other',
 'Facts Contradict His Argument': 'Other',
 'False': 'False',
 'False ': 'Other',
 'False.': 'False',
 'False. ': 'False',
 'False. Not All Travel. ': 'Other',
 'Falso': 'False',
 'Falso ': 'False',
 'Far Short of Promise': 'Other',
 'Fauci Files Annual Reports': 'Other',
 'Federal data suggests that very few U.S. babies are born alive as a result of a failed abortion. The Centers for Disease Control and Prevention recorded 143 deaths between 2003 and 2014 involving infants born alive during attempted abortions.': 'Other',
 'Fires Not Unusual': 'Other',
 'Flip flop': 'Other',
 'Four Pinocchios': 'False',
 'Full Flop': 'Other',
 'Half Flip': 'Other',
 'Half True': 'False',
 'Half true': 'False',
 "Here's What He's Said": 'Other',
 "Here's What Trump Said": 'Other',
 'Historians doubt': 'Other',
 'In dispute': 'False',
 'Incorrect': 'Other',
 'Initial Reports Were Wrong': 'Other',
 'It Ended April 2020': 'Other',
 'It could. ': 'Other',
 "It's a doctored video": 'Other',
 "Kids Can't Get J&J Shot": 'Other',
 'King\'s assertion is mixed. MSNBC did report that Democratic leaders did talk to Michael Bloomberg about stepping aside, but that is a common practice and does not amount to "interference."': 'Other',
 'Lacks Context': 'False',
 'Lacks Context ': 'Other',
 'Lacks Evidence': 'Other',
 'Lacks context': 'False',
 'Law Is Being Followed': 'Other',
 'Likely Between $1.3-$3.3 Million': 'Other',
 'MISSING CONTEXT': 'Other',
 'Mail-in Same As Absentee': 'Other',
 'Matter of Dispute': 'Other',
 'Maybe.': 'Other',
 'Migrants Not Driving Surge': 'Other',
 'Misleading': 'False',
 'Misleading headlines created confusion.': 'Other',
 'Misleading/cropped': 'Other',
 'Missing Context': 'False',
 'Missing context': 'False',
 'Mixed': 'False',
 'Mixed.': 'False',
 'Mixed: Biden warned about coronavirus in January but continued to hold rallies and other events until early March': 'Other',
 'More Studies Needed': 'Other',
 'Mostly False': 'False',
 'Mostly True': 'True',
 'Mostly false': 'False',
 'Mostly false.': 'False',
 'Mostly true': 'True',
 'Mugshots are from 2017': 'Other',
 'NSC says that\'s "fake"': 'Other',
 'Needs context': 'False',
 'No Evidence': 'False',
 'No Evidence It Works': 'Other',
 'No Evidence from Lab': 'Other',
 'No Flip': 'Other',
 'No Link Between Them': 'Other',
 'No Official Tally': 'Other',
 'No evidence': 'False',
 'No hay evidencia': 'False',
 'No hay evidencia ': 'False',
 'No hay pruebas': 'False',
 'Nosotros explicamos como': 'Other',
 'Not Based in Science': 'Other',
 'Not His Quote': 'Other',
 'Not Per CDC Data': 'Other',
 'Not What Advisers Say': 'Other',
 'Not What Data Show': 'False',
 'Not What Evidence Shows': 'Other',
 'Not What Study Found': 'Other',
 'Not Worst Per Capita': 'Other',
 'Not Yet Clear': 'Other',
 'Not based on science': 'Other',
 'Not caused by vaccine': 'Other',
 'Not exactly.': 'Other',
 'Not the Whole Story': 'False',
 'Not the whole story': 'False',
 'Obama Advisers Dispute That': 'Other',
 'One Pinocchios': 'Other',
 'Online Surveys, Not Polls': 'Other',
 'Origen COVID-19 aún es incierto': 'Other',
 'Out of Context': 'False',
 'Outdated Figure': 'Other',
 'Overall funding to the CDC has increased each year Donald Trump has been in office.': 'Other',
 'Pants on Fire': 'False',
 'Partially false': 'False',
 'Partially false.': 'Other',
 'Partly False': 'False',
 'Partly False: They do have a stockpile': 'Other',
 'Partly True ': 'Other',
 'Partly false': 'False',
 'Partly false.': 'Other',
 'Peter Lewis is dead': 'Other',
 'President Trump does have investments in Sanofi, but the amount is not significant.': 'Other',
 'Press Partly Allowed': 'Other',
 'Probably Exaggerated ': 'Other',
 'Rebatido por expertos': 'Other',
 'Report Gave Survey Results': 'Other',
 'Russia may very well be interfering in Sanders’ campaign, but to blame all hostility on a foreign adversary leaves out important information.': 'Other',
 'Ryan Allowed Them': 'Other',
 'Sanders Endorsed Joe Biden': 'Other',
 'Satire': 'Other',
 'Shaky Estimate': 'Other',
 'Some Still Left': 'Other',
 'Spins the Facts': 'False',
 'Stories Were Accurate': 'Other',
 'Studies Show Mixed Results': 'Other',
 'Studies: Already Done': 'Other',
 'Takes Too Much Credit': 'Other',
 'Takes Undue Credit': 'Other',
 'The assertion cannot be judged on a true/false scale until we know more, but even conservative estimates indicate more people will die of coronavirus.': 'Other',
 'The crash was in 2013': 'Other',
 'The headline is false.': 'Other',
 'The information in this viral post has been rated false.': 'Other',
 'The statement is false; the video of Biden saying this was edited deceptively.': 'Other',
 'There Are Others': 'Other',
 'There Was Contact': 'Other',
 'There Were Many Reasons': 'Other',
 'There Were Warnings': 'Other',
 'There is currently no such ban': 'Other',
 'There is no evidence that the caucuses were “rigged,” or that the problems with the Shadow app had anything to do with Buttigieg’s connections to those behind it. ': 'Other',
 "There's No 28th Amend.": 'Other',
 'They Are Already Eligible': 'Other',
 "They Aren't Yet": 'Other',
 'This is disputed. ': 'Other',
 'This is exaggerated.': 'False',
 'This is exaggerated. ': 'Other',
 'This is misleading': 'False',
 'This is misleading.': 'False',
 'This is misleading. ': 'Other',
 'This lacks evidence.': 'False',
 'This lacks evidence. ': 'False',
 'Three Pinocchios': 'False',
 'Timeline': 'Other',
 'Too Early to Say': 'Other',
 'Too Soon to Tell': 'Other',
 'True': 'True',
 'Trump Admin. Provided Number': 'Other',
 "Trump Hasn't Been Clear": 'Other',
 'Trump has restricted travel from China, but he has not shut it down or implemented a travel ban.': 'Other',
 'Trump has tried to frame the "crisis" on the border in easy to understand terms. Much of that messaging has focused on the idea that lax security has led to drugs and crime "pouring" over the border. Trump bases the claim on the idea that members of the MS-13 gang have committed violent crimes in some parts of the United States.    But there is missing context in Trump\'s characterizations. To begin with, the vast majority of illicit drugs captured at the border come through legal ports of entry, according to Trump\'s own U.S. Customs and Border Protection. For the first 11 months of the 2018 fiscal year, 90 percent of the heroin intercepted at the border and 88 percent of the cocaine, was captured at a legal port of entry rather than between those ports. There is a similar caveat to Trump\'s claim on crime. The president and Republicans frequently point to roughly 17,000 people trying to enter the country who had been convicted of crimes in the United States or abroad in the first 11 months of 2018. But more than 60 percent of those people came through legal points of entry, including airports, according to Customs and Border Protection data. ': 'Other',
 'Two Pinocchios': 'False',
 "U.N. Doesn't Endorse Estimate": 'Other',
 'Unclear': 'Other',
 'Unclear What Impact Will Be': 'Other',
 'Unconfirmed': 'Other',
 'Unsupported': 'False',
 'Vaccine Benefits Outweigh Risk': 'Other',
 'Verdict Pending': 'Other',
 "Voters Can't Vote Twice": 'Other',
 'We Explain Her Tally': 'Other',
 'We Explain How': 'Other',
 'We Explain The Science ': 'Other',
 'We Explain Why': 'Other',
 'We Explain Why ': 'Other',
 'We Explain Zelensky Comments': 'Other',
 'We Explain that Figure': 'Other',
 'We Explain the Dispute': 'Other',
 'We Explain the Law': 'Other',
 'We Explain the Proposal': 'Other',
 'We Explain the Research': 'Other',
 'We Explain the Science': 'Other',
 'We Provide The Context': 'Other',
 'We Provide the Facts': 'Other',
 'We Review His Record': 'Other',
 'We Review His Rhetoric': 'Other',
 'We Review Research ': 'Other',
 "We Review Stone's Crimes": 'Other',
 'We Review The Facts': 'Other',
 'We Review The Research': 'Other',
 'We Review the Data': 'Other',
 'We Review the Trends': 'Other',
 'While the United States is ramping up screening and testing at airports, few people as of March 13 were being tested. Some were not even screened.': 'Other',
 'Wrong': 'False',
 'You Be the Judge': 'Other',
 'false': 'False',
 'misleading': 'Other',
 'no toda la historia': 'False',
 'partially false': 'False',
 'requiere más estudios': 'Other', 
 'Unsubstantiated': 'False'}


In [438]:
df_concat_final = pd.read_csv('df_concat_final.csv')
df_politifact_merged = pd.read_csv('df_politifact_merged.csv')
df_concat_lead_stories = pd.read_csv('df_concat_lead_stories.csv')
df_concat_checkyourfact = pd.read_csv('df_concat_checkyourfact.csv')
df_concat_newsweek = pd.read_csv('df_concat_newsweek.csv')
df_aggregate = pd.concat([df_concat_final, df_politifact_merged, df_concat_lead_stories, df_concat_checkyourfact, df_concat_newsweek], axis=0)


In [439]:
df_aggregate = pd.concat([df_concat_final, df_politifact_merged, df_concat_lead_stories, df_concat_checkyourfact, df_concat_newsweek], axis=0)

df_aggregate.loc[pd.isnull(df_aggregate.datePublished), 'datePublished'] = df_aggregate.loc[pd.isnull(df_aggregate.datePublished), 'item_datePublished']
df_aggregate['factcheck_date'] = df_aggregate['datePublished']
df_aggregate['factcheck_date'] = df_aggregate['factcheck_date'].apply(lambda x: str(x)[:10])
df_aggregate = df_aggregate.drop(['item_datePublished', 'datePublished'], axis=1)
df_aggregate = pd.DataFrame(df_aggregate.loc[df_aggregate.factcheck_date != 'nan']).reset_index(drop=True)

df_aggregate.loc[pd.isnull(df_aggregate.url), 'url'] = df_aggregate.loc[pd.isnull(df_aggregate.url), 'item_url']
df_aggregate['factcheck_url'] = df_aggregate['url']
df_aggregate = df_aggregate.drop(['item_url', 'url'], axis=1)
df_aggregate = df_aggregate.drop_duplicates('factcheck_url')

df_aggregate = df_aggregate.rename({'item_claimReviewed': 'claim_text'}, axis=1)
df_aggregate.loc[(df_aggregate.author_url == 'https://leadstories.com') & (df_aggregate.reviewRating_ratingValue < 4), 'reviewRating_binary'] = 'False'
df_aggregate.loc[(df_aggregate.author_url == 'https://leadstories.com') & (df_aggregate.reviewRating_ratingValue >= 4), 'reviewRating_binary'] = 'True'

df_aggregate.loc[pd.isnull(df_aggregate.reviewRating_binary), 'reviewRating_binary'] = df_aggregate.loc[pd.isnull(df_aggregate.reviewRating_binary), 'reviewRating_alternateName'].apply(lambda x: label_dict[x])
df_aggregate = pd.DataFrame(df_aggregate.loc[df_aggregate.author_url != 'https://thedispatch.com/']).reset_index(drop=True)

df_aggregate['claim_date'] = df_aggregate['itemReviewed_datePublished']
df_aggregate['claim_author'] = df_aggregate['itemReviewed_author_name']
df_aggregate['claim_author_jobTitle'] = df_aggregate['itemReviewed_author_jobTitle']
df_aggregate['claim_context'] = df_aggregate['itemReviewed_name']


df_aggregate['claim_firstAppearance_url'] = df_aggregate['itemReviewed_firstAppearance_url']
df_aggregate.loc[pd.isnull(df_aggregate.claim_firstAppearance_url), 'claim_firstAppearance_url'] = df_aggregate.loc[pd.isnull(df_aggregate.claim_firstAppearance_url), 'itemReviewed_author_sameAs']
df_aggregate['claim_firstAppearance_url'] = df_aggregate['claim_firstAppearance_url'].apply(lambda x: x.replace("['", '') if "['" in str(x) else x)
df_aggregate['claim_firstAppearance_url'] = df_aggregate['claim_firstAppearance_url'].apply(lambda x: x.replace("']", '') if "']" in str(x) else x)
df_aggregate['factcheck_organization'] = df_aggregate['author_url']
df_aggregate['factcheck_author'] = df_aggregate['factchecker']
df_aggregate['reviewRating_original'] = df_aggregate['reviewRating_alternateName']
df_aggregate['claim_date'] = df_aggregate['claim_date'].apply(lambda x: str(x)[:10])

In [513]:
df_aggregate_final = df_aggregate[['factcheck_url', 'factcheck_date', 'factcheck_organization', 'factcheck_author',
              'claim_text', 'claim_author', 'claim_date', 'claim_firstAppearance_url', 'claim_author_jobTitle', 'claim_context',
              'reviewRating_binary', 'reviewRating_original', 'reviewRating_ratingValue', 'reviewRating_worstRating', 'reviewRating_bestRating']]

In [514]:
df_aggregate_final.head()

Unnamed: 0,factcheck_url,factcheck_date,factcheck_organization,factcheck_author,claim_text,claim_author,claim_date,claim_firstAppearance_url,claim_author_jobTitle,claim_context,reviewRating_binary,reviewRating_original,reviewRating_ratingValue,reviewRating_worstRating,reviewRating_bestRating
0,https://www.factcheck.org/2022/05/biden-hasnt-...,2022-05-06,https://www.factcheck.org/,Saranac Hale Spencer,"President Joe Biden has ""[s]topped abortion.""",Meme,2022-05-03,,-,,False,False,6.0,0.0,10.0
1,https://www.washingtonpost.com/politics/2022/0...,2022-05-06,https://www.washingtonpost.com/,Glenn Kessler,"""In Texas, Republicans passed a law allowing r...",Alexandria Ocasio-Cortez (,2022-04-29,https://twitter.com/AOC/status/152010807015153...,Member of the House (D-N.Y.),in a tweet,False,Four Pinocchios,0.0,0.0,5.0
2,https://www.factcheck.org/2022/05/desantis-vs-...,2022-05-05,https://www.factcheck.org/,D'Angelo Gore,“The bonds will be paid by Disney. They will b...,Ron DeSantis,2022-04-28,https://rumble.com/v12v53k-florida-and-the-ame...,Florida Governor,Fox News,Other,Unclear What Impact Will Be,,,
3,https://www.washingtonpost.com/politics/2022/0...,2022-05-05,https://www.washingtonpost.com/,Glenn Kessler,“You know what happens to these individuals? T...,Alejandro Mayorkas,2022-05-01,https://www.washingtonpost.com/politics/2022/0...,Homeland Secretary Secretary,in an interview on Fox News Sunday,False,Three Pinocchios,1.0,0.0,5.0
4,https://www.factcheck.org/2022/05/unfounded-cl...,2022-05-04,https://www.factcheck.org/,Saranac Hale Spencer,"A rash of ""mysterious"" fires at food-processin...",Social media posts,2022-04-21,,-,,Other,Fires Not Unusual,,,


In [524]:
dict(df_aggregate_final.loc[df_aggregate.factcheck_organization == 'https://www.washingtonpost.com/'].reset_index().loc[10])

{'claim_author': 'Amy Klobuchar',
 'claim_author_jobTitle': 'Senator (D-Minn.)',
 'claim_context': 'in an interview on ABC\'s "This Week"',
 'claim_date': '2022-03-27',
 'claim_firstAppearance_url': 'https://abcnews.go.com/Politics/week-transcript-27-22-sen-amy-klobuchar-gen/story?id=83690221',
 'claim_text': '“The facts are clear here. This is unbelievable. You have the wife of a sitting Supreme Court justice advocating for an insurrection, advocating for overturning a legal election to the sitting president’s chief of staff.”',
 'factcheck_author': 'Glenn Kessler',
 'factcheck_date': '2022-04-05',
 'factcheck_organization': 'https://www.washingtonpost.com/',
 'factcheck_url': 'https://www.washingtonpost.com/politics/2022/04/05/klobuchar-claims-ginni-thomas-advocated-an-insurrection/',
 'index': 44,
 'reviewRating_bestRating': 5.0,
 'reviewRating_binary': 'False',
 'reviewRating_original': 'Two Pinocchios',
 'reviewRating_ratingValue': 2.0,
 'reviewRating_worstRating': 0.0}

In [526]:
factchecker_set = []
for i in df_aggregate_final['factcheck_author']:
    if ',' in str(i):
        factchecker_set += i.split(',')
    else:
        factchecker_set.append(i)
print('# of factcheckers:', len(set(factchecker_set)))

# of factcheckers: 794


In [525]:
url_count = df_aggregate_final.groupby('claim_firstAppearance_url').count()['claim_text']
print('# of overlapping claims:', (url_count >=2).sum())

# of overlapping claims: 225


In [506]:
df_aggregate_final.loc[df_aggregate.reviewRating_binary != 'Other'].groupby('factcheck_organization').count()['factcheck_url']

factcheck_organization
https://checkyourfact.com              3194
https://factcheck.thedispatch.com/      140
https://leadstories.com                4530
https://www.factcheck.org/             1084
https://www.newsweek.com                258
https://www.nytimes.com/                 65
https://www.politifact.com/           20872
https://www.usatoday.com/               100
https://www.washingtonpost.com/         348
Name: factcheck_url, dtype: int64

In [566]:
len(df_aggregate_final)

31107

In [529]:
df_aggregate_final.to_csv('factcheck_organization_data_20220508.csv', index=False)

In [431]:
# manually categorizing each organization's reviewRating_alternateName
binary_rating_dict = {}
cnt=0
for i in df_aggregate.groupby('reviewRating_alternateName')['url'].count().sort_values(ascending=False).items():
    if i[1] <= 1:
        binary_rating_dict[i[0]] = 'Other'
    elif i[0] not in binary_rating_dict:
        continue
        value = input(str(cnt) +'/255 '+ str(i))
        if value == '':
            binary_rating_dict[i[0]] = 'False'
        elif value == 'o':
            binary_rating_dict[i[0]] = 'Other'
        else:
            binary_rating_dict[i[0]] = 'True'
        cnt+=1

# Append political stance

In [556]:
df_aggregate_final = pd.read_csv('factcheck_organization_data_20220508.csv')

In [557]:
header = {"User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/65.0.3325.181 Safari/537.36"}
html = requests.get('https://www.politifact.com/personalities/', headers=header).content
soup = BeautifulSoup(html, 'lxml')
people = soup.find_all(class_='c-chyron')

In [558]:
results = []
for person in people:
    name = person.find(class_='c-chyron__value').get_text().strip()
    name_lower = name.lower()
    party = person.find(class_='c-chyron__subline').get_text().strip()
    results.append([name, name_lower, party])
df_party = pd.DataFrame(results, columns=['name', 'name_lower', 'party'])

In [565]:
df_party.to_csv('party_affiliation_20220508.csv', index=False)

In [None]:
df_party = pd.read_csv('party_affiliation_20220508.csv')

In [559]:
df_aggregate_final['name_lower'] = df_aggregate_final['claim_author'].str.lower()
df_aggregate_final['name_lower'] = df_aggregate_final['name_lower'].apply(lambda x: str(x).strip())
df_aggregate_final.loc[df_aggregate_final.claim_author == 'Donald J. Trump', 'name_lower'] = 'donald trump'
df_aggregate_final = df_aggregate_final.merge(df_party, on='name_lower', how='left')

In [562]:
df_aggregate_final['claim_author_party'] = df_aggregate_final['party']
df_aggregate_final = df_aggregate_final.drop(['name_lower', 'name', 'party'], axis=1)

In [563]:
df_aggregate_final.to_csv('factcheck_organization_data_20220508_party.csv', index=False)

In [564]:
df_aggregate_final

Unnamed: 0,factcheck_url,factcheck_date,factcheck_organization,factcheck_author,claim_text,claim_author,claim_date,claim_firstAppearance_url,claim_author_jobTitle,claim_context,reviewRating_binary,reviewRating_original,reviewRating_ratingValue,reviewRating_worstRating,reviewRating_bestRating,claim_author_party
0,https://www.factcheck.org/2022/05/biden-hasnt-...,2022-05-06,https://www.factcheck.org/,Saranac Hale Spencer,"President Joe Biden has ""[s]topped abortion.""",Meme,2022-05-03,,-,,False,False,6.0,0.0,10.0,
1,https://www.washingtonpost.com/politics/2022/0...,2022-05-06,https://www.washingtonpost.com/,Glenn Kessler,"""In Texas, Republicans passed a law allowing r...",Alexandria Ocasio-Cortez (,2022-04-29,https://twitter.com/AOC/status/152010807015153...,Member of the House (D-N.Y.),in a tweet,False,Four Pinocchios,0.0,0.0,5.0,
2,https://www.factcheck.org/2022/05/desantis-vs-...,2022-05-05,https://www.factcheck.org/,D'Angelo Gore,“The bonds will be paid by Disney. They will b...,Ron DeSantis,2022-04-28,https://rumble.com/v12v53k-florida-and-the-ame...,Florida Governor,Fox News,Other,Unclear What Impact Will Be,,,,Republican
3,https://www.washingtonpost.com/politics/2022/0...,2022-05-05,https://www.washingtonpost.com/,Glenn Kessler,“You know what happens to these individuals? T...,Alejandro Mayorkas,2022-05-01,https://www.washingtonpost.com/politics/2022/0...,Homeland Secretary Secretary,in an interview on Fox News Sunday,False,Three Pinocchios,1.0,0.0,5.0,
4,https://www.factcheck.org/2022/05/unfounded-cl...,2022-05-04,https://www.factcheck.org/,Saranac Hale Spencer,"A rash of ""mysterious"" fires at food-processin...",Social media posts,2022-04-21,,-,,Other,Fires Not Unusual,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31102,https://www.newsweek.com/fact-check-has-georgi...,2020-11-17,https://www.newsweek.com,Marlena Lang,Georgia Senatorial Candidate Jon Ossoff Has Do...,David Perdue,2020-11-17,,,,False,False,,,,Republican
31103,https://www.newsweek.com/fact-check-are-fox-ne...,2020-11-12,https://www.newsweek.com,Marlena Lang,"Fox News' Daytime Ratings Are Collapsing, as T...",President Donald Trump,2020-11-12,,,,False,False,,,,
31104,https://www.newsweek.com/fact-check-did-domini...,2020-11-12,https://www.newsweek.com,Matthew Impelli,Dominion Voting Systems Caused Widespread Vote...,President Donald Trump,2020-11-12,,,,False,False,,,,
31105,https://www.newsweek.com/fact-check-us-roundin...,2020-10-26,https://www.newsweek.com,Matthew Impelli,"U.S. is 'Rounding the Turn' On COVID, as Trump...",President Donald Trump,2020-10-26,,,,False,False,,,,


#Community-based: Birdwatch

In [None]:
!wget -O notes-00000.tsv https://ton.twimg.com/birdwatch-public-data/2022/05/08/notes/notes-00000.tsv
!wget -O ratings-00000.tsv https://ton.twimg.com/birdwatch-public-data/2022/05/08/noteRatings/ratings-00000.tsv

--2022-05-08 07:53:11--  https://ton.twimg.com/birdwatch-public-data/2022/05/08/notes/notes-00000.tsv
Resolving ton.twimg.com (ton.twimg.com)... 152.199.43.82, 2606:2800:247:cb3f:61f1:e081:ac02:df4c
Connecting to ton.twimg.com (ton.twimg.com)|152.199.43.82|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12927648 (12M) [text/tab-separated-values]
Saving to: ‘notes-00000.tsv’


2022-05-08 07:53:13 (10.8 MB/s) - ‘notes-00000.tsv’ saved [12927648/12927648]

--2022-05-08 07:53:14--  https://ton.twimg.com/birdwatch-public-data/2022/05/08/noteRatings/ratings-00000.tsv
Resolving ton.twimg.com (ton.twimg.com)... 152.199.43.82, 2606:2800:247:cb3f:61f1:e081:ac02:df4c
Connecting to ton.twimg.com (ton.twimg.com)|152.199.43.82|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 32080726 (31M) [text/tab-separated-values]
Saving to: ‘ratings-00000.tsv’


2022-05-08 07:53:16 (13.7 MB/s) - ‘ratings-00000.tsv’ saved [32080726/32080726]



In [None]:
notes = pd.read_csv('notes-00000.tsv', delimiter="\t")

In [None]:
(notes.groupby('tweetId').count()['noteId'] >= 2).mean()

0.17707415888673594