In [1]:
import requests
import re
from time import sleep
import io
import tldextract
import glob
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
from multiprocessing import Pool
from bs4 import BeautifulSoup

import cPickle as pickle

In [2]:
# Load the previously scraped article records.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# Binary mode is what pickle expects, and the context manager closes the handle.
with open("snopes_data/success_articles.p", "rb") as pickle_file:
    success_articles = pickle.load(pickle_file)

In [3]:
# Lookup table read from a tab-separated file: first field is the key (an
# archive.is link URL, as looked up by construct_url), second field is the
# URL it maps to. The context manager closes the file once the dict is built.
with io.open("snopes_data/archive_dot_is_mappings.txt") as mapping_file:
    archive_dot_is_mappings = {
        line.split("\t")[0]: line.strip().split("\t")[1]
        for line in mapping_file
        if line.strip()  # skip blank lines (e.g. a trailing newline) instead of raising IndexError
    }

In [4]:
# Lookup table read from a tab-separated file: first field is the key (a
# t.co link URL, as looked up by construct_url), second field is the URL it
# maps to. The context manager closes the file once the dict is built.
with io.open("snopes_data/t_dot_co_mappings.tsv") as mapping_file:
    t_dot_co_mappings = {
        line.split("\t")[0]: line.strip().split("\t")[1]
        for line in mapping_file
        if line.strip()  # skip blank lines (e.g. a trailing newline) instead of raising IndexError
    }

In [5]:
# Domains for which construct_url keeps the full host name (e.g. the
# subdomain of a blog) instead of collapsing the link to the bare domain.
bs = set(['blogspot.com',"wordpress.com",'go.com','yahoo.com','googleusercontent.com','blogspot.nl','tumblr.com'])


def construct_url(u):
    """Reduce a link record to a short identifier of the linked source.

    Parameters
    ----------
    u : sequence
        Link record. Only ``u[1]`` (the link's base domain, as produced by
        ``extract_base_url``) and ``u[2]`` (the full URL) are read.

    Returns
    -------
    str
        * twitter.com: lower-cased ``host/account`` with the scheme and any
          ``/status/...`` suffix removed (bare domain if the result is
          longer than 50 characters and has no status suffix).
        * archive.is / t.co: the identifier of the mapped URL when the link
          is in the corresponding lookup table, otherwise the bare domain.
        * archive.org: everything after the ``web/<digits>/`` prefix.
        * facebook.com: ``host/first-path-segment`` (bare domain if longer
          than 50 characters); reddit.com: host plus two path segments.
        * Google web-cache links: the identifier of the cached URL.
        * domains listed in ``bs``: the full host name.
        * anything else: ``u[1]`` unchanged.
    """
    domain, link = u[1], u[2]

    if domain == 'twitter.com':
        # Strip both schemes so http and https links to the same account
        # collapse to the same identifier.
        handle = link.replace("https://", "").replace("http://", "")
        handle = handle.replace("?src=hash", "").lower()
        status_at = handle.rfind("/status/")
        if status_at > 0:
            return handle[:status_at]
        if len(handle) > 50:
            return domain
        return handle

    if domain == 'archive.is':
        if link in archive_dot_is_mappings:
            url = archive_dot_is_mappings[link]
            return construct_url(('', extract_base_url(url), url))
        return domain

    if domain == 't.co':
        if link in t_dot_co_mappings:
            url = t_dot_co_mappings[link]
            return construct_url(('', extract_base_url(url), url))
        return domain

    if domain == 'archive.org':
        snapshot = re.search("web/[0-9]+/", link)
        if snapshot:
            # NOTE(review): unlike the archive.is / t.co branches this returns
            # the full archived URL rather than reducing it further -- confirm
            # whether that is intended.
            return link[snapshot.end():]
        return domain

    if domain == 'facebook.com':
        page = "/".join(link.split("/")[2:4])
        if len(page) > 50:
            return domain
        return page.replace("www.", "")

    if domain == 'reddit.com':
        return "/".join(link.split("/")[2:5]).replace("www.", "")

    if "webcache.googleusercontent.com" in link:
        cache_key = re.search(r"q=cache:[A-Za-z0-9\-_]+:", link)
        # Only unwrap the cached URL when the cache key is actually present;
        # otherwise fall through to the generic handling below instead of
        # failing on a missing match.
        if cache_key:
            cached = link[cache_key.end():]
            if cached.startswith("//"):
                cached = cached[2:]
            return construct_url(('', extract_base_url(cached), cached))

    if domain in bs:
        return "/".join(link.split("/")[2:3]).replace("www.", "")

    return domain


def extract_base_url(u):
    """Return ``domain.suffix`` for ``u`` as split by ``tldextract``.

    If ``tldextract`` yields an empty domain or an empty suffix, ``u`` is
    returned unchanged.
    """
    parts = tldextract.extract(u)
    if not (parts.domain and parts.suffix):
        return u
    return parts.domain + "." + parts.suffix

In [6]:
# Keyword patterns matched (case-insensitively) against each article's rating text.
false_re = re.compile('(false|incorrect|inaccurate|unproven)',re.I|re.U)
true_re = re.compile("(true)",re.I|re.U)
# An article carrying any of these categories is flagged as political.
good_cat = {"Political News", "Politics","Fact Check","Fake News"}

# One dict per outbound link, accumulated across all articles.
res = []

for article in success_articles:
    rating = article['result']

    # an over-long rating is indicative of a failed parse
    if len(rating) > 100:
        continue

    # truth value: which keyword families occur in the rating text
    has_false = bool(false_re.search(rating))
    has_true = bool(true_re.search(rating))
    truth_label = {
        (True, True): 'both',
        (True, False): 'false',
        (False, True): 'true',
        (False, False): 'none',
    }[(has_false, has_true)]

    political = bool(set(article['categories']) & good_cat)

    # last path segment of the article URL, lower-cased once for all its links
    article_slug = article['article_url'].rsplit("/", 1)[-1].lower()

    for link in article['urls']:
        # can't resolve the donotlink stuff, unfortunately
        if link[1] == 'donotlink.com':
            continue

        from_archive_is = link[1] == 'archive.is'
        record = {
            "orig_url": link[1].lower(),
            "anchor_text": link[0].lower(),
            "article": article_slug,
            "paragraph": link[-1].lower(),
            "full_url": link[2].lower(),
            "clean_url": construct_url(link).lower(),
            "parent": link[3],
            "parent_parent": link[4],
            "truth": truth_label,
            "is_pol": political,
            'is_article_tag': link[0] == 'article',
            'is_archive_dot_is': from_archive_is,
            'categories': article['categories'],
        }
        # only archive.is links carry the resolved target (empty string if unmapped)
        if from_archive_is:
            record['full_archive_url'] = archive_dot_is_mappings.get(link[2], "")
        res.append(record)

In [7]:
# Assemble the per-link dicts collected in `res` into a single table.
full_df = pd.DataFrame(res)

In [8]:
# Sanity check: (number of link rows, number of columns).
full_df.shape

(31085, 14)

In [11]:
# Write the assembled table to a UTF-8 encoded CSV file in the working directory.
full_df.to_csv("snopes_fake_news.csv",encoding="utf8")