In [5]:
from bs4 import BeautifulSoup
import requests
import re
from time import sleep
import io
import tldextract
import glob
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
from multiprocessing import Pool
import cPickle as pickle
import io

# Helper Functions

In [2]:
# Links whose href matches this pattern are dropped by parse_article:
# self-links to snopes.com, anything containing "photo", and .jpg/.png images.
# The dots are escaped so they match a literal "." only; unescaped, ".jpg"
# would also match e.g. "ajpg" and "snopes.com" would match "snopesXcom".
bad_url_re = re.compile(r"(snopes\.com|photo|\.jpg|\.png)", re.I | re.U)

def parse_article(dat):
    i,filename = dat
    if i % 250 == 0:
        print i
    
    page_html = io.open(filename).read()
    soup = BeautifulSoup(page_html,"lxml")
    article_text = soup.find("div",{"class":"article-text"})
    
    
    if not article_text:
        #print 'No article text!!! ', filename
        return {"article_url": filename,
                "parse_result" : "no_article_text"}
    
    ############## URL STUFF ###########################

    all_urls = set()
    article_urls= []
    ems = []

    ## URLs where we can also extract the paragraph text
    article_paragraphs = article_text.find_all("p")
    for p in article_paragraphs:
        if len(p.text.split()) < 10:
            continue

        for url in p.find_all("a"):
            try:
                extract_res = tldextract.extract(url['href'])
                if len(extract_res.domain) and len(extract_res.suffix):
                    url_val = ".".join([extract_res.domain,extract_res.suffix])
                    if not len(bad_url_re.findall(url['href'])):
                        article_urls.append((url.get_text(),url_val,url['href'],url.parent.name,url.parent.parent.name,p.text))
                        all_urls.add(url['href'])
            except:
                continue

        if not p.parent.has_attr("class") or 'article-sources-box' not in p.parent['class']:
            for emph in p.find_all("em"):
                ems.append((emph.get_text(),p.text))

    # get URLs not attached to particular paragraphs
    for url in article_text.find_all("a"):
        if not url.has_attr("href"):
            continue
        if url['href'] in all_urls:
            continue
        try:
            extract_res = tldextract.extract(url['href'])
            if len(extract_res.domain) and len(extract_res.suffix):
                url_val = ".".join([extract_res.domain,extract_res.suffix])
                if not len(bad_url_re.findall(url['href'])):
                    article_urls.append((url.get_text(),url_val,url['href'],url.parent.name,url.parent.parent.name,''))
                    urls= True
        except:
            print 'url fail'
            continue

    #############  CLAIM STUFF ########################
    claim = None
    result = None
    
    if not soup.find("p",{"itemprop":"claimReviewed"}):
        article_paragraphs = article_text.find_all("p")
        for p_it, paragraph in enumerate(article_paragraphs): 
            for span in paragraph.find_all("span"):
                if span.get_text() == "Claim":
                    claim = paragraph.get_text().replace("Claim: ", "").strip()
                    if len(article_paragraphs) > p_it + 1:
                        result = article_paragraphs[p_it+1].get_text().strip()
                        break
                elif span.get_text() == "Claim:":
                    claim = paragraph.get_text().replace("Claim:", "").strip()
                    result_obj = article_text.find("div", {"class": "claim-old"})
                    if result_obj:
                        result = result_obj.get_text()
                    elif len(article_paragraphs) > p_it + 1:
                        result = article_paragraphs[p_it+1].get_text().strip()
                break

            for strong in paragraph.find_all("strong"):
                if strong.get_text().startswith("Claim"):
                    claim = paragraph.get_text().replace("Claim: ", "").strip()
                    if len(article_paragraphs) > p_it + 1:
                        result = article_paragraphs[p_it+1].get_text()
                    break

            for font in paragraph.find_all("font"):
                if font.get_text().startswith("Claim"):
                    claim = paragraph.get_text().replace("Claim: ", "").strip()

                    if len(article_paragraphs) > p_it + 1 and "Status" in article_paragraphs[p_it+1].get_text():
                            result = article_paragraphs[p_it+1].get_text().replace("Status: ","").strip()
                    else:
                        for tab in article_text.find_all("table"):
                            tab_fonts = tab.find_all("font")
                            if tab_fonts and len(tab_fonts):
                                result = tab_fonts[0].get_text()
                    break

            if claim:
                break
    else:
        claim = soup.find("p",{"itemprop":"claimReviewed"}).get_text().strip()
        result = soup.find("div",{"class":"claim"}).get_text().strip()

    categories = []
    try:
        categories = [x.text for x in soup.find("div",{"class": "breadcrumb-nav"}).find_all("a")]
    except:
        print 'category_fail'
        
    if not result:
        #print 'No result!! ', filename
        return {"article_url": filename,
                "parse_result" : "no_result"}
    
    return {"article_url": filename,
            'categories' : categories,
            "claim":claim,
            "result" : result,
            "urls" : article_urls,
            'ems' : ems,
            "parse_result" : "ok"}

# Get Article Parses

In [4]:
# Parse every saved article page in parallel with 6 worker processes.
# glob returns paths in arbitrary order, so sort them to make the progress
# indices and the order of `results` reproducible between runs.
article_paths = sorted(glob.glob("snopes_data/article_htmls/*"))

p = Pool(6)
try:
    results = p.map(parse_article, enumerate(article_paths))
finally:
    # Always release the workers, even if a parse raised, and wait for them
    # to exit so no processes are left behind in the kernel.
    p.close()
    p.join()

# Split the records by parse outcome; only "ok" records carry claim/urls.
success_articles = []
failed_articles = []

for a in results:
    if a['parse_result'] == 'ok':
        success_articles.append(a)
    else:
        failed_articles.append(a)

0
1750
500
2250
1000
1500
250
2000
750
2500
1250
3000
4750
3500
4000
2750
4500
3250
5000
3750
4250
6000
6500
5250
7000
5750
7500
6250
6750
5500
7250
9000
7750
9500
8250
10000
8750
9250
8000
9750
8500
10250


# Get Archive.is Links

***Note: Some of these archive.is snapshots may no longer be available, so the mappings resolved in this run are saved in*** ```snopes_data/archive_dot_is_mappings.txt```.

In [12]:

archived_urls = []
url_res_from_archive = []
failed_urls = []
for article in success_articles:
    url_set = set()
    for url in article['urls']:
        if 'archive.is' == url[1]:
            archived_urls.append(url[2])
print 'N archived: ', len(archived_urls)


N archived:  1135


In [14]:
for i,archived_link in enumerate(archived_urls):
    if i % 50 == 0:
        print i
    if 'web.' in archived_link:
        continue
    page_html = requests.get(archived_link)
    soup = BeautifulSoup(page_html.text,"lxml")
    v = soup.find("input",{"name":"q"})
    if v:
        url_res_from_archive.append((archived_link,v.get('value','')))
    else:
        failed_urls.append(article)

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100


# Expand t.co links


***Note: Some of these t.co short links may no longer resolve, so the expansions obtained in this run are saved in*** ```snopes_data/t_dot_co_mappings.tsv```.

In [2]:

archived_tco_urls = []
for article in success_articles:
    url_set = set()
    for url in article['urls']:
        if 't.co' == url[1]:
            archived_tco_urls.append(url[2])
print 'N archived: ', len(archived_tco_urls)


N archived:  1304


In [9]:
from grabURL import safeGetURLAndMetadata
from time import sleep
fail = []
tco_expanded = []
for i, a in enumerate(archived_tco_urls):
    if i % 100 == 0:
        print i
    tco_expanded.append(safeGetURLAndMetadata(a, fastOnlyExpandURL=True))
    #sleep(1)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200




1300


# Save Results

In [16]:
# Write one "<archive.is link>\t<original url>" line per resolved snapshot.
# The with-block closes the file even if a write fails, and the explicit
# UTF-8 encoding keeps non-ASCII URLs from failing under a non-UTF-8 locale.
with io.open("snopes_data/archive_dot_is_mappings.txt", "w", encoding="utf-8") as of:
    for snapshot_url, original_url in url_res_from_archive:
        of.write(snapshot_url + u"\t" + original_url + u"\n")

In [None]:

# Write one "<t.co link>\t<expanded url>" line per successful expansion.
# The with-block closes the file even if a write fails, and the explicit
# UTF-8 encoding keeps non-ASCII URLs from failing under a non-UTF-8 locale.
with io.open("snopes_data/t_dot_co_mappings.tsv", "w", encoding="utf-8") as of:
    for u in tco_expanded:
        # Results carrying an 'error' key have no usable expansion.
        if 'error' in u:
            continue
        of.write(u['initial_url'] + u"\t" + u['canonical_url'] + u"\n")

In [6]:
# Persist the parsed articles. The with-block guarantees the file is flushed
# and closed rather than relying on garbage collection of an anonymous handle.
with open("snopes_data/success_articles.p", "wb") as pickle_file:
    pickle.dump(success_articles, pickle_file)