
🔥 web scraping hell 🔥

N2ITN committed Oct 12, 2017
1 parent 698586d commit c1f35b005bbc89debe64a77af9e21b112c1048e0
Showing with 62 additions and 30 deletions.
+62 −30 MBFC.py
@@ -1,4 +1,9 @@
import json
from multiprocessing.dummy import Pool
from time import sleep
import unicodedata
import httplib2
import requests
from bs4 import BeautifulSoup, SoupStrainer
http = httplib2.Http()
@@ -13,57 +18,84 @@
fake-news
satire'''.split('\n')
class accumulator:
    # shared scrape state: current category name and accumulated result dicts
    cat = None
    json_ = []
def cat_links(cat):
    accumulator.cat = cat
    response = requests.get('https://mediabiasfactcheck.com/' + cat).text
    s = BeautifulSoup(response, 'html.parser').find(class_='entry clearfix')
    links_ = BeautifulSoup(str(s), 'html.parser', parse_only=SoupStrainer('a'))
    return links_
def get_links(link):
    sleep(2)  # throttle: one request every two seconds per worker
    if link.has_attr('href') and link['href'].startswith('http'):
        page = link['href']

        def check_page():
            # Skip MBFC's own category index pages; otherwise fetch the
            # outlet's profile page and return its main content tag.
            if page.startswith('https://mediabiasfactcheck.com/' + accumulator.cat):
                return
            try:
                return BeautifulSoup(requests.get(page).text,
                                     'html.parser').find(class_='entry-content')
            except requests.exceptions.ConnectionError:
                return

        def get_facts(tag_):
            # Extract the "Factual Reporting: ..." rating from the first
            # paragraph that mentions it.
            p_list = list(filter(lambda _: 'Factual' in _.text, tag_.find_all('p')))
            p_list = unicodedata.normalize('NFKD', p_list[0].text.split('\n')[0])
            return p_list.split('strong')[0].split(': ')[1]

        def get_site_url(tag_):
            # The outlet's own URL follows a "Source:" / "Sources:" label.
            try:
                return list(
                    filter(lambda _: 'Source:' in _.text or 'Sources:' in _.text,
                           tag_.find_all('p')))[0].text.split()[1]
            except Exception:
                print('Warning: URL not found in {}'.format(page))
                return 'URL not found'

        tag_ = check_page()
        if tag_ is None:
            print('Failed to load page {}'.format(page))
            return {'error': 'Failed to load page {}'.format(page)}

        results = {
            'cat1': accumulator.cat,
            'cat2': get_facts(tag_),
            'url': get_site_url(tag_),
            'reference': page
        }
        print(results)
        accumulator.json_.append(results)
def cat_json():
    category_pages = (cat_links(cat) for cat in cat_pages)
    for page in category_pages:
        pool = Pool(10)  # ten worker threads per category page
        pool.map(get_links, page)
        # [get_links(p) for p in page]  # serial fallback for debugging
    print(accumulator.json_)
    # json.dump takes the object first, then the file handle
    json.dump(accumulator.json_, open('mbfc.json', 'w'))
cat_json()
'''
TODO:
Make better variables and less hacky error handling
'''
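
A minimal sketch of where that TODO could go, assuming multiprocessing.dummy is swapped for concurrent.futures and get_links is reworked to return its results dict instead of appending to accumulator.json_. scrape_category and MAX_WORKERS are hypothetical names, not part of this commit:

from concurrent.futures import ThreadPoolExecutor

MAX_WORKERS = 10  # mirrors Pool(10) above; purely an assumed setting

def scrape_category(cat):
    # Fan the category's outlet links out over a thread pool and keep
    # only successful rows (assumes get_links returns its results dict).
    links = cat_links(cat)
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        rows = list(pool.map(get_links, links))
    return [row for row in rows if row and 'error' not in row]

Exiting the with block joins the workers, and returning rows instead of mutating accumulator.json_ keeps error handling local to each call.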
