In [1]:
import os
import csv
import numpy as np
import pandas as pd

In [2]:
files = [
    'credco_webconf_study_1_study_1_project_1_2018_02_21t22_43_18_00_00_anon_nolink.csv',
#    'credco_webconf_study_2_study_2_project_1_2018_02_21t22_44_07_00_00_anon_nolink.csv',
#    'credco_webconf_study_3_study_3_project_1_2018_02_21t22_44_40_00_00_anon_nolink.csv'
]

In [3]:
file_paths = [os.path.join('credibilitycoalition-webconf-2018', 'data', file) for file in files]
print(file_paths)

['credibilitycoalition-webconf-2018/data/credco_webconf_study_1_study_1_project_1_2018_02_21t22_43_18_00_00_anon_nolink.csv']


In [4]:
data = []
labels = []
for file_path in file_paths:
    with open(file_path) as csvfile:
        csv_reader = csv.reader(csvfile)
        for idx, row in enumerate(csv_reader):
            if idx == 0:
                labels.append(row)
            if idx > 0:
                data.append(row)

In [5]:
# labels are of different size but the largest one is a superset of all the others
# so we'll use the labels array that is the largest in size
label = labels[np.argmax([len(label) for label in labels])]

In [6]:
# function will extract a column of data given the index
get_data_col = lambda data, idx: [col[idx] for col in data if idx < len(col)]

# function will extract data columns given titles from the master label
get_data_col_from_titles = lambda data, titles, label: [get_data_col(data, label.index(col)) for col in titles] 

# Get Report Data

In [7]:
import re

r = re.compile('report_title')
report_title_labels = list(filter(r.match, label))
print('There are {} report_title_label columns'.format(len(report_title_labels)))
print(report_title_labels)

r = re.compile('media_content')
media_content_labels = list(filter(r.match, label))
print('There are {} media_content_label columns'.format(len(media_content_labels)))
print(media_content_labels)

r = re.compile('media_url')
media_urls_labels = list(filter(r.match, label))
print('There are {} media_urls_label columns'.format(len(media_urls_labels)))
print(media_urls_labels)

There are 1 report_title_label columns
['report_title']
There are 1 media_content_label columns
['media_content']
There are 1 media_urls_label columns
['media_url']


In [8]:
report_titles = get_data_col_from_titles(data, report_title_labels, label)[0]
print('There are {} report_title rows'.format(len(report_titles)))

media_content = get_data_col_from_titles(data, media_content_labels, label)[0]
print('There are {} media_content rows'.format(len(media_content)))

media_urls = get_data_col_from_titles(data, media_urls_labels, label)[0]
print('There are {} media_url rows'.format(len(media_urls)))

There are 50 report_title rows
There are 50 media_content rows
There are 50 media_url rows


In [9]:
print('There are {} unique report_title rows'.format(len(set(report_titles))))
print('There are {} unique media_content rows'.format(len(set(media_content))))
print('There are {} unique media_url rows'.format(len(set(media_urls))))

There are 47 unique report_title rows
There are 46 unique media_content rows
There are 50 unique media_url rows


In [10]:
import requests
#from bs4 import BeautifulSoup

invalid_urls = []
for url in set(media_urls):
    print('Fetching URL: {}'.format(url))
    try:
        s = requests.Session()
        s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36'
        r = s.get(url)
        if not r.ok:
            invalid_urls.append(url)
    except Exception as e:
        print('Error fetching: {}'.format(url))
        print(e)
        invalid_urls.append(url)
    finally:
        s.close()

Fetching URL: http://www.breitbart.com/big-government/2017/06/13/delingpole-ship-of-fools-iii-global-warming-study-cancelled-because-of-unprecedented-ice/
Fetching URL: https://www.nytimes.com/interactive/2017/06/09/climate/antarctica-rift-update.html
Fetching URL: https://www.today.com/health/birth-order-first-borns-get-intellectual-advantage-t108042
Fetching URL: https://www.ntd.tv/inspiring/life/9-sleeping-positions-improve-health.html
Fetching URL: http://inshapetoday.com/now-official-fda-announced-vaccines-causing-autism/
Fetching URL: https://www.huffingtonpost.com/entry/vivek-murthy-report-on-drugs-and-alcohol_us_582dce19e4b099512f812e9c
Fetching URL: https://www.naturalnews.com/2017-10-02-lone-gunman-theory-of-las-vegas-shooter-is-complete-nonsense-stephen-paddock.html
Fetching URL: https://ewao.com/2017/09/16/six-pharmaceutical-medicines-that-instantly-make-your-health-worse/
Fetching URL: https://www.littlethings.com/hand-food-and-mouth-disease/
Fetching URL: https://www.thea

### Check Invalid URLs

In [11]:
#[print(invalid_url) for invalid_url in invalid_urls]
[print(invalid_url) for invalid_url in set(invalid_urls)]
print('Number of invalid urls: {}'.format(len(invalid_urls)))
print('Number of unique invalid urls: {}'.format(len(set(invalid_urls))))

https://www.ntd.tv/inspiring/parenting/sam-berns-boy-aging-disease-progeria-dies-17.html
https://awarenessact.com/stop-calling-your-drug-addiction-a-disease/
https://www.ntd.tv/inspiring/life/9-sleeping-positions-improve-health.html
https://ewao.com/2017/08/16/johns-hopkins-researcher-releases-shocking-report-on-flu-vaccines/
http://www.wate.com/news/national-world/measles-making-comeback-as-parents-opt-out-of-vaccines/792747092
http://publichealthabc.com/2-cups-day-1-week-stomach-will-flatter/
https://ewao.com/2017/09/16/six-pharmaceutical-medicines-that-instantly-make-your-health-worse/
http://goodfullness.net/which-banana-would-you-eat-your-answer-may-have-an-effect-on-your-health/
https://info.cmsri.org/the-driven-researcher-blog/vaccinated-vs.-unvaccinated-guess-who-is-sicker
Number of invalid urls: 10
Number of unique invalid urls: 10


In [12]:
media_urls_set = [url for url in set(media_urls)]
report_titles_set = [title for title in set(report_titles)]
media_content_set = [content for content in set(media_content)]

In [13]:
print(len(media_urls_set))
print(len(report_titles_set))
print(len(media_content_set))

50
47
46


In [14]:
# print csv data from invalid urls
for invalid_url in invalid_urls:
    idx = media_urls.index(invalid_url)
    print(media_urls[idx])
    print(report_titles[idx])
    print(media_content[idx])
    print('')

https://www.ntd.tv/inspiring/life/9-sleeping-positions-improve-health.html



https://ewao.com/2017/09/16/six-pharmaceutical-medicines-that-instantly-make-your-health-worse/
Six Pharmaceutical Drugs That Immediately Destroy Your Health
Some pharmaceutical medicines can cause you immediate physiological damage, keep reading to find out which ones! A new CDC report confirmed that half of all Americans take some pharmaceutical drug for their sicknesses. Over 150 million citizens in the U.S. are consuming medications that are tests or even unproven products. This is alarming since taking so much medication can cause many health-threatening problems in the human body. The number of Americans who take medication increased 10% since the early 1990s. It used to be illegal for pharmaceutical companies to push legal drugs onto people, but now with the rise in drug advertisements the number of people using them has snowballed. If you feel caught up in the pharmaceutical drug web, then you should 

## Get Annotated Results for Number of Ads

In [15]:
r = re.compile('task_question_17')
ad_question_labels = list(filter(r.match, label))
print('There are {} ad_question_labels columns'.format(len(ad_question_labels)))
print(ad_question_labels)

r = re.compile('task_answer_17')
ad_answer_labels = list(filter(r.match, label))
print('There are {} ad_answer_labels columns'.format(len(ad_answer_labels)))
print(ad_answer_labels)

There are 1 ad_question_labels columns
['task_question_17']
There are 1 ad_answer_labels columns
['task_answer_17']


In [16]:
ad_question = get_data_col_from_titles(data, ad_question_labels, label)[0]
print('There are {} ad_question rows'.format(len(ad_question)))

ad_answer = get_data_col_from_titles(data, ad_answer_labels, label)[0]
print('There are {} ad_answers rows'.format(len(ad_answer)))

There are 50 ad_question rows
There are 50 ad_answers rows


In [17]:
for title, url, n_ads in zip(report_titles, media_urls, ad_answer):
    print(n_ads, url)

1 https://www.independent.co.uk/news/world/asia/india-floods-bangladesh-nepal-deaths-millions-homeless-latest-news-updates-a7919006.html
8 https://www.ntd.tv/inspiring/life/9-sleeping-positions-improve-health.html
4 https://qz.com/1064364/hurricane-harvey-houstons-flooding-made-worse-by-unchecked-urban-development-and-wetland-destruction
5 https://www.independent.co.uk/life-style/health-and-families/donald-trump-mental-illness-narcisissm-us-president-psychologists-inauguration-crowd-size-paranoia-a7552661.html
0 https://info.cmsri.org/the-driven-researcher-blog/vaccinated-vs.-unvaccinated-guess-who-is-sicker
5 https://medicalxpress.com/news/2017-08-reverse-aging-brain.html
4 http://www.nationspressph.com/2017/02/pls-share-do-not-eat-this-fish-it-is.html
20 https://dailyhealthpost.com/preventing-alzheimers/
4 https://ewao.com/2017/09/16/six-pharmaceutical-medicines-that-instantly-make-your-health-worse/
4 https://mic.com/articles/176092/under-the-gop-s-health-plan-sexual-assault-would-b

# Count Ads

In [18]:
from bs4 import BeautifulSoup

url = media_urls[19]
print(url)

s = requests.Session()
s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36'
r = s.get(url)
if r.ok:
    print(r)

https://www.naturalnews.com/2017-10-02-lone-gunman-theory-of-las-vegas-shooter-is-complete-nonsense-stephen-paddock.html
<Response [200]>


In [19]:
soup = BeautifulSoup(r.content, 'html')
#print(soup.prettify()) # print the parsed data of html

In [20]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="en-US" xmlns:fb="http://ogp.me/ns/fb#" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <title>
   MISSION IMPOSSIBLE: Official story of Las Vegas shooting unravels; physical impossibility of lone gunman senior citizen makes narrative ludicrous – NaturalNews.com
  </title>
  <meta content="noodp,noydir" name="robots"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <!-- ///////////////////////////////////////////////////////////////////////////// -->
  <meta content="health ranger,automatic weapons,conspiracy theory,investigation,isis,las vegas,mandalay bay,mass shooting,scapegoat,stephen paddock,terrorism" name="keywords"/>
  <!-- ///////////////////////////////////////////////////////////////////////////// -->
  <style type="text/css">
   .slide-excerpt { width: 50%; }
			.slide-excerpt { bottom: 0; }
			.slide-excerpt { right: 0; }
			.flexslider { max-width: 920px; max-height: 400px; }
			.slide-image { max-hei

In [21]:
# assume that adds are wrapped in a <div> tag and has an `id` and a `class` attribute where the `class` contains the start of the word "ad" in it
#re_soup = soup.find_all('div', id=re.compile('.*'), class_=re.compile('[^\w]ad[s|\w]?'))
#re_soup = soup.find_all('div', class_=re.compile('[^\w][Aa][Dd]'))
#re_soup = soup.find_all('div', class_=re.compile('[Aa][Dd]'))
re_soup = soup.find_all(text=re.compile('[sS]ponsor'))

print('Found {} tags'.format(len(re_soup)))
for idx, result in enumerate(re_soup):
    print(idx, result)

Found 1 tags
0 Natural News Wire (Sponsored Content)


In [22]:
# assume that adds are wrapped in a <div> tag and has an `id` and a `class` attribute where the `class` contains the start of the word "ad" in it
#re_soup = soup.find_all('div', id=re.compile('.*'), class_=re.compile('[^\w]ad[s|\w]?'))
re_soup = soup.find_all('div', class_=re.compile('[^\w][Aa][Dd]'))

print('Found {} tags'.format(len(re_soup)))
for idx, result in enumerate(re_soup):
    print(idx, result)

Found 2 tags
0 <div class="addtoany_share_save_container addtoany_content_bottom"><div class="a2a_kit a2a_kit_size_32 addtoany_list" data-a2a-title="MISSION IMPOSSIBLE: Official story of Las Vegas shooting unravels; physical impossibility of lone gunman senior citizen makes narrative ludicrous" data-a2a-url="https://www.naturalnews.com/2017-10-02-lone-gunman-theory-of-las-vegas-shooter-is-complete-nonsense-stephen-paddock.html"><a class="a2a_button_facebook" href="http://www.addtoany.com/add_to/facebook?linkurl=https://www.naturalnews.com/2017-10-02-lone-gunman-theory-of-las-vegas-shooter-is-complete-nonsense-stephen-paddock.html&amp;linkname=MISSION%20IMPOSSIBLE%3A%20Official%20story%20of%20Las%20Vegas%20shooting%20unravels%3B%20physical%20impossibility%20of%20lone%20gunman%20senior%20citizen%20makes%20narrative%20ludicrous" rel="nofollow" target="_blank" title="Facebook"></a><a class="a2a_button_twitter" href="http://www.addtoany.com/add_to/twitter?linkurl=https://www.naturalnews.com

In [23]:
for url in media_urls:
    try:
        s = requests.Session()
        s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36'
        r = s.get(url)
        if not r.ok:
            print('Error fetching {}'.format(url))
            continue
        soup = BeautifulSoup(r.content, 'html')
        ad_soup = soup.find_all('div', id=re.compile('.*'),class_=re.compile('[^\w]ad[s|\w]?'))
        sponsor_soup = soup.find_all(text=re.compile('[sS]ponsor'))


        print(f'#ads: {len(ad_soup) + len(sponsor_soup)} {url}')
    except Exception as e:
        print('Error fetching {url}')

#ads: 3 https://www.independent.co.uk/news/world/asia/india-floods-bangladesh-nepal-deaths-millions-homeless-latest-news-updates-a7919006.html
Error fetching https://www.ntd.tv/inspiring/life/9-sleeping-positions-improve-health.html
#ads: 0 https://qz.com/1064364/hurricane-harvey-houstons-flooding-made-worse-by-unchecked-urban-development-and-wetland-destruction
#ads: 4 https://www.independent.co.uk/life-style/health-and-families/donald-trump-mental-illness-narcisissm-us-president-psychologists-inauguration-crowd-size-paranoia-a7552661.html
Error fetching https://info.cmsri.org/the-driven-researcher-blog/vaccinated-vs.-unvaccinated-guess-who-is-sicker
#ads: 1 https://medicalxpress.com/news/2017-08-reverse-aging-brain.html
#ads: 1 http://www.nationspressph.com/2017/02/pls-share-do-not-eat-this-fish-it-is.html
#ads: 0 https://dailyhealthpost.com/preventing-alzheimers/
Error fetching https://ewao.com/2017/09/16/six-pharmaceutical-medicines-that-instantly-make-your-health-worse/
#ads: 1 ht