In [1]:
import csv
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# File names of the three study exports (studies 1-3) used by this notebook.
# They are joined onto the data directory when `file_paths` is built.
files = [
    'credco_webconf_study_1_study_1_project_1_2018_02_21t22_43_18_00_00_anon_nolink.csv',
    'credco_webconf_study_2_study_2_project_1_2018_02_21t22_44_07_00_00_anon_nolink.csv',
    'credco_webconf_study_3_study_3_project_1_2018_02_21t22_44_40_00_00_anon_nolink.csv'
]

In [3]:
# Prefix every export name with the data directory of the repository checkout.
file_paths = [
    os.path.join('credibilitycoalition-webconf-2018', 'data', csv_name)
    for csv_name in files
]
print(file_paths)

['credibilitycoalition-webconf-2018/data/credco_webconf_study_1_study_1_project_1_2018_02_21t22_43_18_00_00_anon_nolink.csv', 'credibilitycoalition-webconf-2018/data/credco_webconf_study_2_study_2_project_1_2018_02_21t22_44_07_00_00_anon_nolink.csv', 'credibilitycoalition-webconf-2018/data/credco_webconf_study_3_study_3_project_1_2018_02_21t22_44_40_00_00_anon_nolink.csv']


In [4]:
# Read every study export. Each file's header row is kept separately in
# `labels` (one entry per file), while the remaining rows of all files are
# pooled into `data`.
data = []
labels = []
for file_path in file_paths:
    # newline='' is what the csv module asks for, so that quoted fields
    # containing line breaks are parsed correctly on every platform.
    with open(file_path, newline='') as csvfile:
        csv_reader = csv.reader(csvfile)
        header = next(csv_reader, None)
        if header is None:
            # Empty file: it contributes neither a header nor any rows.
            continue
        labels.append(header)
        data.extend(csv_reader)

In [5]:
# The per-file header rows differ in length. The longest one is used as the
# master header, on the assumption (made by the original analysis) that it
# contains every column of the shorter ones. Ties resolve to the first file.
label = max(labels, key=len)

In [6]:
def get_data_col(data, idx):
    """Return the value at position ``idx`` of every row in ``data``.

    Rows that have ``idx`` or fewer fields are skipped rather than padded, so
    the result can be shorter than ``data``. When that happens, columns
    extracted separately no longer line up row by row.
    """
    return [row[idx] for row in data if idx < len(row)]


def get_data_col_from_titles(data, titles, label):
    """Return one extracted column per entry of ``titles``.

    Each title is located in the master header ``label`` and the resulting
    position is passed to ``get_data_col``. The output is a list of columns in
    the same order as ``titles``.

    Raises ``ValueError`` if a title is not present in ``label``.

    NOTE(review): this assumes every row in ``data`` follows the column order
    of ``label`` -- confirm for rows that came from files with shorter headers.
    """
    return [get_data_col(data, label.index(title)) for title in titles]

# Get Report Data

In [7]:
def find_labels(prefix, label):
    """Return the entries of ``label`` that start with ``prefix``.

    ``prefix`` is compiled as a regular expression and applied with
    ``match``, so it is anchored at the beginning of each entry only.
    The compiled pattern stays local to this helper instead of being bound
    to the notebook-level name ``r``, which other cells use for the HTTP
    response.
    """
    pattern = re.compile(prefix)
    return [name for name in label if pattern.match(name)]


# Locate the report_title, media_content and media_url columns in the
# master header.
report_title_labels = find_labels('report_title', label)
print('There are {} report_title_label columns'.format(len(report_title_labels)))
print(report_title_labels)

media_content_labels = find_labels('media_content', label)
print('There are {} media_content_label columns'.format(len(media_content_labels)))
print(media_content_labels)

media_urls_labels = find_labels('media_url', label)
print('There are {} media_urls_label columns'.format(len(media_urls_labels)))
print(media_urls_labels)

There are 1 report_title_label columns
['report_title']
There are 1 media_content_label columns
['media_content']
There are 1 media_urls_label columns
['media_url']


In [8]:
# Extract one column per report field. Only the first matching header of
# each *_labels list is used.
report_titles = get_data_col(data, label.index(report_title_labels[0]))
print('There are {} report_title rows'.format(len(report_titles)))

media_content = get_data_col(data, label.index(media_content_labels[0]))
print('There are {} media_content rows'.format(len(media_content)))

media_urls = get_data_col(data, label.index(media_urls_labels[0]))
print('There are {} media_url rows'.format(len(media_urls)))

There are 150 report_title rows
There are 150 media_content rows
There are 150 media_url rows


In [9]:
# Report how many distinct values each extracted column holds.
for template, column in (
    ('There are {} unique report_title rows', report_titles),
    ('There are {} unique media_content rows', media_content),
    ('There are {} unique media_url rows', media_urls),
):
    print(template.format(len(set(column))))

There are 47 unique report_title rows
There are 46 unique media_content rows
There are 50 unique media_url rows


In [10]:
# Probe every distinct article URL once and collect the ones that cannot be
# fetched: either the request raised, or the server answered with an error
# status (`Response.ok` is False for status codes of 400 and above).

# Seconds to wait on a single request. Without a timeout a stalled server
# can block this cell until the kernel is interrupted.
REQUEST_TIMEOUT_S = 10

invalid_urls = []
# One session for the whole run: it is closed by the `with` block even when
# a request raises, and connections can be reused between requests.
with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36'
    # sorted() makes the probe order, and therefore the log and the order of
    # `invalid_urls`, reproducible between runs.
    for url in sorted(set(media_urls)):
        print('Fetching URL: {}'.format(url))
        try:
            response = s.get(url, timeout=REQUEST_TIMEOUT_S)
        except Exception as e:
            # Best effort: any failure marks the URL as invalid and the
            # probe moves on to the next one.
            print('Error fetching: {}'.format(url))
            print(e)
            invalid_urls.append(url)
        else:
            if not response.ok:
                invalid_urls.append(url)

Fetching URL: https://www.bbc.com/earth/story/20170504-there-are-diseases-hidden-in-ice-and-they-are-waking-up
Fetching URL: https://www.simplemost.com/new-study-says-putting-kids-to-bed-earlier-is-better-for-moms-sanity/
Fetching URL: http://publichealthabc.com/2-cups-day-1-week-stomach-will-flatter/
Fetching URL: https://info.cmsri.org/the-driven-researcher-blog/vaccinated-vs.-unvaccinated-guess-who-is-sicker
Fetching URL: http://www.iflscience.com/health-and-medicine/coconut-oil-bad/
Fetching URL: https://www.theguardian.com/environment/2017/may/19/arctic-stronghold-of-worlds-seeds-flooded-after-permafrost-melts
Fetching URL: http://www.iflscience.com/environment/heavy-monsoon-causing-extreme-flooding-in-south-asia-has-killed-1200-people-so-far/
Fetching URL: https://www.nytimes.com/2017/03/23/well/move/the-best-exercise-for-aging-muscles.html
Fetching URL: https://www.npr.org/sections/health-shots/2017/07/16/537075018/dirt-is-good-why-kids-need-exposure-to-germs
Fetching URL: https

KeyboardInterrupt: 

## Check Invalid URLs

In [None]:
# List each distinct failing URL, then report the total and distinct counts.
unique_invalid_urls = set(invalid_urls)
for invalid_url in unique_invalid_urls:
    print(invalid_url)
print('Number of invalid urls: {}'.format(len(invalid_urls)))
print('Number of unique invalid urls: {}'.format(len(unique_invalid_urls)))

In [None]:
# De-duplicated copies of each column. Despite the `_set` suffix these are
# lists, in the arbitrary order produced by iterating a set.
media_urls_set = list(set(media_urls))
report_titles_set = list(set(report_titles))
media_content_set = list(set(media_content))

In [None]:
# Number of distinct URLs, titles and contents, one per line.
print(
    len(media_urls_set),
    len(report_titles_set),
    len(media_content_set),
    sep='\n',
)

In [None]:
# For every failing URL, show the first CSV row that references it:
# URL, report title and media content, followed by a blank separator line.
for bad_url in invalid_urls:
    row = media_urls.index(bad_url)
    print(media_urls[row], report_titles[row], media_content[row], '', sep='\n')

## Get Annotated Results for Number of Ads

In [48]:
# Locate the task 17 question/answer columns in the master header. This
# notebook uses them as the annotated number of ads.
# The patterns are deliberately not bound to `r`: that name holds the HTTP
# response consumed when the soup is built, and rebinding it here breaks a
# re-run of that cell when cells are executed out of order.
ad_question_labels = [name for name in label if re.match('task_question_17', name)]
print('There are {} ad_question_labels columns'.format(len(ad_question_labels)))
print(ad_question_labels)

ad_answer_labels = [name for name in label if re.match('task_answer_17', name)]
print('There are {} ad_answer_labels columns'.format(len(ad_answer_labels)))
print(ad_answer_labels)

There are 1 ad_question_labels columns
['task_question_17']
There are 1 ad_answer_labels columns
['task_answer_17']


In [50]:
# Extract the question and answer columns. Only the first matching header
# of each *_labels list is used.
ad_question = get_data_col(data, label.index(ad_question_labels[0]))
print('There are {} ad_question rows'.format(len(ad_question)))

ad_answer = get_data_col(data, label.index(ad_answer_labels[0]))
print('There are {} ad_answers rows'.format(len(ad_answer)))

There are 150 ad_question rows
There are 150 ad_answers rows


In [54]:
# Show each annotated ad count next to the URL from the same row. The title
# column still takes part in the zip, so the shortest of the three columns
# decides how many rows are printed.
for _title, page_url, ad_count in zip(report_titles, media_urls, ad_answer):
    print(ad_count, page_url)

1 https://www.independent.co.uk/news/world/asia/india-floods-bangladesh-nepal-deaths-millions-homeless-latest-news-updates-a7919006.html
8 https://www.ntd.tv/inspiring/life/9-sleeping-positions-improve-health.html
4 https://qz.com/1064364/hurricane-harvey-houstons-flooding-made-worse-by-unchecked-urban-development-and-wetland-destruction
5 https://www.independent.co.uk/life-style/health-and-families/donald-trump-mental-illness-narcisissm-us-president-psychologists-inauguration-crowd-size-paranoia-a7552661.html
0 https://info.cmsri.org/the-driven-researcher-blog/vaccinated-vs.-unvaccinated-guess-who-is-sicker
5 https://medicalxpress.com/news/2017-08-reverse-aging-brain.html
4 http://www.nationspressph.com/2017/02/pls-share-do-not-eat-this-fish-it-is.html
20 https://dailyhealthpost.com/preventing-alzheimers/
4 https://ewao.com/2017/09/16/six-pharmaceutical-medicines-that-instantly-make-your-health-worse/
4 https://mic.com/articles/176092/under-the-gop-s-health-plan-sexual-assault-would-b

# Count Ads

In [21]:
# Fetch a single article page (row 3 of the URL column) to experiment on.
url = media_urls[3]
print(url)

# The session is closed as soon as the response has been downloaded; the
# body is read eagerly, so `r.content` remains usable afterwards.
with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36'
    # A timeout keeps a stalled server from blocking the cell indefinitely.
    r = s.get(url, timeout=10)

if r.ok:
    print(r)
else:
    # Make a failed fetch visible instead of silently parsing an error page.
    print('Request failed with status {}'.format(r.status_code))

https://www.independent.co.uk/life-style/health-and-families/donald-trump-mental-illness-narcisissm-us-president-psychologists-inauguration-crowd-size-paranoia-a7552661.html
<Response [200]>


In [25]:
# Parse the downloaded page. The parser is named explicitly: 'html' is not a
# parser name, so Beautiful Soup would pick whichever HTML parser happens to
# be installed and the resulting tree could differ between environments.
soup = BeautifulSoup(r.content, 'html.parser')
# print(soup.prettify())  # uncomment to inspect the parsed HTML

In [43]:
# Assume that ads are wrapped in a <div> whose class contains a word that
# starts with "ad" (e.g. "ad-wrapper").
# r'\bad' anchors "ad" at a word boundary, so it also matches when the class
# value itself begins with "ad". The previous pattern required a non-word
# character in front of "ad" and therefore missed that case. The raw string
# avoids the invalid '\w' escape of a normal string literal.
# Words such as "add" or "address" still match; this is a heuristic.
re_soup = soup.find_all('div', class_=re.compile(r'\bad'))

print('Found {} tags'.format(len(re_soup)))
for idx, result in enumerate(re_soup):
    print(idx, result)

Found 12 tags
0 <div class="ad-wrapper ad-wrapper--mobile ad-wrapper--mpu0" id="mpu0ArticleBody">
</div>
1 <div class="ad-wrapper ad-wrapper--teads" style="margin-bottom: 20px"><div id="teads-inread"></div></div>
2 <div class="ad-wrapper ad-wrapper--mobile" id="teads-amp-ad">
<amp-ad class="desktop-hidden ad-center i-amphtml-layout-responsive i-amphtml-layout-size-defined" data-slot="/71347885/_main_independent/in_life_style/in_health_and_families/in_health_and_families_article" data-use-same-domain-rendering-until-deprecated="" height="1" i-amphtml-layout="responsive" json='{"targeting":{"tile":"teads","gs_channels":["esi_safe","safe_from_nestle_blacklist","safe_from_emirates_blacklist","shadow9hu7_safe_from_essence_blacklist","gv_safe","gv_safe_adult","gv_safe_arms","gv_safe_crime","gv_safe_death_injury","gv_safe_download","gv_safe_drugs","gv_safe_hatespeech","gv_safe_military","gv_safe_obscenity","gv_safe_terrorism","gv_safe_tobacco","pos_amazon_carnival_row_01","pos_amazon_carnival