In [1]:
import os
import csv
import numpy as np
import pandas as pd

In [2]:
files = [
    'credco_webconf_study_1_study_1_project_1_2018_02_21t22_43_18_00_00_anon_nolink.csv',
#    'credco_webconf_study_2_study_2_project_1_2018_02_21t22_44_07_00_00_anon_nolink.csv',
#    'credco_webconf_study_3_study_3_project_1_2018_02_21t22_44_40_00_00_anon_nolink.csv'
]

In [3]:
file_paths = [os.path.join('credibilitycoalition-webconf-2018', 'data', file) for file in files]
print(file_paths)

['credibilitycoalition-webconf-2018/data/credco_webconf_study_1_study_1_project_1_2018_02_21t22_43_18_00_00_anon_nolink.csv']


In [4]:
data = []
labels = []
for file_path in file_paths:
    with open(file_path) as csvfile:
        csv_reader = csv.reader(csvfile)
        for idx, row in enumerate(csv_reader):
            if idx == 0:
                labels.append(row)
            if idx > 0:
                data.append(row)

In [5]:
# labels are of different size but the largest one is a superset of all the others
# so we'll use the labels array that is the largest in size
label = labels[np.argmax([len(label) for label in labels])]

In [6]:
# function will extract a column of data given the index
get_data_col = lambda data, idx: [col[idx] for col in data if idx < len(col)]

# function will extract data columns given titles from the master label
get_data_col_from_titles = lambda data, titles, label: [get_data_col(data, label.index(col)) for col in titles] 

# Get Report Data

In [7]:
import re

r = re.compile('report_title')
report_title_col = list(filter(r.match, label))
report_title_labels = get_data_col_from_titles(data, report_title_col, label)[0]
print('There are {} report_title_label columns'.format(len(report_title_labels)))

r = re.compile('media_content')
media_content_col = list(filter(r.match, label))
media_content_labels = get_data_col_from_titles(data, media_content_col, label)[0]
print('There are {} media_content_label columns'.format(len(media_content_labels)))

r = re.compile('media_url')
media_urls_col = list(filter(r.match, label))
media_urls_labels = get_data_col_from_titles(data, media_urls_col, label)[0]
print('There are {} media_urls_label columns'.format(len(media_urls_labels)))

### Credibility/Originality/Fact-Checked

r = re.compile('task_answer_1') #Very Low/Low/Medium/High/Very High
credibility_col = list(filter(r.match, label))
credibility_labels = get_data_col_from_titles(data, credibility_col, label)[0]

r = re.compile('task_answer_2') # original
originality_col = list(filter(r.match, label))
originality_labels = get_data_col_from_titles(data, originality_col, label)[0]

r = re.compile('task_answer_3') # only if task_answer_2 was not original
originality_attribution_col = list(filter(r.match, label))
originality_attribution_labels = get_data_col_from_titles(data, originality_attribution_col, label)[0]

r = re.compile('task_answer_4') # task_note_4 contains fact-checked source
fact_checked_col = list(filter(r.match, label))
fact_checked_labels = get_data_col_from_titles(data, fact_checked_col, label)[0]

### Source 1

r = re.compile('task_answer_5') # task_note_5 contains source's name
source_1_category_col = list(filter(r.match, label))
source_1_category_labels = get_data_col_from_titles(data, source_1_category_col, label)[0]

r = re.compile('task_answer_6') # True or False, notes contain link
source_1_link_provided_col = list(filter(r.match, label))
source_1_link_provided_labels = get_data_col_from_titles(data, source_1_link_provided_col, label)[0]

r = re.compile('task_answer_7') # scalar 0 - 100
source_1_impact_factor_if_found_col = list(filter(r.match, label))
source_1_impact_factor_if_found_labels = get_data_col_from_titles(data, source_1_impact_factor_if_found_col, label)[0]

r = re.compile('task_answer_8') # somewhat/strongly agree/disagree/neutral
source_1_characterized_correctly_col = list(filter(r.match, label))
source_1_characterized_correctly_labels = get_data_col_from_titles(data, source_1_characterized_correctly_col, label)[0]

### Source 2

r = re.compile('task_answer_9') # task_note_5 contains source's name
source_2_category_col = list(filter(r.match, label))
source_2_category_labels = get_data_col_from_titles(data, source_2_category_col, label)[0]

r = re.compile('task_answer_10') # True or False, notes contain link
source_2_link_provided_col = list(filter(r.match, label))
source_2_link_provided_labels = get_data_col_from_titles(data, source_2_link_provided_col, label)[0]

r = re.compile('task_answer_11') # scalar 0 - 100
source_2_impact_factor_if_found_col = list(filter(r.match, label))
source_2_impact_factor_if_found_labels = get_data_col_from_titles(data, source_2_impact_factor_if_found_col, label)[0]

r = re.compile('task_answer_12') # somewhat/strongly agree/disagree/neutral
source_2_characterized_correctly_col = list(filter(r.match, label))
source_2_characterized_correctly_labels = get_data_col_from_titles(data, source_2_characterized_correctly_col, label)[0]

### Source 3

r = re.compile('task_answer_13') # task_note_5 contains source's name
source_3_category_col = list(filter(r.match, label))
source_3_category_labels = get_data_col_from_titles(data, source_3_category_col, label)[0]

r = re.compile('task_answer_14') # True or False, notes contain link
source_3_link_provided_col = list(filter(r.match, label))
source_3_link_provided_labels = get_data_col_from_titles(data, source_3_link_provided_col, label)[0]

r = re.compile('task_answer_15') # scalar 0 - 100
source_3_impact_factor_if_found_col = list(filter(r.match, label))
source_3_impact_factor_if_found_labels = get_data_col_from_titles(data, source_3_impact_factor_if_found_col, label)[0]

r = re.compile('task_answer_16') # somewhat/strongly agree/disagree/neutral
source_3_characterized_correctly_col = list(filter(r.match, label))
source_3_characterized_correctly_labels = get_data_col_from_titles(data, source_3_characterized_correctly_col, label)[0]

### Contextual

r = re.compile('task_answer_17') # scalar
num_ads_col = list(filter(r.match, label))
num_ads_labels = get_data_col_from_titles(data, num_ads_col, label)[0]

r = re.compile('task_answer_18') # scalar
num_content_recommendation_boxes_col = list(filter(r.match, label))
num_content_recommendation_boxes_labels = get_data_col_from_titles(data, num_content_recommendation_boxes_col, label)[0]

r = re.compile('task_answer_19') # scalar
num_links_sponsored_content_col = list(filter(r.match, label))
num_links_sponsored_content_labels = get_data_col_from_titles(data, num_links_sponsored_content_col, label)[0]

r = re.compile('task_answer_20') # scalar
num_calls_to_socal_shares_col = list(filter(r.match, label))
num_calls_to_socal_shares_labels = get_data_col_from_titles(data, num_calls_to_socal_shares_col, label)[0]

r = re.compile('task_answer_21') # scalar
num_calls_to_join_mailing_list_col = list(filter(r.match, label))
num_calls_to_join_mailing_list_labels = get_data_col_from_titles(data, num_calls_to_join_mailing_list_col, label)[0]

r = re.compile('task_answer_22') # somewhat/strongly agree/disagree/neutral
article_has_spammy_clickbaity_ads_col = list(filter(r.match, label))
article_has_spammy_clickbaity_ads_labels = get_data_col_from_titles(data, article_has_spammy_clickbaity_ads_col, label)[0]

r = re.compile('task_answer_23') # somewhat/strongly agree/disagree/neutral
article_has_aggressive_ads_social_shares_mailing_list_col = list(filter(r.match, label))
article_has_aggressive_ads_social_shares_mailing_list_labels = get_data_col_from_titles(data, article_has_aggressive_ads_social_shares_mailing_list_col, label)[0]

r = re.compile('task_answer_24') # somewhat/strongly agree/disagree/neutral
credibility_post_article_col = list(filter(r.match, label))
credibility_post_article_label = get_data_col_from_titles(data, credibility_post_article_col, label)[0]

There are 50 report_title_label columns
There are 50 media_content_label columns
There are 50 media_urls_label columns


In [8]:
data = np.array([
                 # article raw data
                 report_title_labels,
                 media_content_labels,
                 media_urls_labels,
                
                 # creditibility/originality/fact-checked
                 credibility_labels,
                 originality_labels,
                 originality_attribution_labels,
                 fact_checked_labels,
    
                 # source 1
                 source_1_category_labels,
                 source_1_link_provided_labels,
                 source_1_impact_factor_if_found_labels,
                 source_1_characterized_correctly_labels,
    
                 # source 2
                 source_2_category_labels,
                 source_2_link_provided_labels,
                 source_2_impact_factor_if_found_labels,
                 source_2_characterized_correctly_labels,
    
                 # source 3
                 source_3_category_labels,
                 source_3_link_provided_labels,
                 source_3_impact_factor_if_found_labels,
                 source_3_characterized_correctly_labels,
    
                 # contextual
                 num_ads_labels,
                 num_content_recommendation_boxes_labels,
                 num_links_sponsored_content_labels,
                 num_calls_to_socal_shares_labels,
                 num_calls_to_join_mailing_list_labels,
                 article_has_spammy_clickbaity_ads_labels,
                 article_has_aggressive_ads_social_shares_mailing_list_labels,
                 credibility_post_article_label
                ])
article_idx = ['Article {}'.format(idx+1) for idx in range(np.shape(data)[1])]
article_indicator_label = ['Title', 'Content', 'URL', 'Credibility Score', 'Originality Score', 'Attribution Rank', 'Fact-Checked',
                           'Source 1 Category', 'Source 1 Link Provided', 'Source 1 Impact Factor', 'Source 1 Characterized Correctly',
                           'Source 2 Category', 'Source 2 Link Provided', 'Source 2 Impact Factor', 'Source 2 Characterized Correctly',
                           'Source 3 Category', 'Source 3 Link Provided', 'Source 3 Impact Factor', 'Source 3 Characterized Correctly',
                           'Number of Ads', 'Number of Content Recommendations', 'Number of Sponsored Content', 'Number of Social Share Calls',
                           'Number Calls to Join Mailing List', 'Article has Spammy/Clickbaity Ads', 'Article has Aggressive Ads/Calls', 'Credibility Post Article'
                          ]
print('There are {} articles'.format(np.shape(data)[1]))

There are 50 articles


### Show the Selected Data from the dataset

In [9]:
dataset = pd.DataFrame(data, index=article_indicator_label, columns=article_idx)
display(dataset)

Unnamed: 0,Article 1,Article 2,Article 3,Article 4,Article 5,Article 6,Article 7,Article 8,Article 9,Article 10,...,Article 41,Article 42,Article 43,Article 44,Article 45,Article 46,Article 47,Article 48,Article 49,Article 50
Title,"Floods in India, Bangladesh and Nepal kill 1,2...",,Houston’s flooding shows what happens when you...,Psychologists believe Trump is showing signs o...,Vaccinated vs. Unvaccinated: Mawson Homeschool...,Dancing can reverse the signs of aging in the ...,Nations Press: PLS SHARE: DO NOT EAT THIS FISH...,"Preventing Alzheimer's Disease, Dementia & Cog...",Six Pharmaceutical Drugs That Immediately Dest...,"Under the GOP's health plan, sexual assault co...",...,"The oldest child is actually the smartest, stu...",DELINGPOLE: Global Warming Study Cancelled Bec...,Which Banana Would You Eat? Your Answer May Ha...,,,Opinion | How the Anti-Vaxxers Are Winning,Surgeon Gen.: Addiction Is A Chronic Brain Dis...,Study Finds Stevia Kills Lyme Disease Pathogen...,Diet drinks TRIPLE your risk of stroke and dem...,"The Amish, Who Don’t Get Vaccinated, Rarely Ge..."
Content,"At least 1,200 people have been killed and mil...",,The city's gung-ho approach to development has...,Hillary Clinton claimed Trump was “temperament...,The Mawson study is a groundbreaking study of ...,As we grow older we suffer a decline in mental...,,Preventing Alzheimer's naturally is as simple ...,Some pharmaceutical medicines can cause you im...,The bill would also mean insurers could consid...,...,"Birth order doesn't affect personality, but it...",A global warming research study in Canada has ...,"When we eat bananas, we do so because we are h...",,,A major measles outbreak in America is only a ...,The way forward includes needle exchanges and ...,Lyme disease is a very complicated disease to ...,"The Boston University study of almost 4,400 ad...",Do not talk about whether vaccines work or not...
URL,https://www.independent.co.uk/news/world/asia/...,https://www.ntd.tv/inspiring/life/9-sleeping-p...,https://qz.com/1064364/hurricane-harvey-housto...,https://www.independent.co.uk/life-style/healt...,https://info.cmsri.org/the-driven-researcher-b...,https://medicalxpress.com/news/2017-08-reverse...,http://www.nationspressph.com/2017/02/pls-shar...,https://dailyhealthpost.com/preventing-alzheim...,https://ewao.com/2017/09/16/six-pharmaceutical...,https://mic.com/articles/176092/under-the-gop-...,...,https://www.today.com/health/birth-order-first...,http://www.breitbart.com/big-government/2017/0...,http://goodfullness.net/which-banana-would-you...,https://www.ntd.tv/inspiring/parenting/sam-ber...,http://www.iflscience.com/health-and-medicine/...,https://www.nytimes.com/2017/02/08/opinion/how...,https://www.huffingtonpost.com/entry/vivek-mur...,https://www.collective-evolution.com/2017/01/2...,http://www.dailymail.co.uk/~/article-4429790/i...,https://worldtruth.tv/the-amish-who-dont-get-v...
Credibility Score,Somewhat high credibility,Somewhat low credibility,Very high credibility,Medium credibility,Somewhat low credibility,Medium credibility,Very low credibility,Somewhat low credibility,Very low credibility,Medium credibility,...,Somewhat high credibility,Very low credibility,Somewhat low credibility,Medium credibility,Somewhat low credibility,Very high credibility,Very high credibility,Somewhat low credibility,Somewhat low credibility,Very low credibility
Originality Score,A) Most likely original,A) Most likely original,A) Most likely original,A) Most likely original,A) Most likely original,D) A wholesale duplicate of another article,D) A wholesale duplicate of another article,A) Most likely original,A) Most likely original,"C) Extensive quoting from another source, with...",...,A) Most likely original,A) Most likely original,A) Most likely original,D) A wholesale duplicate of another article,A) Most likely original,A) Most likely original,A) Most likely original,"C) Extensive quoting from another source, with...",A) Most likely original,A) Most likely original
Attribution Rank,,,,,,D) Unclear which is the original,B) Attribution was given but was inaccurate,D) Unclear which is the original,D) Unclear which is the original,C) Attribution was given and was accurate,...,,,,D) Unclear which is the original,,,,A) Attribution was not given,,
Fact-Checked,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,C) Fact-checked and determined false,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,C) Fact-checked and determined false,C) Fact-checked and determined false,...,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,A) No central claim,B) Most likely not fact-checked by an approved...,E) Fact-checked with unclear results,A) No central claim,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,C) Fact-checked and determined false
Source 1 Category,An organization,An organization,An organization,An organization,A scientific study,A scientific study,,An organization,An organization,An organization,...,A scientific study,An organization,,An organization,A scientific study,An organization,A scientific study,An organization,A scientific study,Autism Research Articles
Source 1 Link Provided,true,true,true,true,true,true,,true,true,true,...,true,true,,true,true,false,true,false,false,true
Source 1 Impact Factor,could not locate,could not locate,could not locate,could not locate,3.991 (may be out of date),3.634,,could not locate,6.99,could not locate,...,2.371,could not locate,,could not locate,19.309,Could not locate.,could not locate,Could not locate.,Could not locate.,Could not locate.


# Map Select Data to Values

In [10]:
credibility_label_to_value = {'Very low credibility': 0,
                              'Somewhat low credibility': 1,
                              'Medium credibility': 2,
                              'Somewhat high credibility': 3,
                              'Very high credibility': 4,
                              # post article credibility
                              'No change': 5,
                              '': 6
                             }
originality_label_to_value = {'A) Most likely original': 0,
                              'B) Appears to be a copy of one or more articles, with some portions different or remixed': 1,
                              'C) Extensive quoting from another source, with some original content': 2,
                              'D) A wholesale duplicate of another article': 3,
                             }
attribution_rank_to_value = {'': 0,
                             'A) Attribution was not given': 1,
                             'B) Attribution was given but was inaccurate': 2,
                             'C) Attribution was given and was accurate': 3,
                             'D) Unclear which is the original': 4
                            }
fact_checked_label_to_value = {'A) No central claim': 0,
                               'B) Most likely not fact-checked by an approved source': 1,
                               'C) Fact-checked and determined false': 2,
                               'D) Fact-checked and determined true': 3,
                               'E) Fact-checked with unclear results': 4,
                               'F) Fact-checked with mixed results': 5,
                              }
agreement_label_to_value = {'Strongly disagree': 0,
                            'Somewhat disagree': 1,
                            'Neutral': 2,
                            'Somewhat agree': 3,
                            'Strongly agree': 4,
                            'Unable to find source': 5,
                            'Source is behind a paywall': 6,
                            '': 7,
                           }
source_category_to_value = {'': 0, 'A person': 1, 'A scientific study': 2, 'An organization': 3}
source_link_provided_to_value = {'': 0, 'false': 1, 'true': 2}

## Remap the data in-place to alter the original data

In [11]:
dataset.loc['Credibility Score'] = [credibility_label_to_value[X] for X in dataset.loc['Credibility Score']]
dataset.loc['Originality Score'] = [originality_label_to_value[X] for X in dataset.loc['Originality Score']]
dataset.loc['Attribution Rank'] = [attribution_rank_to_value[X] for X in dataset.loc['Attribution Rank']]
dataset.loc['Fact-Checked'] = [fact_checked_label_to_value[X] for X in dataset.loc['Fact-Checked']]

dataset.loc['Source 1 Category'] = [source_category_to_value[X] if X in source_category_to_value else 0 for X in dataset.loc['Source 1 Category']]
dataset.loc['Source 1 Link Provided'] = [source_link_provided_to_value[X] for X in dataset.loc['Source 1 Link Provided']]
#dataset.loc['Source 1 Impact Factor'] = [source_impact_factor_to_value[X] for X in dataset.loc['Source 1 Impact Factor']]
dataset.loc['Source 1 Characterized Correctly'] = [agreement_label_to_value[X] for X in dataset.loc['Source 1 Characterized Correctly']]

dataset.loc['Source 2 Category'] = [source_category_to_value[X] if X in source_category_to_value else 0 for X in dataset.loc['Source 2 Category']]
dataset.loc['Source 2 Link Provided'] = [source_link_provided_to_value[X] for X in dataset.loc['Source 2 Link Provided']]
#dataset.loc['Source 2 Impact Factor'] = [source_impact_factor_to_value[X] for X in dataset.loc['Source 2 Impact Factor']]
dataset.loc['Source 2 Characterized Correctly'] = [agreement_label_to_value[X] for X in dataset.loc['Source 2 Characterized Correctly']]

dataset.loc['Source 3 Category'] = [source_category_to_value[X] if X in source_category_to_value else 0 for X in dataset.loc['Source 3 Category']]
dataset.loc['Source 3 Link Provided'] = [source_link_provided_to_value[X] for X in dataset.loc['Source 3 Link Provided']]
#dataset.loc['Source 3 Impact Factor'] = [source_impact_factor_to_value[X] for X in dataset.loc['Source 3 Impact Factor']]
dataset.loc['Source 3 Characterized Correctly'] = [agreement_label_to_value[X] for X in dataset.loc['Source 3 Characterized Correctly']]

dataset.loc['Article has Spammy/Clickbaity Ads'] = [agreement_label_to_value[X] for X in dataset.loc['Article has Spammy/Clickbaity Ads']]
dataset.loc['Article has Aggressive Ads/Calls'] = [agreement_label_to_value[X] for X in dataset.loc['Article has Aggressive Ads/Calls']]

dataset.loc['Credibility Post Article'] = [credibility_label_to_value[X] for X in dataset.loc['Credibility Post Article']]

In [12]:
dataset.loc['Content'][0]

"At least 1,200 people have been killed and millions have been left homeless following\xa0devastating floods that have hit India, Bangladesh and Nepal, in one of the worst flooding disasters to have affected the region in years. International aid agencies said thousands of villages have been cut off by flooding with people being deprived of\xa0food and clean water for days.\xa0 South Asia\xa0suffers from frequent flooding during the monsoon season, which lasts\xa0from June to September, but authorities have said this year's floods have been much worse.\xa0"

### Display the new altered dataset

In [13]:
display(dataset)
dataset.to_csv('dataset.csv')

Unnamed: 0,Article 1,Article 2,Article 3,Article 4,Article 5,Article 6,Article 7,Article 8,Article 9,Article 10,...,Article 41,Article 42,Article 43,Article 44,Article 45,Article 46,Article 47,Article 48,Article 49,Article 50
Title,"Floods in India, Bangladesh and Nepal kill 1,2...",,Houston’s flooding shows what happens when you...,Psychologists believe Trump is showing signs o...,Vaccinated vs. Unvaccinated: Mawson Homeschool...,Dancing can reverse the signs of aging in the ...,Nations Press: PLS SHARE: DO NOT EAT THIS FISH...,"Preventing Alzheimer's Disease, Dementia & Cog...",Six Pharmaceutical Drugs That Immediately Dest...,"Under the GOP's health plan, sexual assault co...",...,"The oldest child is actually the smartest, stu...",DELINGPOLE: Global Warming Study Cancelled Bec...,Which Banana Would You Eat? Your Answer May Ha...,,,Opinion | How the Anti-Vaxxers Are Winning,Surgeon Gen.: Addiction Is A Chronic Brain Dis...,Study Finds Stevia Kills Lyme Disease Pathogen...,Diet drinks TRIPLE your risk of stroke and dem...,"The Amish, Who Don’t Get Vaccinated, Rarely Ge..."
Content,"At least 1,200 people have been killed and mil...",,The city's gung-ho approach to development has...,Hillary Clinton claimed Trump was “temperament...,The Mawson study is a groundbreaking study of ...,As we grow older we suffer a decline in mental...,,Preventing Alzheimer's naturally is as simple ...,Some pharmaceutical medicines can cause you im...,The bill would also mean insurers could consid...,...,"Birth order doesn't affect personality, but it...",A global warming research study in Canada has ...,"When we eat bananas, we do so because we are h...",,,A major measles outbreak in America is only a ...,The way forward includes needle exchanges and ...,Lyme disease is a very complicated disease to ...,"The Boston University study of almost 4,400 ad...",Do not talk about whether vaccines work or not...
URL,https://www.independent.co.uk/news/world/asia/...,https://www.ntd.tv/inspiring/life/9-sleeping-p...,https://qz.com/1064364/hurricane-harvey-housto...,https://www.independent.co.uk/life-style/healt...,https://info.cmsri.org/the-driven-researcher-b...,https://medicalxpress.com/news/2017-08-reverse...,http://www.nationspressph.com/2017/02/pls-shar...,https://dailyhealthpost.com/preventing-alzheim...,https://ewao.com/2017/09/16/six-pharmaceutical...,https://mic.com/articles/176092/under-the-gop-...,...,https://www.today.com/health/birth-order-first...,http://www.breitbart.com/big-government/2017/0...,http://goodfullness.net/which-banana-would-you...,https://www.ntd.tv/inspiring/parenting/sam-ber...,http://www.iflscience.com/health-and-medicine/...,https://www.nytimes.com/2017/02/08/opinion/how...,https://www.huffingtonpost.com/entry/vivek-mur...,https://www.collective-evolution.com/2017/01/2...,http://www.dailymail.co.uk/~/article-4429790/i...,https://worldtruth.tv/the-amish-who-dont-get-v...
Credibility Score,3,1,4,2,1,2,0,1,0,2,...,3,0,1,2,1,4,4,1,1,0
Originality Score,0,0,0,0,0,3,3,0,0,2,...,0,0,0,3,0,0,0,2,0,0
Attribution Rank,0,0,0,0,0,4,2,4,4,3,...,0,0,0,4,0,0,0,1,0,0
Fact-Checked,1,1,1,1,2,1,1,1,2,2,...,1,1,0,1,4,0,1,1,1,2
Source 1 Category,3,3,3,3,2,2,0,3,3,3,...,2,3,0,3,2,3,2,3,2,0
Source 1 Link Provided,2,2,2,2,2,2,0,2,2,2,...,2,2,0,2,2,1,2,1,1,2
Source 1 Impact Factor,could not locate,could not locate,could not locate,could not locate,3.991 (may be out of date),3.634,,could not locate,6.99,could not locate,...,2.371,could not locate,,could not locate,19.309,Could not locate.,could not locate,Could not locate.,Could not locate.,Could not locate.


# Code below this line is just random stuff and can be ignored

---

## Run sets on some of the answers to see all unique values provided

In [14]:
set(dataset.loc['Attribution Rank'])

{0, 1, 2, 3, 4}

In [15]:
set(dataset.loc['Source 1 Characterized Correctly'])

{0, 1, 2, 3, 4, 5, 7}

In [16]:
set(dataset.loc['Source 1 Impact Factor'])

{'',
 '13.585',
 '17.202',
 '17.303',
 '19.309',
 '2.371',
 '3.02',
 '3.634',
 '3.991 (may be out of date)',
 '5.117',
 '5.454',
 '6.99',
 '8.44',
 'Could not locate',
 'Could not locate.',
 'could not locate'}

In [17]:
set(dataset.loc['Source 1 Category'])

{0, 1, 2, 3}

In [18]:
set(dataset.loc['Source 1 Link Provided'])

{0, 1, 2}

In [19]:
set(dataset.loc['Fact-Checked'])

{0, 1, 2, 3, 4, 5}

In [20]:
set(dataset.loc['Originality Score'])

{0, 1, 2, 3}

In [21]:
set(dataset.loc['Article has Spammy/Clickbaity Ads'])

{0, 1, 2, 3, 4}

In [22]:
set(dataset.loc['Credibility Score'])

{0, 1, 2, 3, 4}

In [23]:
dataset['Article 1']

Title                                Floods in India, Bangladesh and Nepal kill 1,2...
Content                              At least 1,200 people have been killed and mil...
URL                                  https://www.independent.co.uk/news/world/asia/...
Credibility Score                                                                    3
Originality Score                                                                    0
Attribution Rank                                                                     0
Fact-Checked                                                                         1
Source 1 Category                                                                    3
Source 1 Link Provided                                                               2
Source 1 Impact Factor                                                could not locate
Source 1 Characterized Correctly                                                     2
Source 2 Category                          

In [24]:
import requests
#from bs4 import BeautifulSoup

invalid_urls = []
for url in set(media_urls):
    print('Fetching URL: {}'.format(url))
    try:
        s = requests.Session()
        s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36'
        r = s.get(url)
        if not r.ok:
            invalid_urls.append(url)
    except Exception as e:
        print('Error fetching: {}'.format(url))
        print(e)
        invalid_urls.append(url)
    finally:
        s.close()

NameError: name 'media_urls' is not defined

### Check Invalid URLs

In [None]:
#[print(invalid_url) for invalid_url in invalid_urls]
[print(invalid_url) for invalid_url in set(invalid_urls)]
print('Number of invalid urls: {}'.format(len(invalid_urls)))
print('Number of unique invalid urls: {}'.format(len(set(invalid_urls))))

In [None]:
media_urls_set = [url for url in set(media_urls)]
report_titles_set = [title for title in set(report_titles)]
media_content_set = [content for content in set(media_content)]

In [None]:
print(len(media_urls_set))
print(len(report_titles_set))
print(len(media_content_set))

In [None]:
# print csv data from invalid urls
for invalid_url in invalid_urls:
    idx = media_urls.index(invalid_url)
    print(media_urls[idx])
    print(report_titles[idx])
    print(media_content[idx])
    print('')

## Get Annotated Results for Number of Ads

In [None]:
r = re.compile('task_question_17')
ad_question_labels = list(filter(r.match, label))
print('There are {} ad_question_labels columns'.format(len(ad_question_labels)))
print(ad_question_labels)

r = re.compile('task_answer_17')
ad_answer_labels = list(filter(r.match, label))
print('There are {} ad_answer_labels columns'.format(len(ad_answer_labels)))
print(ad_answer_labels)

In [None]:
ad_question = get_data_col_from_titles(data, ad_question_labels, label)[0]
print('There are {} ad_question rows'.format(len(ad_question)))

ad_answer = get_data_col_from_titles(data, ad_answer_labels, label)[0]
print('There are {} ad_answers rows'.format(len(ad_answer)))

In [None]:
for title, url, n_ads in zip(report_titles, media_urls, ad_answer):
    print(n_ads, url)

# Count Ads

In [None]:
from bs4 import BeautifulSoup

url = media_urls[19]
print(url)

s = requests.Session()
s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36'
r = s.get(url)
if r.ok:
    print(r)

In [None]:
soup = BeautifulSoup(r.content, 'html')
#print(soup.prettify()) # print the parsed data of html

In [None]:
print(soup.prettify())

In [None]:
# assume that adds are wrapped in a <div> tag and has an `id` and a `class` attribute where the `class` contains the start of the word "ad" in it
#re_soup = soup.find_all('div', id=re.compile('.*'), class_=re.compile('[^\w]ad[s|\w]?'))
#re_soup = soup.find_all('div', class_=re.compile('[^\w][Aa][Dd]'))
#re_soup = soup.find_all('div', class_=re.compile('[Aa][Dd]'))
re_soup = soup.find_all(text=re.compile('[sS]ponsor'))

print('Found {} tags'.format(len(re_soup)))
for idx, result in enumerate(re_soup):
    print(idx, result)

In [None]:
# assume that adds are wrapped in a <div> tag and has an `id` and a `class` attribute where the `class` contains the start of the word "ad" in it
#re_soup = soup.find_all('div', id=re.compile('.*'), class_=re.compile('[^\w]ad[s|\w]?'))
re_soup = soup.find_all('div', class_=re.compile('[^\w][Aa][Dd]'))

print('Found {} tags'.format(len(re_soup)))
for idx, result in enumerate(re_soup):
    print(idx, result)

In [None]:
for url in media_urls:
    try:
        s = requests.Session()
        s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36'
        r = s.get(url)
        if not r.ok:
            print('Error fetching {}'.format(url))
            continue
        soup = BeautifulSoup(r.content, 'html')
        ad_soup = soup.find_all('div', id=re.compile('.*'),class_=re.compile('[^\w]ad[s|\w]?'))
        sponsor_soup = soup.find_all(text=re.compile('[sS]ponsor'))


        print(f'#ads: {len(ad_soup) + len(sponsor_soup)} {url}')
    except Exception as e:
        print('Error fetching {url}')