In [1]:
import os
import csv
import numpy as np
import pandas as pd

In [2]:
# CSV exports to load. Only the uncommented entries are read; the study 2 and
# study 3 exports are kept as comments so they can be switched back on.
# files[0] is also used further down to name the saved output file.
# NOTE(review): the stored output of the next cell lists the study_3 export,
# so it was produced with a different selection than the one below.
files = [
    'credco_webconf_study_1_study_1_project_1_2018_02_21t22_43_18_00_00_anon_nolink.csv',
#    'credco_webconf_study_2_study_2_project_1_2018_02_21t22_44_07_00_00_anon_nolink.csv',
#    'credco_webconf_study_3_study_3_project_1_2018_02_21t22_44_40_00_00_anon_nolink.csv'
]

In [3]:
# Build the relative path of every selected export inside the data directory.
data_dir = os.path.join('credibilitycoalition-webconf-2018', 'data')
file_paths = [os.path.join(data_dir, file_name) for file_name in files]
print(file_paths)

['credibilitycoalition-webconf-2018/data/credco_webconf_study_3_study_3_project_1_2018_02_21t22_44_40_00_00_anon_nolink.csv']


In [4]:
# Read every selected CSV: the first row of each file is its header, all
# remaining rows are records. Records from all files are pooled in one list.
data = []    # record rows of all files, in file order
labels = []  # one header row per non-empty file
for file_path in file_paths:
    # newline='' hands newline handling to the csv module, which it needs in
    # order to parse quoted fields that contain line breaks correctly.
    # NOTE(review): decoding relies on the platform default encoding; consider
    # passing encoding='utf-8' once the encoding of the exports is confirmed.
    with open(file_path, newline='') as csvfile:
        csv_reader = csv.reader(csvfile)
        header = next(csv_reader, None)  # None when the file has no rows at all
        if header is not None:
            labels.append(header)
            data.extend(csv_reader)      # everything after the header

In [5]:
# Header rows can differ in length from file to file. The longest one is used
# as the master header, on the assumption that it contains every column of
# the shorter ones. When several headers share the maximum length, the one
# that was read first wins.
label = max(labels, key=len)

In [6]:
def get_data_col(data, idx):
    """Return the values found at position ``idx`` in each row of ``data``.

    Rows that have ``idx`` or fewer fields are skipped rather than padded, so
    the result can be shorter than ``data``; in that case it no longer lines
    up row-for-row with columns extracted at other positions.

    Args:
        data: Iterable of indexable rows (e.g. lists of CSV fields).
        idx: Zero-based position of the wanted field.

    Returns:
        A list holding ``row[idx]`` for every row long enough to have it.
    """
    return [row[idx] for row in data if idx < len(row)]


def get_data_col_from_titles(data, titles, label):
    """Return one extracted column per title, in the order of ``titles``.

    Args:
        data: Iterable of indexable rows (e.g. lists of CSV fields).
        titles: Column titles to extract.
        label: Master header; a title's position in it is the field position
            read from every row.

    Returns:
        A list of lists, one per entry of ``titles``.

    Raises:
        ValueError: If a title does not occur in ``label``.
    """
    return [get_data_col(data, label.index(title)) for title in titles]

# Get Report Data

In [7]:
import re


def _select_column(rows, header, name):
    """Return ``(matching_titles, values)`` for the column called ``name``.

    ``matching_titles`` lists every title of ``header`` that ``name``
    regex-matches at its start. ``values`` is the data of the title that is
    exactly ``name`` when there is one, otherwise of the first match.

    Preferring the exact title matters because 'task_answer_1' also matches
    'task_answer_10' ... 'task_answer_19' (and 'task_answer_2' matches
    'task_answer_20' ... 'task_answer_24'): taking "the first match" only
    gives the intended column while it happens to come first in the header.

    Args:
        rows: Record rows, as accepted by ``get_data_col_from_titles``.
        header: Master header the titles are looked up in.
        name: Column name, used as a regex anchored at the start of a title.

    Raises:
        IndexError: If no title in ``header`` matches ``name``.
    """
    matching_titles = list(filter(re.compile(name).match, header))
    if not matching_titles:
        raise IndexError('no column title starts with {!r}'.format(name))
    chosen = name if name in matching_titles else matching_titles[0]
    return matching_titles, get_data_col_from_titles(rows, [chosen], header)[0]


# Every assignment below yields `<x>_col` (all matching titles) and
# `<x>_labels` (the values of the selected column, one per data row).
# `data` must still be the list of raw CSV rows when this cell runs; a later
# cell rebinds that name.

### Article raw data
# The numbers printed here are the lengths of the extracted value lists,
# i.e. how many data rows carry the column.

report_title_col, report_title_labels = _select_column(data, label, 'report_title')
print('There are {} report_title_label columns'.format(len(report_title_labels)))

media_content_col, media_content_labels = _select_column(data, label, 'media_content')
print('There are {} media_content_label columns'.format(len(media_content_labels)))

media_urls_col, media_urls_labels = _select_column(data, label, 'media_url')
print('There are {} media_urls_label columns'.format(len(media_urls_labels)))

### Credibility/Originality/Fact-Checked

# Very Low/Low/Medium/High/Very High
credibility_col, credibility_labels = _select_column(data, label, 'task_answer_1')
# original
originality_col, originality_labels = _select_column(data, label, 'task_answer_2')
# only if task_answer_2 was not original
originality_attribution_col, originality_attribution_labels = _select_column(data, label, 'task_answer_3')
# task_note_4 contains fact-checked source
fact_checked_col, fact_checked_labels = _select_column(data, label, 'task_answer_4')

### Source 1

# task_note_5 contains source's name
source_1_category_col, source_1_category_labels = _select_column(data, label, 'task_answer_5')
# True or False, notes contain link
source_1_link_provided_col, source_1_link_provided_labels = _select_column(data, label, 'task_answer_6')
# scalar 0 - 100
source_1_impact_factor_if_found_col, source_1_impact_factor_if_found_labels = _select_column(data, label, 'task_answer_7')
# somewhat/strongly agree/disagree/neutral
source_1_characterized_correctly_col, source_1_characterized_correctly_labels = _select_column(data, label, 'task_answer_8')

### Source 2

# source's name is in a task note (the original comment said task_note_5,
# presumably copied from source 1 -- TODO confirm which note applies)
source_2_category_col, source_2_category_labels = _select_column(data, label, 'task_answer_9')
# True or False, notes contain link
source_2_link_provided_col, source_2_link_provided_labels = _select_column(data, label, 'task_answer_10')
# scalar 0 - 100
source_2_impact_factor_if_found_col, source_2_impact_factor_if_found_labels = _select_column(data, label, 'task_answer_11')
# somewhat/strongly agree/disagree/neutral
source_2_characterized_correctly_col, source_2_characterized_correctly_labels = _select_column(data, label, 'task_answer_12')

### Source 3

# source's name is in a task note (the original comment said task_note_5,
# presumably copied from source 1 -- TODO confirm which note applies)
source_3_category_col, source_3_category_labels = _select_column(data, label, 'task_answer_13')
# True or False, notes contain link
source_3_link_provided_col, source_3_link_provided_labels = _select_column(data, label, 'task_answer_14')
# scalar 0 - 100
source_3_impact_factor_if_found_col, source_3_impact_factor_if_found_labels = _select_column(data, label, 'task_answer_15')
# somewhat/strongly agree/disagree/neutral
source_3_characterized_correctly_col, source_3_characterized_correctly_labels = _select_column(data, label, 'task_answer_16')

### Contextual

# scalar
num_ads_col, num_ads_labels = _select_column(data, label, 'task_answer_17')
# scalar
num_content_recommendation_boxes_col, num_content_recommendation_boxes_labels = _select_column(data, label, 'task_answer_18')
# scalar
num_links_sponsored_content_col, num_links_sponsored_content_labels = _select_column(data, label, 'task_answer_19')
# scalar
num_calls_to_socal_shares_col, num_calls_to_socal_shares_labels = _select_column(data, label, 'task_answer_20')
# scalar
num_calls_to_join_mailing_list_col, num_calls_to_join_mailing_list_labels = _select_column(data, label, 'task_answer_21')
# somewhat/strongly agree/disagree/neutral
article_has_spammy_clickbaity_ads_col, article_has_spammy_clickbaity_ads_labels = _select_column(data, label, 'task_answer_22')
# somewhat/strongly agree/disagree/neutral
article_has_aggressive_ads_social_shares_mailing_list_col, article_has_aggressive_ads_social_shares_mailing_list_labels = _select_column(data, label, 'task_answer_23')
# somewhat/strongly agree/disagree/neutral
credibility_post_article_col, credibility_post_article_label = _select_column(data, label, 'task_answer_24')

There are 50 report_title_label columns
There are 50 media_content_label columns
There are 50 media_urls_label columns


In [8]:
# Each entry pairs a row name with the value list extracted for it, so a name
# and its values are declared side by side. The order of the entries is the
# row order of the resulting table.
_indicator_rows = [
    # article raw data
    ('Title', report_title_labels),
    ('Content', media_content_labels),
    ('URL', media_urls_labels),

    # credibility/originality/fact-checked
    ('Credibility Score', credibility_labels),
    ('Originality Score', originality_labels),
    ('Attribution Rank', originality_attribution_labels),
    ('Fact-Checked', fact_checked_labels),

    # source 1
    ('Source 1 Category', source_1_category_labels),
    ('Source 1 Link Provided', source_1_link_provided_labels),
    ('Source 1 Impact Factor', source_1_impact_factor_if_found_labels),
    ('Source 1 Characterized Correctly', source_1_characterized_correctly_labels),

    # source 2
    ('Source 2 Category', source_2_category_labels),
    ('Source 2 Link Provided', source_2_link_provided_labels),
    ('Source 2 Impact Factor', source_2_impact_factor_if_found_labels),
    ('Source 2 Characterized Correctly', source_2_characterized_correctly_labels),

    # source 3
    ('Source 3 Category', source_3_category_labels),
    ('Source 3 Link Provided', source_3_link_provided_labels),
    ('Source 3 Impact Factor', source_3_impact_factor_if_found_labels),
    ('Source 3 Characterized Correctly', source_3_characterized_correctly_labels),

    # contextual
    ('Number of Ads', num_ads_labels),
    ('Number of Content Recommendations', num_content_recommendation_boxes_labels),
    ('Number of Sponsored Content', num_links_sponsored_content_labels),
    ('Number of Social Share Calls', num_calls_to_socal_shares_labels),
    ('Number Calls to Join Mailing List', num_calls_to_join_mailing_list_labels),
    ('Article has Spammy/Clickbaity Ads', article_has_spammy_clickbaity_ads_labels),
    ('Article has Aggressive Ads/Calls', article_has_aggressive_ads_social_shares_mailing_list_labels),
    ('Credibility Post Article', credibility_post_article_label),
]

article_indicator_label = [row_name for row_name, _values in _indicator_rows]

# From here on `data` is the 2-D array of selected values (one row per
# indicator, one column per article), no longer the list of raw CSV rows.
data = np.array([values for _row_name, values in _indicator_rows])

num_articles = np.shape(data)[1]
article_idx = ['Article {}'.format(idx + 1) for idx in range(num_articles)]
print('There are {} articles'.format(num_articles))

There are 50 articles


### Show the Selected Data from the dataset

In [9]:
# Wrap the selected values in a DataFrame: one row per indicator name, one
# column per article ('Article 1', 'Article 2', ...), then show it.
dataset = pd.DataFrame(data, index=article_indicator_label, columns=article_idx)
display(dataset)

Unnamed: 0,Article 1,Article 2,Article 3,Article 4,Article 5,Article 6,Article 7,Article 8,Article 9,Article 10,...,Article 41,Article 42,Article 43,Article 44,Article 45,Article 46,Article 47,Article 48,Article 49,Article 50
Title,Johns Hopkins Researcher Releases Shocking Rep...,"Floods in India, Bangladesh and Nepal kill 1,2...",Coconut oil isn't healthy. It's never been hea...,How Your Finger Shape Determines Your Personal...,"Drinking more coffee leads to a longer life, t...",Monsanto Is Scrambling To Bury This Breaking S...,CONFIRMED: E-CIGARETTES CAUSE A HORRIBLE INCUR...,Resorts in Mexico suspected of drugging tourists,"There are diseases hidden in ice, and they are...",NOW IT’S OFFICIAL: FDA Announced That Vaccines...,...,Study Finds Stevia Kills Lyme Disease Pathogen...,Diet drinks TRIPLE your risk of stroke and dem...,Arctic’s Winter Sea Ice Drops to Its Lowest Re...,,An Iceberg the Size of Delaware Just Broke Awa...,Stevia Kills Lyme Disease Pathogen Better Than...,Which Banana Would You Eat? Your Answer May Ha...,Opinion | How the Anti-Vaxxers Are Winning,"The Amish, Who Don’t Get Vaccinated, Rarely Ge...",Measles making comeback in Texas as parents op...
Content,"In 2015, a whole new slew of flu vaccines foun...","At least 1,200 people have been killed and mil...","“We advise against the use of coconut oil,"" th...",Just look at the picture and then look at your...,A higher consumption of coffee is linked to a ...,I am frankly shocked this information is not m...,CONFIRMED: E-CIGARETTES CAUSE A HORRIBLE INCUR...,The scene at the swim-up bar at the Mexican re...,"Long-dormant bacteria and viruses, trapped in ...",The FDA has published conclusive proof on thei...,...,Lyme disease is a very complicated disease to ...,"The Boston University study of almost 4,400 ad...",Much of the ice also appears to be thinner tha...,,A crack more than 120 miles long had developed...,A recent study published in the European Journ...,"When we eat bananas, we do so because we are h...",A major measles outbreak in America is only a ...,Do not talk about whether vaccines work or not...,A Texas medical researcher is raising concerns...
URL,https://ewao.com/2017/08/16/johns-hopkins-rese...,https://www.independent.co.uk/news/world/asia/...,https://www.usatoday.com/story/news/nation-now...,https://mysticalraven.com/health/5110/how-your...,https://www.cnn.com/2017/07/10/health/coffee-l...,https://foodbabe.com/2016/11/15/monsanto/,http://nowcheckthis.com/2017/03/23/confirmed-e...,http://www.wfaa.com/news/health/resorts-in-mex...,https://www.bbc.com/earth/story/20170504-there...,http://inshapetoday.com/now-official-fda-annou...,...,https://www.collective-evolution.com/2017/01/2...,http://www.dailymail.co.uk/~/article-4429790/i...,https://www.nytimes.com/2017/03/22/climate/arc...,http://www.iflscience.com/environment/heavy-mo...,https://www.nytimes.com/interactive/2017/06/09...,https://www.healthspiritbody.com/lyme-disease-...,http://goodfullness.net/which-banana-would-you...,https://www.nytimes.com/2017/02/08/opinion/how...,https://worldtruth.tv/the-amish-who-dont-get-v...,http://www.wate.com/news/national-world/measle...
Credibility Score,Somewhat low credibility,Very high credibility,Very high credibility,Very low credibility,Very high credibility,Very low credibility,Somewhat low credibility,Very high credibility,Somewhat high credibility,Somewhat low credibility,...,Medium credibility,Medium credibility,Very high credibility,Very low credibility,Very high credibility,Very low credibility,Somewhat low credibility,Very high credibility,Very low credibility,Medium credibility
Originality Score,A) Most likely original,B) Appears to be a copy of one or more article...,A) Most likely original,D) A wholesale duplicate of another article,,D) A wholesale duplicate of another article,A) Most likely original,D) A wholesale duplicate of another article,A) Most likely original,"C) Extensive quoting from another source, with...",...,B) Appears to be a copy of one or more article...,B) Appears to be a copy of one or more article...,A) Most likely original,B) Appears to be a copy of one or more article...,A) Most likely original,"C) Extensive quoting from another source, with...",B) Appears to be a copy of one or more article...,A) Most likely original,D) A wholesale duplicate of another article,D) A wholesale duplicate of another article
Attribution Rank,,C) Attribution was given and was accurate,,A) Attribution was not given,,A) Attribution was not given,A) Attribution was not given,C) Attribution was given and was accurate,,A) Attribution was not given,...,D) Unclear which is the original,A) Attribution was not given,,C) Attribution was given and was accurate,,A) Attribution was not given,D) Unclear which is the original,,A) Attribution was not given,C) Attribution was given and was accurate
Fact-Checked,D) Fact-checked and determined true,D) Fact-checked and determined true,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,D) Fact-checked and determined true,...,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,B) Most likely not fact-checked by an approved...,A) No central claim,C) Fact-checked and determined false,B) Most likely not fact-checked by an approved...
Source 1 Category,A scientific study,An organization,An organization,Does not cite another source,A scientific study,An organization,An organization,An organization,An organization,An organization,...,An organization,A scientific study,A person,An organization,A scientific study,An organization,,A person,An organization,An organization
Source 1 Link Provided,true,true,true,false,true,false,false,true,true,false,...,true,false,false,true,false,false,,true,true,true
Source 1 Impact Factor,yournewswire.com,http://www.straitstimes.com,http://circ.ahajournals.org/,No additional sources,http://annals.org/,http://www.simplypureinc.com/,Harvard School of Public Health,https://www.jsonline.com/,BBC,https://dailymed.nlm.nih.gov,...,could not locate,could not locate,Could not locate,could not locate,could not locate,could not locate,,could not locate,Could not locate,could not locate


# Map Selected Data to Values

In [10]:
# Lookup tables from survey answer text to a numeric code. Each answer's code
# is its position (counting from 0) in the ordered tuple it is listed in; the
# empty string stands for an unanswered question.
credibility_label_to_value = {
    answer: code for code, answer in enumerate((
        'Very low credibility',
        'Somewhat low credibility',
        'Medium credibility',
        'Somewhat high credibility',
        'Very high credibility',
        # post article credibility
        'No change',
        '',
    ))
}
originality_label_to_value = {
    answer: code for code, answer in enumerate((
        'A) Most likely original',
        'B) Appears to be a copy of one or more articles, with some portions different or remixed',
        'C) Extensive quoting from another source, with some original content',
        'D) A wholesale duplicate of another article',
        '',
    ))
}
attribution_rank_to_value = {
    answer: code for code, answer in enumerate((
        '',
        'A) Attribution was not given',
        'B) Attribution was given but was inaccurate',
        'C) Attribution was given and was accurate',
        'D) Unclear which is the original',
    ))
}
fact_checked_label_to_value = {
    answer: code for code, answer in enumerate((
        'A) No central claim',
        'B) Most likely not fact-checked by an approved source',
        'C) Fact-checked and determined false',
        'D) Fact-checked and determined true',
        'E) Fact-checked with unclear results',
        'F) Fact-checked with mixed results',
        '',
    ))
}
agreement_label_to_value = {
    answer: code for code, answer in enumerate((
        'Strongly disagree',
        'Somewhat disagree',
        'Neutral',
        'Somewhat agree',
        'Strongly agree',
        'Unable to find source',
        'Source is behind a paywall',
        '',
    ))
}
source_category_to_value = {
    answer: code for code, answer in enumerate(('', 'A person', 'A scientific study', 'An organization'))
}
source_link_provided_to_value = {
    answer: code for code, answer in enumerate(('', 'false', 'true'))
}

## Remap the data in-place to alter the original data

In [11]:
# Replace the text answers in `dataset` with their numeric codes, row by row.
# The replacement is done in place, so this cell is meant to run once on a
# freshly built `dataset`: after it has run, the rows hold numbers, which are
# not keys of the lookup tables.
# The 'Source N Impact Factor' rows are not listed and stay as they are.
_row_encodings = [
    # (row name, lookup table, code for unlisted answers or None if every
    #  answer must be present in the table)
    ('Credibility Score', credibility_label_to_value, None),
    ('Originality Score', originality_label_to_value, None),
    ('Attribution Rank', attribution_rank_to_value, None),
    ('Fact-Checked', fact_checked_label_to_value, None),

    ('Source 1 Category', source_category_to_value, 0),
    ('Source 1 Link Provided', source_link_provided_to_value, None),
    ('Source 1 Characterized Correctly', agreement_label_to_value, None),

    ('Source 2 Category', source_category_to_value, 0),
    ('Source 2 Link Provided', source_link_provided_to_value, None),
    ('Source 2 Characterized Correctly', agreement_label_to_value, None),

    ('Source 3 Category', source_category_to_value, 0),
    ('Source 3 Link Provided', source_link_provided_to_value, None),
    ('Source 3 Characterized Correctly', agreement_label_to_value, None),

    ('Article has Spammy/Clickbaity Ads', agreement_label_to_value, None),
    ('Article has Aggressive Ads/Calls', agreement_label_to_value, None),

    ('Credibility Post Article', credibility_label_to_value, None),
]

for _row_name, _lookup, _fallback in _row_encodings:
    _answers = list(dataset.loc[_row_name])
    if _fallback is None:
        try:
            _codes = [_lookup[answer] for answer in _answers]
        except KeyError as err:
            # Same exception type as a bare lookup, but say where it happened.
            raise KeyError(
                'row {!r} contains an answer without a code: {!r}'.format(_row_name, err.args[0])
            ) from err
    else:
        # Category rows: answers missing from the table share the fallback code.
        _codes = [_lookup.get(answer, _fallback) for answer in _answers]
    dataset.loc[_row_name] = _codes

### Scrape each article page for ad and sponsored-content counts

In [12]:
import requests
from bs4 import BeautifulSoup

# Seconds to wait on a server before giving up on a URL. Without a timeout a
# single unresponsive host can stall the loop below indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

media_urls = dataset.loc['URL'].values

# setup browser: one shared session that presents a desktop browser User-Agent
s = requests.Session()
s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36'

# Patterns are compiled once, outside the loop. The ad pattern is a raw string
# so that `\w` reaches the regex engine as written instead of relying on
# Python passing an unknown string escape through.
# NOTE(review): the ad pattern requires a non-word character before "ad";
# check whether class values that start with "ad" should be counted as well.
any_id_pattern = re.compile('.*')
ad_class_pattern = re.compile(r'[^\w]ad[s|\w]?')
sponsor_text_pattern = re.compile('[sS]ponsor')

num_ads_dataset = []  # rows of [article column, url, ad divs + sponsored hits, sponsored hits]
invalid_url = []      # URLs that could not be fetched or processed

for url, article_col in zip(media_urls, dataset.columns):
    try:
        response = s.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        if not response.ok:
            print(f'Error fetching {url}: {response}')
            invalid_url.append(url)
            continue
        # Name the parser explicitly so the parse tree does not depend on
        # which optional parser libraries happen to be installed.
        soup = BeautifulSoup(response.content, 'html.parser')

        # find number of ads: <div> elements that carry an id attribute and
        # whose class matches the ad pattern
        ad_soup = soup.find_all('div', id=any_id_pattern, class_=ad_class_pattern)

        # check if there is sponsored content: strings mentioning "sponsor"
        sponsor_soup = soup.find_all(string=sponsor_text_pattern)

        num_ads_dataset.append([article_col, url, len(ad_soup) + len(sponsor_soup), len(sponsor_soup)])
    except Exception as e:
        # Best effort: any failure for one URL is reported and recorded, and
        # the loop moves on to the next article.
        print(f'Error fetching {url}: {e}')
        invalid_url.append(url)

Error fetching https://ewao.com/2017/08/16/johns-hopkins-researcher-releases-shocking-report-on-flu-vaccines/: <Response [523]>
Error fetching http://www.wfaa.com/news/health/resorts-in-mexico-suspected-of-drugging-tourists/458170922: <Response [404]>
Error fetching http://publichealthabc.com/2-cups-day-1-week-stomach-will-flatter/: <Response [404]>
Error fetching https://www.ntd.tv/inspiring/life/9-sleeping-positions-improve-health.html: <Response [404]>
Error fetching https://ewao.com/2017/09/16/six-pharmaceutical-medicines-that-instantly-make-your-health-worse/: <Response [523]>
Error fetching https://info.cmsri.org/the-driven-researcher-blog/vaccinated-vs.-unvaccinated-guess-who-is-sicker: <Response [404]>
Error fetching http://www.nationspressph.com/2017/02/pls-share-do-not-eat-this-fish-it-is.html: <Response [404]>
Error fetching https://www.ntd.tv/inspiring/parenting/sam-berns-boy-aging-disease-progeria-dies-17.html: <Response [404]>
Error fetching https://awarenessact.com/stop-

In [13]:
# Report how many URLs were recorded as failed by the scraping loop.
print(f'There are {len(invalid_url)} invalid urls when fetching ads')

# display ad counts from parser: one row per article that was scraped.
# 'Total Number of Ads' is the ad-div count plus the sponsored-text count;
# 'Sponsored Content' is the sponsored-text count on its own.
n_ads_df = pd.DataFrame(num_ads_dataset, columns=['Article', 'media_url', 'Total Number of Ads', 'Sponsored Content'])
display(n_ads_df)

There are 12 invalid urls when fetching ads


Unnamed: 0,Article,media_url,Total Number of Ads,Sponsored Content
0,Article 2,https://www.independent.co.uk/news/world/asia/...,3,1
1,Article 3,https://www.usatoday.com/story/news/nation-now...,2,2
2,Article 4,https://mysticalraven.com/health/5110/how-your...,0,0
3,Article 5,https://www.cnn.com/2017/07/10/health/coffee-l...,9,2
4,Article 6,https://foodbabe.com/2016/11/15/monsanto/,1,1
5,Article 7,http://nowcheckthis.com/2017/03/23/confirmed-e...,0,0
6,Article 9,https://www.bbc.com/earth/story/20170504-there...,5,5
7,Article 10,http://inshapetoday.com/now-official-fda-annou...,0,0
8,Article 11,https://en.newsner.com/mom-s-warning-is-spread...,1,1
9,Article 12,https://www.littlethings.com/hand-food-and-mou...,4,2


In [14]:
# Overwrite the ad and sponsored-content rows of `dataset` with the scraped
# counts. Only articles that have a row in n_ads_df are touched; every other
# article column keeps the value it already had.
for column_name, _url, n_ads, n_sponsored in n_ads_df.values:
    dataset.loc['Number of Ads', column_name] = n_ads
    dataset.loc['Number of Sponsored Content', column_name] = n_sponsored

## Display new altered dataset

In [15]:
# Show the table after remapping and ad-count injection.
display(dataset)

# save dataset next to the notebook, named after the first selected export.
# NOTE(review): files[0] already ends in '.csv', so the written file name ends
# in '.csv.csv'; the name also ignores any further entries of `files`.
dataset.to_csv(f'dataset_{files[0]}.csv')

Unnamed: 0,Article 1,Article 2,Article 3,Article 4,Article 5,Article 6,Article 7,Article 8,Article 9,Article 10,...,Article 41,Article 42,Article 43,Article 44,Article 45,Article 46,Article 47,Article 48,Article 49,Article 50
Title,Johns Hopkins Researcher Releases Shocking Rep...,"Floods in India, Bangladesh and Nepal kill 1,2...",Coconut oil isn't healthy. It's never been hea...,How Your Finger Shape Determines Your Personal...,"Drinking more coffee leads to a longer life, t...",Monsanto Is Scrambling To Bury This Breaking S...,CONFIRMED: E-CIGARETTES CAUSE A HORRIBLE INCUR...,Resorts in Mexico suspected of drugging tourists,"There are diseases hidden in ice, and they are...",NOW IT’S OFFICIAL: FDA Announced That Vaccines...,...,Study Finds Stevia Kills Lyme Disease Pathogen...,Diet drinks TRIPLE your risk of stroke and dem...,Arctic’s Winter Sea Ice Drops to Its Lowest Re...,,An Iceberg the Size of Delaware Just Broke Awa...,Stevia Kills Lyme Disease Pathogen Better Than...,Which Banana Would You Eat? Your Answer May Ha...,Opinion | How the Anti-Vaxxers Are Winning,"The Amish, Who Don’t Get Vaccinated, Rarely Ge...",Measles making comeback in Texas as parents op...
Content,"In 2015, a whole new slew of flu vaccines foun...","At least 1,200 people have been killed and mil...","“We advise against the use of coconut oil,"" th...",Just look at the picture and then look at your...,A higher consumption of coffee is linked to a ...,I am frankly shocked this information is not m...,CONFIRMED: E-CIGARETTES CAUSE A HORRIBLE INCUR...,The scene at the swim-up bar at the Mexican re...,"Long-dormant bacteria and viruses, trapped in ...",The FDA has published conclusive proof on thei...,...,Lyme disease is a very complicated disease to ...,"The Boston University study of almost 4,400 ad...",Much of the ice also appears to be thinner tha...,,A crack more than 120 miles long had developed...,A recent study published in the European Journ...,"When we eat bananas, we do so because we are h...",A major measles outbreak in America is only a ...,Do not talk about whether vaccines work or not...,A Texas medical researcher is raising concerns...
URL,https://ewao.com/2017/08/16/johns-hopkins-rese...,https://www.independent.co.uk/news/world/asia/...,https://www.usatoday.com/story/news/nation-now...,https://mysticalraven.com/health/5110/how-your...,https://www.cnn.com/2017/07/10/health/coffee-l...,https://foodbabe.com/2016/11/15/monsanto/,http://nowcheckthis.com/2017/03/23/confirmed-e...,http://www.wfaa.com/news/health/resorts-in-mex...,https://www.bbc.com/earth/story/20170504-there...,http://inshapetoday.com/now-official-fda-annou...,...,https://www.collective-evolution.com/2017/01/2...,http://www.dailymail.co.uk/~/article-4429790/i...,https://www.nytimes.com/2017/03/22/climate/arc...,http://www.iflscience.com/environment/heavy-mo...,https://www.nytimes.com/interactive/2017/06/09...,https://www.healthspiritbody.com/lyme-disease-...,http://goodfullness.net/which-banana-would-you...,https://www.nytimes.com/2017/02/08/opinion/how...,https://worldtruth.tv/the-amish-who-dont-get-v...,http://www.wate.com/news/national-world/measle...
Credibility Score,1,4,4,0,4,0,1,4,3,1,...,2,2,4,0,4,0,1,4,0,2
Originality Score,0,1,0,3,4,3,0,3,0,2,...,1,1,0,1,0,2,1,0,3,3
Attribution Rank,0,3,0,1,0,1,1,3,0,1,...,4,1,0,3,0,1,4,0,1,3
Fact-Checked,3,3,1,1,6,1,1,1,1,3,...,1,1,6,1,1,1,1,0,2,1
Source 1 Category,2,3,3,0,2,3,3,3,3,3,...,3,2,1,3,2,3,0,1,3,3
Source 1 Link Provided,2,2,2,1,2,1,1,2,2,1,...,2,1,1,2,1,1,0,2,2,2
Source 1 Impact Factor,yournewswire.com,http://www.straitstimes.com,http://circ.ahajournals.org/,No additional sources,http://annals.org/,http://www.simplypureinc.com/,Harvard School of Public Health,https://www.jsonline.com/,BBC,https://dailymed.nlm.nih.gov,...,could not locate,could not locate,Could not locate,could not locate,could not locate,could not locate,,could not locate,Could not locate,could not locate


# The cells below are exploratory checks and are not part of the pipeline; they can be ignored

---

## Run sets on some of the answers to see all unique values provided

In [16]:
# Distinct codes present in the 'Attribution Rank' row after remapping.
set(dataset.loc['Attribution Rank'])

{0, 1, 3, 4}

In [17]:
# Distinct codes present in the 'Source 1 Characterized Correctly' row.
set(dataset.loc['Source 1 Characterized Correctly'])

{0, 1, 2, 3, 4, 5, 7}

In [18]:
# Distinct raw answers in the 'Source 1 Impact Factor' row; this row is not
# remapped to codes, so the values are still text.
set(dataset.loc['Source 1 Impact Factor'])

{'',
 '18.164',
 '19.309',
 '2.371',
 'BBC',
 'Cannot locate',
 'Could not locate',
 'Harvard School of Public Health',
 'NY Daily News',
 'No additional sources',
 'cdc.gov',
 'could not locate',
 'http://annals.org/',
 'http://circ.ahajournals.org/',
 'http://www.apa.org/',
 'http://www.independent.co.uk/',
 'http://www.simplypureinc.com/',
 'http://www.straitstimes.com',
 'https://dailymed.nlm.nih.gov',
 'https://en.newsner.com/',
 'https://info.cmsri.org/',
 'https://www.alz.org/',
 'https://www.amazon.com/Food-Facts-Myths-Healthy-Diets/dp/1542404436',
 'https://www.cdc.gov/',
 'https://www.frontiersin.org/',
 'https://www.hcfcd.org',
 'https://www.jsonline.com/',
 'https://www.reuters.com/',
 'https://www.simplemost.com',
 'https://www.wakehealth.edu/',
 'thefartsfact.com',
 'yournewswire.com'}

In [19]:
# Distinct codes present in the 'Source 1 Category' row.
set(dataset.loc['Source 1 Category'])

{0, 1, 2, 3}

In [20]:
# Distinct codes present in the 'Source 1 Link Provided' row.
set(dataset.loc['Source 1 Link Provided'])

{0, 1, 2}

In [21]:
# Distinct codes present in the 'Fact-Checked' row.
set(dataset.loc['Fact-Checked'])

{0, 1, 2, 3, 6}

In [22]:
# Distinct codes present in the 'Originality Score' row.
set(dataset.loc['Originality Score'])

{0, 1, 2, 3, 4}

In [23]:
# Distinct codes present in the 'Article has Spammy/Clickbaity Ads' row.
set(dataset.loc['Article has Spammy/Clickbaity Ads'])

{0, 1, 2, 3, 4, 7}

In [24]:
# Distinct codes present in the 'Credibility Score' row.
set(dataset.loc['Credibility Score'])

{0, 1, 2, 3, 4, 6}

In [25]:
# Show every indicator value of the column labelled 'Article 1'.
dataset['Article 1']

Title                                Johns Hopkins Researcher Releases Shocking Rep...
Content                              In 2015, a whole new slew of flu vaccines foun...
URL                                  https://ewao.com/2017/08/16/johns-hopkins-rese...
Credibility Score                                                                    1
Originality Score                                                                    0
Attribution Rank                                                                     0
Fact-Checked                                                                         3
Source 1 Category                                                                    2
Source 1 Link Provided                                                               2
Source 1 Impact Factor                                                yournewswire.com
Source 1 Characterized Correctly                                                     4
Source 2 Category                          