In [1]:
import os
import csv
import numpy as np
import pandas as pd

In [2]:
# The three CSV exports share one naming pattern; only the study number and
# the time-of-day part of the export timestamp differ between them.
files = [
    f'dataset_credco_webconf_study_{study}_study_{study}_project_1_2018_02_21t{stamp}_00_00_anon_nolink.csv'
    for study, stamp in ((1, '22_43_18'), (2, '22_44_07'), (3, '22_44_40'))
]

In [3]:
# Read every CSV into its own DataFrame. The first CSV column becomes the row
# index, so rows can later be addressed by label (e.g. .loc['URL']).
datasets = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in files
]

The articles appear in a different order in each annotator's dataset, so we need to sort them into a common order.

Here we sort them by article URL, since the URLs are consistent across all annotators.

In [4]:
# Flatten the 'URL' row of every dataset into a single list
# (one entry per article column, datasets concatenated in load order)
dataset_urls = [url for frame in datasets for url in frame.loc['URL']]

In [5]:
# Sanity check: compare the total URL count against the number of distinct URLs
# to confirm how many different articles the datasets cover
print('Total number of URLs', len(dataset_urls))
print('Number of unique URLs', len(set(dataset_urls)))

Total number of URLs 150
Number of unique URLs 50


Now that we've confirmed that there are only 50 unique articles across all three datasets, let's sort each dataset's articles by URL.

In [6]:
# Reorder the article columns of each dataset by the values in its 'URL' row,
# so that column position i refers to the same article in every dataset.
sorted_datasets = [
    frame.sort_values(by='URL', axis='columns')
    for frame in datasets
]

In [7]:
# Show the first dataset with its article columns reordered by URL
display(sorted_datasets[0])

Unnamed: 0,Article 43,Article 13,Article 14,Article 30,Article 23,Article 42,Article 49,Article 36,Article 45,Article 7,...,Article 44,Article 46,Article 35,Article 38,Article 37,Article 28,Article 29,Article 39,Article 41,Article 12
Title,Which Banana Would You Eat? Your Answer May Ha...,NOW IT’S OFFICIAL: FDA Announced That Vaccines...,CONFIRMED: E-CIGARETTES CAUSE A HORRIBLE INCUR...,WITH ONLY 2 CUPS A DAY FOR 1 WEEK YOUR STOMACH...,"Lead Developer of HPV Vaccines Comes Clean, Wa...",DELINGPOLE: Global Warming Study Cancelled Bec...,Diet drinks TRIPLE your risk of stroke and dem...,,,Nations Press: PLS SHARE: DO NOT EAT THIS FISH...,...,,Opinion | How the Anti-Vaxxers Are Winning,Arctic’s Winter Sea Ice Drops to Its Lowest Re...,The Best Exercise for Aging Muscles,An Iceberg the Size of Delaware Just Broke Awa...,Putting Kids To Bed Early Improves Mom's Healt...,If Everyone Ate Beans Instead of Beef,Arctic stronghold of world’s seeds flooded aft...,"The oldest child is actually the smartest, stu...",Coconut oil isn't healthy. It's never been hea...
Content,"When we eat bananas, we do so because we are h...",The FDA has published conclusive proof on thei...,CONFIRMED: E-CIGARETTES CAUSE A HORRIBLE INCUR...,The desire of lots of people is to have a flat...,"Lead Developer of HPV Vaccines Comes Clean, Wa...",A global warming research study in Canada has ...,"The Boston University study of almost 4,400 ad...",,,,...,,A major measles outbreak in America is only a ...,Much of the ice also appears to be thinner tha...,Certain kinds of exercise may mitigate the eff...,A crack more than 120 miles long had developed...,A new study confirms why all those bedtime bat...,"With one dietary change, the U.S. could hypoth...",No seeds were lost but the ability of the rock...,"Birth order doesn't affect personality, but it...","“We advise against the use of coconut oil,"" th..."
URL,http://goodfullness.net/which-banana-would-you...,http://inshapetoday.com/now-official-fda-annou...,http://nowcheckthis.com/2017/03/23/confirmed-e...,http://publichealthabc.com/2-cups-day-1-week-s...,http://www.alternativenewsnetwork.net/lead-dev...,http://www.breitbart.com/big-government/2017/0...,http://www.dailymail.co.uk/~/article-4429790/i...,http://www.iflscience.com/environment/heavy-mo...,http://www.iflscience.com/health-and-medicine/...,http://www.nationspressph.com/2017/02/pls-shar...,...,https://www.ntd.tv/inspiring/parenting/sam-ber...,https://www.nytimes.com/2017/02/08/opinion/how...,https://www.nytimes.com/2017/03/22/climate/arc...,https://www.nytimes.com/2017/03/23/well/move/t...,https://www.nytimes.com/interactive/2017/06/09...,https://www.simplemost.com/new-study-says-putt...,https://www.theatlantic.com/health/archive/201...,https://www.theguardian.com/environment/2017/m...,https://www.today.com/health/birth-order-first...,https://www.usatoday.com/story/news/nation-now...
Credibility Score,1,0,1,0,0,0,1,0,1,0,...,2,4,4,3,4,2,4,4,3,3
Originality Score,0,3,0,3,3,0,0,2,0,3,...,3,0,0,0,0,0,0,0,0,0
Attribution Rank,0,2,0,4,3,0,0,4,0,2,...,4,0,0,0,0,0,0,0,0,0
Fact-Checked,0,2,5,1,2,1,1,3,4,1,...,1,0,3,1,3,1,1,3,1,4
Source 1 Category,0,3,2,0,1,3,2,0,2,0,...,3,3,3,2,3,2,2,3,2,2
Source 1 Link Provided,0,1,1,0,1,2,1,0,2,0,...,2,1,1,2,1,1,2,2,2,2
Source 1 Impact Factor,,could not locate,8.44,,could not locate,could not locate,Could not locate.,,19.309,,...,could not locate,Could not locate.,Could not locate,17.303,Could not locate.,could not locate,5.454,could not locate,2.371,19.309


In [8]:
# Seed the merged dataset with the rows from 'Title' through 'URL' (label slice,
# both ends inclusive) of the first sorted dataset; .copy() keeps later row
# additions from touching the source frame.
merged_dataset = (
    sorted_datasets[0]
    .loc['Title':'URL', :]
    .copy()
)

In [9]:
# Row labels whose per-dataset values are combined into a single agreed value.
indicies_to_add = ['Credibility Score',
                   'Originality Score',
                   'Attribution Rank',
                   'Source 1 Category',
                   'Source 2 Category',
                   'Source 3 Category',
                   'Article has Spammy/Clickbaity Ads',
                   'Article has Aggressive Ads/Calls',

                   # specific numbers vary across annotators and building a scraper
                   # for each to cover all the supplied articles is non-trivial,
                   # so for now we use the annotators' labels
                   'Number of Content Recommendations',
                   'Number of Social Share Calls',
                   'Number Calls to Join Mailing List',
                  ]

# For each label: take that row from every sorted dataset, average the values
# position-by-position, round to the nearest integer, and store the result as
# a row of merged_dataset. The columns of every dataset were sorted by URL, so
# position i is the same article in each of them.
for index in indicies_to_add:
    dataset_scores = []
    for dataset in sorted_datasets:
        # The last three labels sometimes contain NaN; treat a missing count
        # as zero. Re-check this assumption before adding further labels.
        score = dataset.loc[index].fillna(0).values

        # convert each value (possibly text) to int and collect one array per dataset
        dataset_scores.append(np.array(list(map(int, score))))

    # element-wise mean across datasets -> one average per article
    average_score = np.mean(dataset_scores, axis=0)

    # Round to get the agreed annotator score. np.round replaces the alias
    # np.round_, which was removed in NumPy 2.0; results are identical
    # (exact halves round to the nearest even integer in both).
    rounded_average_score = np.round(average_score)

    # add (or overwrite) the row in the merged dataset
    merged_dataset.loc[index] = rounded_average_score

In [10]:
# These rows hold scraped values rather than annotator labels, so there is
# nothing to reconcile: take them unchanged from the first sorted dataset.
for scraped_row in ('Number of Ads', 'Number of Sponsored Content'):
    merged_dataset.loc[scraped_row] = sorted_datasets[0].loc[scraped_row].copy()

## Display and save final dataset

In [11]:
# Render the merged table in the notebook, then write it to disk as CSV
display(merged_dataset)
merged_dataset.to_csv(path_or_buf='final_dataset.csv')

Unnamed: 0,Article 43,Article 13,Article 14,Article 30,Article 23,Article 42,Article 49,Article 36,Article 45,Article 7,...,Article 44,Article 46,Article 35,Article 38,Article 37,Article 28,Article 29,Article 39,Article 41,Article 12
Title,Which Banana Would You Eat? Your Answer May Ha...,NOW IT’S OFFICIAL: FDA Announced That Vaccines...,CONFIRMED: E-CIGARETTES CAUSE A HORRIBLE INCUR...,WITH ONLY 2 CUPS A DAY FOR 1 WEEK YOUR STOMACH...,"Lead Developer of HPV Vaccines Comes Clean, Wa...",DELINGPOLE: Global Warming Study Cancelled Bec...,Diet drinks TRIPLE your risk of stroke and dem...,,,Nations Press: PLS SHARE: DO NOT EAT THIS FISH...,...,,Opinion | How the Anti-Vaxxers Are Winning,Arctic’s Winter Sea Ice Drops to Its Lowest Re...,The Best Exercise for Aging Muscles,An Iceberg the Size of Delaware Just Broke Awa...,Putting Kids To Bed Early Improves Mom's Healt...,If Everyone Ate Beans Instead of Beef,Arctic stronghold of world’s seeds flooded aft...,"The oldest child is actually the smartest, stu...",Coconut oil isn't healthy. It's never been hea...
Content,"When we eat bananas, we do so because we are h...",The FDA has published conclusive proof on thei...,CONFIRMED: E-CIGARETTES CAUSE A HORRIBLE INCUR...,The desire of lots of people is to have a flat...,"Lead Developer of HPV Vaccines Comes Clean, Wa...",A global warming research study in Canada has ...,"The Boston University study of almost 4,400 ad...",,,,...,,A major measles outbreak in America is only a ...,Much of the ice also appears to be thinner tha...,Certain kinds of exercise may mitigate the eff...,A crack more than 120 miles long had developed...,A new study confirms why all those bedtime bat...,"With one dietary change, the U.S. could hypoth...",No seeds were lost but the ability of the rock...,"Birth order doesn't affect personality, but it...","“We advise against the use of coconut oil,"" th..."
URL,http://goodfullness.net/which-banana-would-you...,http://inshapetoday.com/now-official-fda-annou...,http://nowcheckthis.com/2017/03/23/confirmed-e...,http://publichealthabc.com/2-cups-day-1-week-s...,http://www.alternativenewsnetwork.net/lead-dev...,http://www.breitbart.com/big-government/2017/0...,http://www.dailymail.co.uk/~/article-4429790/i...,http://www.iflscience.com/environment/heavy-mo...,http://www.iflscience.com/health-and-medicine/...,http://www.nationspressph.com/2017/02/pls-shar...,...,https://www.ntd.tv/inspiring/parenting/sam-ber...,https://www.nytimes.com/2017/02/08/opinion/how...,https://www.nytimes.com/2017/03/22/climate/arc...,https://www.nytimes.com/2017/03/23/well/move/t...,https://www.nytimes.com/interactive/2017/06/09...,https://www.simplemost.com/new-study-says-putt...,https://www.theatlantic.com/health/archive/201...,https://www.theguardian.com/environment/2017/m...,https://www.today.com/health/birth-order-first...,https://www.usatoday.com/story/news/nation-now...
Credibility Score,1,0,1,0,0,1,1,0,1,0,...,2,4,4,3,4,2,4,4,3,3
Originality Score,0,2,0,1,3,1,0,2,0,2,...,2,0,0,0,0,0,0,0,0,0
Attribution Rank,1,2,0,1,3,1,0,2,0,2,...,3,0,0,0,0,0,0,0,0,0
Source 1 Category,0,2,2,0,1,3,2,2,2,0,...,3,2,2,2,3,2,3,3,2,2
Source 2 Category,0,1,0,0,1,3,3,2,3,0,...,3,3,2,1,2,2,2,2,2,2
Source 3 Category,0,1,1,0,1,3,1,2,3,0,...,3,1,0,0,1,2,2,2,1,3
Article has Spammy/Clickbaity Ads,2,3,1,3,2,4,4,3,4,4,...,3,1,0,0,0,4,1,1,3,2
