In [1]:
## Author: Mirjam Nanko
## Date Created: 2021-11-29
## Email: m.nanko@exeter.ac.uk

# babyCARDS data preparation<br>
#### This script loads and combines the **convinced** (0) and **contrarian** (1) data sourced from various **blogs, Twitter, Facebook and newspapers**, and splits it into **training, validation and testing** data sets.
#### To get some measure of external validity, the testing data set is a "pure" held-out data set that only contains text from "unseen" sources, i.e. bloggers, Twitter accounts, Facebook accounts and newspaper articles that the classifier was not trained on.

# Packages & functions

In [3]:
import os
import random
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk

In [4]:
def train_valid_test_split(data, account_variable,
                           random_test_sample_size = .15,
                           valid_sample_size = .25,
                           random_seed = 1, random_state = 1,
                           shuffle = True):
    """Split ``data`` into training, validation and held-out testing data sets.

    The testing set is built from whole accounts: for each label (1 and 0) a random
    share of the unique values of ``account_variable`` is drawn, and every row that
    belongs to a drawn account goes to the testing set. The remaining rows are divided
    into training and validation data with ``train_test_split``.

    Parameters
    ----------
    data : DataFrame with a ``label`` column and the ``account_variable`` column.
    account_variable : name of the column that identifies the account/source.
    random_test_sample_size : share of accounts per label drawn for the testing set.
    valid_sample_size : ``test_size`` passed to ``train_test_split``.
    random_seed : seed for the ``random`` module (used for the account draw).
    random_state, shuffle : forwarded to ``train_test_split``.

    Returns
    -------
    (train, valid, test) data frames; their sizes are printed as a side effect.
    """

    # Seed the module-level generator so the account draw is repeatable
    random.seed(random_seed)

    def _draw_accounts(label_value):
        # Unique accounts on one side, then a random subset of the requested share
        accounts = data[data["label"] == label_value][account_variable].unique()
        n_held_out = int(round(random_test_sample_size * len(accounts), 0))
        return random.sample(list(accounts), n_held_out)

    # Contrarian accounts are drawn first, convinced accounts second
    held_out_accounts = _draw_accounts(1) + _draw_accounts(0)

    # One boolean mask decides which rows belong to held-out accounts
    is_held_out = data[account_variable].isin(held_out_accounts)
    trainvalid = pd.concat([data[~is_held_out]])
    test = pd.concat([data[is_held_out]])

    # Everything that is not held out is divided into training and validation data
    train, valid = train_test_split(
        trainvalid,
        test_size=valid_sample_size,
        random_state=random_state,
        shuffle=shuffle,
    )

    print("Training data set with {} posts created.".format(len(train)))
    print("Validation data set with {} posts created.".format(len(valid)))
    print("Testing data set with {} posts created.".format(len(test)))

    return train, valid, test

def load_tweets(folderpath, label, verbose = True):
    """Load twitter JSON-lines files from a folder into a labelled pandas dataframe.

    Every file in ``folderpath`` is read line by line and each non-blank line is parsed
    as one JSON object. The keys 'tweet', 'username', 'name', 'date' and 'time' are
    read from every object (a missing key raises ``KeyError``).

    Files are processed in sorted name order: ``os.listdir`` returns entries in
    arbitrary order, which would make the row order - and therefore any seeded split
    performed on the result - differ between machines.

    Parameters
    ----------
    folderpath : directory that contains the JSON-lines files.
    label : value written to the 'label' column of every row.
    verbose : if true, print the index and name of each file while loading.

    Returns
    -------
    DataFrame with the columns 'text', 'label', 'username', 'name', 'date', 'time'.
    """
    rows = []
    for i, file in enumerate(sorted(os.listdir(folderpath))):
        if verbose:
            print(i, file)
        # Context manager closes the handle; JSON text is read as UTF-8 explicitly
        # instead of relying on the platform's default encoding.
        with open(os.path.join(folderpath, file), 'r', encoding = 'utf-8') as infile:
            for line in infile:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at end of file)
                    continue
                record = json.loads(line)  # parse each line once, not once per field
                rows.append((record['tweet'], label, record['username'],
                             record['name'], record['date'], record['time']))
    df = pd.DataFrame(rows,
                      columns = ['text', 'label', 'username', 'name', 'date', 'time'])
    if verbose:
        print("\n")
    return df

def load_facebook(folderpath, label, verbose = True):
    """Load facebook JSON files from a folder into a labelled pandas dataframe.

    Each file must contain a JSON object with a 'posts' list. For every post the
    message, date, postUrl and link are read from the post itself and the handle, id,
    name, pageAdminTopCountry and pageCategory from its 'account' object; absent
    fields become missing values. Posts without a message are dropped (the remaining
    rows keep their original index values).

    Files are processed in sorted name order because ``os.listdir`` returns entries
    in arbitrary order, which would otherwise make the row order machine-dependent.

    Parameters
    ----------
    folderpath : directory that contains the JSON files.
    label : value written to the 'label' column of every row.
    verbose : if true, print the index and name of each file while loading.

    Returns
    -------
    DataFrame with the columns 'text', 'label', 'handle', 'ID', 'name',
    'pageAdminTopCountry', 'pageCategory', 'date', 'postUrl', 'link'.
    """
    columns = ['text', 'label', 'handle', 'ID', 'name', 'pageAdminTopCountry',
               'pageCategory', 'date', 'postUrl', 'link']
    rows = []
    for i, file in enumerate(sorted(os.listdir(folderpath))):
        if verbose:
            print(i, file)
        # Context manager closes the handle; JSON text is read as UTF-8 explicitly
        with open(os.path.join(folderpath, file), encoding = 'utf-8') as infile:
            posts = json.load(infile)['posts']
        for post in posts:
            account = post['account']
            rows.append((post.get('message'), label,
                         account.get('handle'), account.get('id'), account.get('name'),
                         account.get('pageAdminTopCountry'), account.get('pageCategory'),
                         post.get('date'), post.get('postUrl'), post.get('link')))
    df = pd.DataFrame(rows, columns = columns)
    # Posts without any message text cannot be classified
    df = df.dropna(subset=['text'])
    if verbose:
        print("\n")
    return df

def load_news(folderpath, label, verbose = True):
    """Load factiva news csv files from a folder into a labelled pandas dataframe.

    Every csv file in ``folderpath`` is read, a fixed set of unused columns is dropped
    and the remaining two-letter column codes are renamed to descriptive names. An
    'ID' column is extracted from the accession number (the part between "Document "
    and the first following underscore) and a 'label' column is added.

    Files are processed in sorted name order because ``os.listdir`` returns entries
    in arbitrary order, which would otherwise make the row order machine-dependent.

    Parameters
    ----------
    folderpath : directory that contains the csv files.
    label : value written to the 'label' column of every row.
    verbose : if true, print the index and name of each file while loading.

    Returns
    -------
    DataFrame with all files concatenated and a fresh integer index.

    Raises
    ------
    KeyError : if a file lacks one of the columns that are dropped.
    ValueError : if the folder contains no files (nothing to concatenate).
    """
    dropped_columns = ['SC', 'CY', 'RE', 'PUB', 'NS', 'CR', 'IPD', 'IPC', 'CT', 'IN',
                       'RF', 'LA', 'CO', 'ET', 'ED', 'PG']
    renamed_columns = {'AN':'accession_number','SE': 'section', 'HD': 'headline',
                       'WC':'wordcount', 'PD': 'date','SN': 'source',
                       'LP': 'lead', 'TD': 'text', 'BY':'author'}
    frames = []
    for i, file in enumerate(sorted(os.listdir(folderpath))):
        if verbose:
            print(i, file)
        df_temp = (
            pd.read_csv(os.path.join(folderpath, file), encoding = 'utf-8',
                        lineterminator = '\n', low_memory = False)
            .drop(columns=dropped_columns)
            .rename(columns=renamed_columns)
        )
        # Raw string: "\d" in a normal string literal is an invalid escape sequence.
        # expand=False returns a Series for the single capture group.
        df_temp["ID"] = df_temp['accession_number'].str.extract(r"Document (.+?)_\d*", expand=False)
        df_temp["label"] = label
        frames.append(df_temp)
    df = pd.concat(frames, axis=0, ignore_index=True)
    if verbose:
        print("\n")
    return df

# Load data

### Blogs

In [5]:
# All blog, think-tank and NGO posts live in one csv file; `org_side` records the
# side of the organisation that published each post.
print('---Load the blog posts (by blogs, thinktanks and NGOs)---')
blogs = pd.read_csv('../data/blogs/baby_content_cleanest.csv', encoding = 'utf-8', lineterminator='\n',
                   usecols = ['org', 'date', 'title', 'url', 'org_type', 'org_side', 'text'])
# Vectorised label: 1 for "contrarian", 0 for every other value of `org_side`
blogs["label"] = blogs["org_side"].eq("contrarian").astype("int64")
print("\n", int((blogs.label == 1).sum()), "contrarian blog posts loaded.")
print("\n", int((blogs.label == 0).sum()), "convinced blog posts loaded.\n\n")

print('---Split the data (test data: unseen organisations)---\n')
blogs['type'] = "blog/thinktank/ngo"
blogs_train, blogs_valid, blogs_test = train_valid_test_split(blogs, "org") # Here the held out data is unseen bloggers.

---Load the blog posts (by blogs, thinktanks and NGOs)---

 219535 contrarian blog posts loaded.

 110773 convinced blog posts loaded.


---Split the data (test data: unseen organisationss)---

Training data set with 221757 posts created.
Validation data set with 73920 posts created.
Testing data set with 34631 posts created.


### Twitter

In [6]:
# Contrarian side (label 1)
print('---Load the contrarian tweets---\n')
twitter_contrarian = load_tweets(
    '../data/twitter/contrarian',
    label=1,
    verbose=False,
)
print(len(twitter_contrarian), "contrarian tweets loaded.")

# Convinced side (label 0)
print('\n\n---Load the convinced tweets---\n')
twitter_convinced = load_tweets(
    '../data/twitter/convinced',
    label=0,
    verbose=False,
)
print(len(twitter_convinced), "convinced tweets loaded.\n\n")

# Stack both sides, tag the source type and hold out unseen twitter accounts for testing
print('---Split the data (test data: unseen twitter handles)---\n')
twitter = pd.concat([twitter_contrarian, twitter_convinced]).assign(type="twitter")
twitter_train, twitter_valid, twitter_test = train_valid_test_split(twitter, "username")

---Load the contrarian tweets---

1966699 contrarian tweets loaded.


---Load the convinced tweets---

1573621 convinced tweets loaded.


---Split the data (test data: unseen twitter handles)---

Training data set with 2431705 posts created.
Validation data set with 810569 posts created.
Testing data set with 298046 posts created.


### Facebook

In [7]:
# Contrarian side (label 1)
print('---Load the contrarian facebook posts---\n')
facebook_contrarian = load_facebook(
    '../data/facebook/contrarian',
    label=1,
    verbose=False,
)
print(len(facebook_contrarian), "contrarian facebook posts loaded.")

# Convinced side (label 0)
print('\n\n---Load the convinced facebook posts---\n')
facebook_convinced = load_facebook(
    '../data/facebook/convinced',
    label=0,
    verbose=False,
)
print(len(facebook_convinced), "convinced facebook posts loaded.\n\n")

# Stack both sides, tag the source type and hold out unseen facebook accounts for testing
print('---Split the data (test data: unseen facebook handles)---\n')
facebook = pd.concat([facebook_contrarian, facebook_convinced]).assign(type="facebook")
facebook_train, facebook_valid, facebook_test = train_valid_test_split(facebook, "handle")

---Load the contrarian facebook posts---

425551 contrarian facebook posts loaded.


---Load the convinced facebook posts---

527951 convinced facebook posts loaded.


---Split the data (test data: unseen facebook handles)---

Training data set with 663504 posts created.
Validation data set with 221168 posts created.
Testing data set with 68830 posts created.


### Newspapers

In [8]:
print('---Load the contrarian news articles---\n')
newspaper_contrarian =  load_news("../data/newspapers/contrarian", label = 1, verbose = False)
print(len(newspaper_contrarian), "contrarian news article sentences loaded.")

print('\n\n---Load the convinced news articles---\n')
newspaper_convinced =  load_news("../data/newspapers/convinced", label = 0, verbose = False)
print(len(newspaper_convinced), "convinced news article sentences loaded.\n\n")

print('---Split the data (test data: unseen newspaper articles)---\n')
newspapers = pd.concat([newspaper_contrarian, newspaper_convinced])
# Collapse rows that share the same article metadata into one row by joining their text.
# NOTE(review): groupby drops rows with a missing value in any key column by default,
# so rows lacking e.g. a 'lead' would be discarded here - confirm this is intended.
newspapers = newspapers.groupby(['headline', 'wordcount', 'date', 'source', 'lead', 'time_frame', 'ID', 'label'])['text'].apply(' '.join).reset_index()
# Strip the matched character sequences from every string column. regex=True is passed
# explicitly: since pandas 2.0 `str.replace` treats the pattern as a literal string by
# default, which would silently turn this clean-up into a no-op.
string_columns = newspapers.select_dtypes(['object']).columns
newspapers[string_columns] = newspapers[string_columns].apply(lambda x: x.str.replace("(.\\'.{2}.\\'.{2})", "", regex=True))
newspapers['type'] = "newspaper"
newspapers_train, newspapers_valid, newspapers_test = train_valid_test_split(newspapers, "ID") # Here the held out data is unseen newspaper articles.

---Load the contrarian news articles---

141954 contrarian news article sentences loaded.


---Load the convinced news articles---

118665 convinced news article sentences loaded.


---Split the data (test data: unseen newspaper articles)---

Training data set with 5723 posts created.
Validation data set with 1908 posts created.
Testing data set with 1346 posts created.


# Combine & export data

In [9]:
# Stack the training parts of all four sources, keeping only the shared columns
train = pd.concat(
    [part[['text', 'label', 'type']]
     for part in (blogs_train, twitter_train, facebook_train, newspapers_train)],
    ignore_index=True,
)
train

Unnamed: 0,text,label,type
0,From left-wing activist in Montreal and Toront...,1,blog/thinktank/ngo
1,"GAO found, based on a questionnaire sent to st...",1,blog/thinktank/ngo
2,Fossil fuels are way more expensive than you t...,0,blog/thinktank/ngo
3,"Last week, I posted a commentary on NASA scien...",1,blog/thinktank/ngo
4,The online German daily Die Welt here has an...,1,blog/thinktank/ngo
...,...,...,...
3322684,"In fact, published in Nature Scientific Report...",0,newspaper
3322685,As soon as one or two climate dominoes are kno...,1,newspaper
3322686,Angus Taylor should not be patting himself on ...,1,newspaper
3322687,"Imagine if that situation was reversed, and a ...",1,newspaper


In [11]:
# Stack the validation parts of all four sources, keeping only the shared columns
valid = pd.concat(
    [part[['text', 'label', 'type']]
     for part in (blogs_valid, twitter_valid, facebook_valid, newspapers_valid)],
    ignore_index=True,
)
valid

Unnamed: 0,text,label,type
0,We told you last week that the Chinese people ...,0,blog/thinktank/ngo
1,Joe reports breathlessly : \n Major analys...,1,blog/thinktank/ngo
2,Activist Daryl Hannah It used to be said that...,1,blog/thinktank/ngo
3,"WASHINGTON (March 10, 2016)—Today, United Stat...",0,blog/thinktank/ngo
4,"Packer ownership of livestock, and of hogs in ...",1,blog/thinktank/ngo
...,...,...,...
1107560,"Paradoxically, the calm hurricane season in th...",0,newspaper
1107561,"Enviros, in fact, can take some credit for the...",0,newspaper
1107562,That 2016 deadline is fast approaching. How mu...,0,newspaper
1107563,It is through such collusion that accidents ha...,0,newspaper


In [10]:
# Stack the held-out testing parts of all four sources, keeping only the shared columns
test = pd.concat(
    [part[['text', 'label', 'type']]
     for part in (blogs_test, twitter_test, facebook_test, newspapers_test)],
    ignore_index=True,
)
test

Unnamed: 0,text,label,type
0,I've been fortunate to have met some people wi...,0,blog/thinktank/ngo
1,You may have heard that a draft of the Nationa...,0,blog/thinktank/ngo
2,Climate change will make the drought and flood...,0,blog/thinktank/ngo
3,If you own stocks or have money in a retiremen...,0,blog/thinktank/ngo
4,Investors and the global environment are at ri...,0,blog/thinktank/ngo
...,...,...,...
402848,Having calculated its carbon footprint for the...,0,newspaper
402849,"Two days later, Ocasio-Cortez unveiled the Gre...",0,newspaper
402850,Queenslanders are evocative of traditional Aus...,1,newspaper
402851,Out of this enormously complex report comes a ...,0,newspaper


In [12]:
# Share of each source type among the contrarian (label 1) training rows
train.loc[train.label == 1, 'type'].value_counts(normalize=True)

twitter               0.755028
facebook              0.160883
blog/thinktank/ngo    0.082854
newspaper             0.001235
Name: type, dtype: float64

In [13]:
# Share of each source type among the convinced (label 0) training rows
train.loc[train.label == 0, 'type'].value_counts(normalize=True)

twitter               0.704595
facebook              0.245315
blog/thinktank/ngo    0.047795
newspaper             0.002295
Name: type, dtype: float64

In [14]:
# Class balance of the training data
train['label'].value_counts(normalize=True)

1    0.540391
0    0.459609
Name: label, dtype: float64

In [15]:
# Write the three data sets to csv (UTF-8, without the index column)
csv_options = dict(index=False, encoding='utf-8')
train.to_csv('../data/babyCARDStrain.csv', **csv_options)
valid.to_csv('../data/babyCARDSvalid.csv', **csv_options)
test.to_csv('../data/babyCARDStest.csv', **csv_options)