In [1]:
## Author: Mirjam Nanko
## Date Created: 2022-02-22
## Email: m.nanko@exeter.ac.uk

# sentenceCARDS data preparation<br>
#### This script loads and combines the **convinced** (0) and **contrarian** (1) data sourced from various **blogs, twitter, facebook and newspapers**, disaggregates it into sentences and splits it into **training, validation and testing** data sets. 
#### To get some measure of external validity, the testing data set is a "pure" held-out data set that only contains text from "unseen" sources, i.e. bloggers, twitter accounts, facebook accounts and newspaper articles that the classifier was not trained on.

In [2]:
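# TODO (appears to be resolved further down; confirm and remove):
# split blogs and newspapers into sentences -- both are passed through nltk's sent_tokenize in the "Load data" section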

# Packages & functions

In [3]:
import os
import random
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk

In [4]:
def train_valid_test_split(data, account_variable,
                           random_test_sample_size = .15, 
                           valid_sample_size = .25, 
                           random_seed = 1, random_state = 1, 
                           shuffle = True):
    """Split the data into training, validation and testing sets, where the testing
    set only contains rows from held-out ("unseen") accounts.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "label" column (1 = contrarian, 0 = convinced) and the
        column named by `account_variable`.
    account_variable : str
        Name of the column identifying the source (account, organisation, article ...).
    random_test_sample_size : float
        Share of the unique accounts of each label that is moved to the testing set.
    valid_sample_size : float
        Passed to `train_test_split` as `test_size` for the train/validation split.
    random_seed : int
        Seed for Python's global `random` module (controls which accounts are held out).
    random_state : int
        Passed to `train_test_split` (controls the train/validation split).
    shuffle : bool
        Passed to `train_test_split`.

    Returns
    -------
    (train, valid, test) : tuple of pandas.DataFrame

    Notes
    -----
    The sizes of the three sets are printed. Every row of a sampled account goes to
    the testing set, whatever its label, so an account that occurs under both labels
    is held out completely as soon as it is sampled for either side.
    """

    # Seed the global RNG so the same accounts are held out on every run
    random.seed(random_seed)

    # Extract unique accounts for each side
    contrarian_accounts = data[data["label"]==1][account_variable].unique()
    convinced_accounts = data[data["label"]==0][account_variable].unique()

    # Draw a random sample of accounts for the testing set for each side.
    # The contrarian sample is drawn first; the order matters for reproducibility.
    samplesize_contrarian = int(round(random_test_sample_size*len(contrarian_accounts)))
    test_contrarian = random.sample(list(contrarian_accounts), samplesize_contrarian)
    samplesize_convinced = int(round(random_test_sample_size*len(convinced_accounts)))
    test_convinced = random.sample(list(convinced_accounts), samplesize_convinced)

    # Separate the testing dataset. The membership mask is computed once and boolean
    # indexing already yields new frames, so no extra concat/copy is needed.
    held_out = data[account_variable].isin(test_contrarian + test_convinced)
    trainvalid = data[~held_out]
    test = data[held_out]

    # Split the remaining rows into training and validation dataset
    train, valid = train_test_split(trainvalid, test_size=valid_sample_size, 
                                    random_state=random_state, shuffle=shuffle)
    print("Training data set with {} posts created.".format(len(train)))
    print("Validation data set with {} posts created.".format(len(valid)))    
    print("Testing data set with {} posts created.".format(len(test)))    

    return train, valid, test

def load_tweets(folderpath, label, verbose = True):
    """Load twitter JSON-lines files into a pandas dataframe and assign them a numeric label.

    Every file in `folderpath` is read line by line; each non-blank line must be a
    JSON object with the keys 'tweet', 'username', 'name', 'date' and 'time'.

    Parameters
    ----------
    folderpath : str
        Directory containing the tweet files (every entry in it is opened).
    label : int
        Value written to the 'label' column of every row.
    verbose : bool
        If true, print the index and name of each file while loading.

    Returns
    -------
    pandas.DataFrame
        Columns: 'text', 'label', 'username', 'name', 'date', 'time'.
    """
    records = []
    # Sorted so that the row order (and thus any seeded split made later on)
    # does not depend on the file system's listing order.
    for i, file in enumerate(sorted(os.listdir(folderpath))):
        if verbose:
            print(i, file)
        # Context manager closes the handle; JSON text is read as UTF-8.
        with open(os.path.join(folderpath, file), 'r', encoding='utf-8') as source:
            for line in source:
                if not line.strip():
                    continue  # tolerate blank lines (e.g. a trailing newline)
                tweet = json.loads(line)  # parse each line once
                records.append((tweet['tweet'], label, tweet['username'],
                                tweet['name'], tweet['date'], tweet['time']))
    df = pd.DataFrame(records,
                      columns = ['text', 'label', 'username', 'name', 'date', 'time'])
    if verbose:
        print("\n")
    return df

def load_facebook(folderpath, label, verbose = True):
    """Load facebook JSON files into a pandas dataframe and assign them a numeric label.

    Every file in `folderpath` must be a JSON document with a top-level 'posts' list.
    Each post must have an 'account' object; all other fields are optional and
    become missing values when absent. Posts without a 'message' are dropped.

    Parameters
    ----------
    folderpath : str
        Directory containing the JSON files (every entry in it is opened).
    label : int
        Value written to the 'label' column of every row.
    verbose : bool
        If true, print the index and name of each file while loading.

    Returns
    -------
    pandas.DataFrame
        Columns: 'text', 'label', 'handle', 'ID', 'name', 'pageAdminTopCountry',
        'pageCategory', 'date', 'postUrl', 'link'. The index is not reset after
        dropping posts without text.
    """
    records = []
    # Sorted so that the row order does not depend on the file system's listing order.
    for i, file in enumerate(sorted(os.listdir(folderpath))):
        if verbose:
            print(i, file)
        # Context manager closes the handle; JSON text is read as UTF-8.
        with open(os.path.join(folderpath, file), encoding='utf-8') as source:
            posts = json.load(source)['posts']
        for post in posts:
            account = post['account']
            records.append((post.get('message'), label,
                            account.get('handle'), account.get('id'), account.get('name'),
                            account.get('pageAdminTopCountry'), account.get('pageCategory'),
                            post.get('date'), post.get('postUrl'), post.get('link')))
    df = pd.DataFrame(records,
                      columns = ['text', 'label', 'handle', 'ID', 'name', 'pageAdminTopCountry',
                                 'pageCategory', 'date', 'postUrl', 'link'])
    # Posts without a message carry no text to classify
    df = df.dropna(subset=['text'])
    if verbose:
        print("\n")
    return df

def load_news(folderpath, label, verbose = True):
    """Load news CSV files into one pandas dataframe and assign them a numeric label.

    Every file in `folderpath` is read as UTF-8 CSV. The two-letter columns
    'SC', 'CY', 'RE', 'PUB', 'NS', 'CR', 'IPD', 'IPC', 'CT', 'IN', 'RF', 'LA', 'CO',
    'ET', 'ED' and 'PG' are dropped (a KeyError is raised if one is missing) and the
    remaining coded columns are renamed to descriptive names. An 'ID' column is
    derived from the accession number: the text between "Document " and the first
    following underscore.

    Parameters
    ----------
    folderpath : str
        Directory containing the CSV files (every entry in it is read).
    label : int
        Value written to the 'label' column of every row.
    verbose : bool
        If true, print the index and name of each file while loading.

    Returns
    -------
    pandas.DataFrame
        All files concatenated with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        From `pd.concat` if the folder contains no files.
    """
    dropped_columns = ['SC', 'CY', 'RE', 'PUB', 'NS', 'CR', 'IPD', 'IPC', 'CT', 'IN',
                       'RF', 'LA', 'CO', 'ET', 'ED', 'PG']
    column_names = {'AN':'accession_number','SE': 'section', 'HD': 'headline','WC':'wordcount',
                    'PD': 'date','SN': 'source', 'LP': 'lead', 'TD': 'text', 'BY':'author'}
    frames = []
    # Sorted so that the row order does not depend on the file system's listing order.
    for i, file in enumerate(sorted(os.listdir(folderpath))):
        if verbose:
            print(i, file)
        df_temp = pd.read_csv(os.path.join(folderpath, file), encoding = 'utf-8',
                              lineterminator = '\n', low_memory = False)
        df_temp = df_temp.drop(columns=dropped_columns).rename(columns=column_names)
        # Raw string: "\d" in a normal string literal is an invalid escape sequence.
        df_temp["ID"] = df_temp['accession_number'].str.extract(r"Document (.+?)_\d*", expand=False)
        df_temp["label"] = label
        frames.append(df_temp)
    df = pd.concat(frames, axis=0, ignore_index=True)
    if verbose:
        print("\n")
    return df

# Load data

## Blogs

In [5]:
# Read the blog corpus and derive the numeric label from the organisation's side
# (1 = contrarian, 0 = everything else).
print('---Load the blog posts (by blogs, thinktanks and NGOs)---')
blogs = pd.read_csv(
    '../data/blogs/baby_content_cleanest.csv',
    usecols = ['org', 'date', 'title', 'url', 'org_type', 'org_side', 'text'],
    encoding = 'utf-8',
    lineterminator = '\n',
)
blogs["label"] = [int(side == "contrarian") for side in blogs["org_side"]]
print("\n", int((blogs["label"] == 1).sum()), "contrarian blog posts loaded.")
print("\n", int((blogs["label"] == 0).sum()), "convinced blog posts loaded.\n\n")

# Tag the source type, break every post into sentences (one row per sentence)
# and hold out whole organisations for the testing set.
print('---Split the data (test data: unseen organisationss)---\n')
blogs = (
    blogs
    .assign(type = "blog/thinktank/ngo",
            text = blogs['text'].map(nltk.tokenize.sent_tokenize))
    .explode('text')
)
blogs_train, blogs_valid, blogs_test = train_valid_test_split(blogs, account_variable = "org")

---Load the blog posts (by blogs, thinktanks and NGOs)---

 219535 contrarian blog posts loaded.

 110773 convinced blog posts loaded.


---Split the data (test data: unseen organisationss)---

Training data set with 5462504 posts created.
Validation data set with 1820835 posts created.
Testing data set with 995101 posts created.


## Twitter

In [8]:
# Load both sides of the twitter data (1 = contrarian, 0 = convinced)
print('---Load the contrarian tweets---\n')
twitter_contrarian = load_tweets('../data/twitter/contrarian', 1, verbose = False)
print(twitter_contrarian.shape[0], "contrarian tweets loaded.")

print('\n\n---Load the convinced tweets---\n')
twitter_convinced = load_tweets('../data/twitter/convinced', 0, verbose = False)
print(twitter_convinced.shape[0], "convinced tweets loaded.\n\n")

# Stack both sides, tag the source type, break every tweet into sentences
# (one row per sentence) and hold out whole twitter accounts for the testing set.
print('---Split the data (test data: unseen twitter handles)---\n')
twitter = (
    pd.concat([twitter_contrarian, twitter_convinced])
    .assign(type = "twitter",
            text = lambda frame: frame['text'].map(nltk.tokenize.sent_tokenize))
    .explode('text')
)
twitter_train, twitter_valid, twitter_test = train_valid_test_split(twitter, account_variable = "username")

---Load the contrarian tweets---

1966699 contrarian tweets loaded.


---Load the convinced tweets---

1573621 convinced tweets loaded.


---Split the data (test data: unseen twitter handles)---

Training data set with 3972836 posts created.
Validation data set with 1324279 posts created.
Testing data set with 577191 posts created.


## Facebook

In [9]:
# Load both sides of the facebook data (1 = contrarian, 0 = convinced)
print('---Load the contrarian facebook posts---\n')
facebook_contrarian = load_facebook('../data/facebook/contrarian', 1, verbose = False)
print(facebook_contrarian.shape[0], "contrarian facebook posts loaded.")

print('\n\n---Load the convinced facebook posts---\n')
facebook_convinced = load_facebook('../data/facebook/convinced', 0, verbose = False)
print(facebook_convinced.shape[0], "convinced facebook posts loaded.\n\n")

# Stack both sides, tag the source type, break every post into sentences
# (one row per sentence) and hold out whole facebook accounts for the testing set.
print('---Split the data (test data: unseen facebook handles)---\n')
facebook = (
    pd.concat([facebook_contrarian, facebook_convinced])
    .assign(type = "facebook",
            text = lambda frame: frame['text'].map(nltk.tokenize.sent_tokenize))
    .explode('text')
)
facebook_train, facebook_valid, facebook_test = train_valid_test_split(facebook, account_variable = "handle")

---Load the contrarian facebook posts---

425551 contrarian facebook posts loaded.


---Load the convinced facebook posts---

527951 convinced facebook posts loaded.


---Split the data (test data: unseen facebook handles)---

Training data set with 1073363 posts created.
Validation data set with 357788 posts created.
Testing data set with 190020 posts created.


## Newspapers

In [10]:
# Load both sides of the newspaper data (1 = contrarian, 0 = convinced)
print('---Load the contrarian news articles---\n')
newspaper_contrarian =  load_news("../data/newspapers/contrarian", label = 1, verbose = False)
print(len(newspaper_contrarian), "contrarian news article sentences loaded.")

print('\n\n---Load the convinced news articles---\n')
newspaper_convinced =  load_news("../data/newspapers/convinced", label = 0, verbose = False)
print(len(newspaper_convinced), "convinced news article sentences loaded.\n\n")

print('---Split the data (test data: unseen newspaper articles)---\n')
newspapers = pd.concat([newspaper_contrarian, newspaper_convinced])
# Rows that agree on all of these keys are merged into one row whose text is the
# space-joined text of the group.
# NOTE(review): groupby drops rows with a missing value in any key column by
# default -- confirm that losing such rows is intended.
newspapers = newspapers.groupby(['headline', 'wordcount', 'date', 'source', 'lead', 'time_frame', 'ID', 'label'])['text'].apply(' '.join).reset_index()
# Remove every match of the pattern from all string columns (presumably residue of
# mis-encoded characters -- TODO confirm). regex=True is passed explicitly because
# pandas >= 2.0 treats the pattern as a literal string by default, which would turn
# this cleanup into a silent no-op.
text_columns = newspapers.select_dtypes(['object']).columns
newspapers[text_columns] = newspapers[text_columns].apply(lambda x: x.str.replace("(.\\'.{2}.\\'.{2})", "", regex=True))
newspapers['type'] = "newspaper"
# One row per sentence
newspapers['text'] = newspapers['text'].apply(nltk.tokenize.sent_tokenize)
newspapers = newspapers.explode('text')
newspapers_train, newspapers_valid, newspapers_test = train_valid_test_split(newspapers, "ID") # Here the held out data is unseen newspaper articles.

---Load the contrarian news articles---

141954 contrarian news article sentences loaded.


---Load the convinced news articles---

118665 convinced news article sentences loaded.


---Split the data (test data: unseen newspaper articles)---

Training data set with 289054 posts created.
Validation data set with 96352 posts created.
Testing data set with 70831 posts created.


# Combine & export data

In [11]:
# Stack the training parts of all four sources, keeping only the shared columns,
# and renumber the rows from 0.
train = pd.concat(
    [part[['text', 'label', 'type']]
     for part in (blogs_train, twitter_train, facebook_train, newspapers_train)],
    ignore_index = True,
)
train

Unnamed: 0,text,label,type
0,And Japan sharply walked back its pledge to re...,0,blog/thinktank/ngo
1,And if it is told to take this set of numbers ...,1,blog/thinktank/ngo
2,The President's speech to a joint session of C...,1,blog/thinktank/ngo
3,"However, rates for those policies will be regu...",1,blog/thinktank/ngo
4,Has there ever been a golden age of\nliberty?,1,blog/thinktank/ngo
...,...,...,...
10797752,"And then the tears began, drawing an unimpress...",1,newspaper
10797753,Renewable energy projects tend to be considera...,0,newspaper
10797754,He has every right to be a skeptic - all scien...,0,newspaper
10797755,The outbreak of novel coronavirus is an evolvi...,1,newspaper


In [13]:
# Stack the validation parts of all four sources, keeping only the shared columns,
# and renumber the rows from 0.
valid = pd.concat(
    [part[['text', 'label', 'type']]
     for part in (blogs_valid, twitter_valid, facebook_valid, newspapers_valid)],
    ignore_index = True,
)
valid

Unnamed: 0,text,label,type
0,Image by Jared and Corin.,1,blog/thinktank/ngo
1,”Andrea Dillon (thell1885@gmail.com) writes fr...,1,blog/thinktank/ngo
2,The ones used for Tropospheric temperature mea...,0,blog/thinktank/ngo
3,"Around 4:20, journalist Abby Martin adds her c...",1,blog/thinktank/ngo
4,"This year, assuming all the countries show and...",0,blog/thinktank/ngo
...,...,...,...
3599249,The science of climate change tells us we need...,0,newspaper
3599250,Rises in sea level are expected to be especial...,0,newspaper
3599251,They turned out along the coast and deep inlan...,1,newspaper
3599252,Approximately have paid a record US$3 billion ...,0,newspaper


In [12]:
# Stack the held-out testing parts of all four sources, keeping only the shared
# columns, and renumber the rows from 0.
test = pd.concat(
    [part[['text', 'label', 'type']]
     for part in (blogs_test, twitter_test, facebook_test, newspapers_test)],
    ignore_index = True,
)
test

Unnamed: 0,text,label,type
0,I've been fortunate to have met some people wi...,0,blog/thinktank/ngo
1,"One such person was Dr. Paul Epstein, a physic...",0,blog/thinktank/ngo
2,"Dr. Epstein, associate director of the Center ...",0,blog/thinktank/ngo
3,People don't just want to know how climate cha...,0,blog/thinktank/ngo
4,They want to know what's at stake for them.,0,blog/thinktank/ngo
...,...,...,...
1833138,Princeton philosopher Harry Frankfurt called b...,1,newspaper
1833139,Everyone knows this (but) we have no clear un...,1,newspaper
1833140,"Its become much worse since then, given a clim...",1,newspaper
1833141,In the 19th century Thorstein Veblen coined th...,1,newspaper


In [14]:
# Share of each source type among the contrarian (label 1) training rows
train.loc[train['label'] == 1, 'type'].value_counts(normalize=True)

blog/thinktank/ngo    0.550564
twitter               0.355244
facebook              0.074529
newspaper             0.019663
Name: type, dtype: float64

In [15]:
# Share of each source type among the convinced (label 0) training rows
train.loc[train['label'] == 0, 'type'].value_counts(normalize=True)

blog/thinktank/ngo    0.438624
twitter               0.387037
facebook              0.136867
newspaper             0.037471
Name: type, dtype: float64

In [16]:
# Class balance of the training data
train['label'].value_counts(normalize=True)

1    0.600932
0    0.399068
Name: label, dtype: float64

In [17]:
# Write each split to its own UTF-8 CSV file, leaving the row index out of the file
for _path, _frame in (('../data/sentenceCARDStrain.csv', train),
                      ('../data/sentenceCARDSvalid.csv', valid),
                      ('../data/sentenceCARDStest.csv', test)):
    _frame.to_csv(_path, index = False, encoding = 'utf-8')