In [1]:
## Author: Mirjam Nanko
## Date Created: 2022-02-22
## Email: m.nanko@exeter.ac.uk

# socialCARDS data preparation<br>
#### This script loads and combines the **convinced** (0) and **contrarian** (1) posts sourced from **Facebook and Twitter**, and splits them into **training, validation and testing** data sets. 
#### To get some measure of external validity, the testing data set is a "pure" held-out data set that only contains text from "unseen" sources, i.e. Twitter and Facebook accounts that the classifier was not trained on.

# Packages & functions

In [2]:
import os
import random
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk

In [3]:
def train_valid_test_split(data, account_variable,
                           random_test_sample_size = .15, 
                           valid_sample_size = .25, 
                           random_seed = 1, random_state = 1, 
                           shuffle = True):
    """Split posts into training, validation and held-out testing data sets.

    The testing set consists of *all* posts from a random sample of accounts,
    drawn separately among the accounts with contrarian posts (label 1) and
    the accounts with convinced posts (label 0), so no test account
    contributes any post to training or validation. The remaining posts are
    split post-by-post into training and validation, which means training and
    validation may share accounts.

    Parameters
    ----------
    data : pandas.DataFrame
        Posts; must contain a ``"label"`` column (1 = contrarian,
        0 = convinced) and the column named by ``account_variable``.
    account_variable : str
        Name of the column identifying the posting account.
    random_test_sample_size : float
        Fraction of the unique accounts on each side to hold out for testing.
    valid_sample_size : float
        Passed to ``train_test_split`` as ``test_size`` for the validation split.
    random_seed : int
        Seed for the account sampling.
    random_state : int
        Passed to ``train_test_split`` as ``random_state``.
    shuffle : bool
        Passed to ``train_test_split`` as ``shuffle``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test)``.
    """
    # A private generator draws exactly the samples that
    # `random.seed(random_seed)` + `random.sample(...)` would, but does not
    # reset the process-wide RNG state as a side effect of calling this function.
    rng = random.Random(random_seed)

    def _sample_accounts(label):
        """Randomly pick the held-out accounts among those posting with `label`."""
        accounts = list(data.loc[data["label"] == label, account_variable].unique())
        n_held_out = int(round(random_test_sample_size * len(accounts)))
        return rng.sample(accounts, n_held_out)

    # Create a random sample of accounts for the testing set for each side.
    # The draw order (contrarian first, then convinced) determines which
    # accounts are selected for a given seed, so it must not be swapped.
    test_contrarian = _sample_accounts(1)
    test_convinced = _sample_accounts(0)

    # Separate the testing data set: compute the membership mask once and
    # hand back independent copies of the two partitions.
    held_out = data[account_variable].isin(test_contrarian + test_convinced)
    trainvalid = data[~held_out].copy()
    test = data[held_out].copy()

    # Split the remaining posts into training and validation data sets
    train, valid = train_test_split(trainvalid, test_size=valid_sample_size, 
                                    random_state=random_state, shuffle=shuffle)
    print("Training data set with {} posts created.".format(len(train)))
    print("Validation data set with {} posts created.".format(len(valid)))    
    print("Testing data set with {} posts created.".format(len(test)))    

    return train, valid, test

def load_tweets(folderpath, label, verbose = True):
    """Load twitter JSON-lines files into a pandas dataframe with a numeric label.

    Every file in ``folderpath`` is read line by line; each non-blank line is
    parsed as one JSON object from which the ``tweet``, ``username``,
    ``name``, ``date`` and ``time`` keys are taken.

    Parameters
    ----------
    folderpath : str
        Directory containing the tweet files.
    label : int
        Value written to the ``label`` column of every row.
    verbose : bool
        If true, print the running index and name of each file as it is read.

    Returns
    -------
    pandas.DataFrame
        Columns ``text, label, username, name, date, time``; one row per tweet,
        files in sorted name order.

    Raises
    ------
    KeyError
        If a tweet object lacks one of the expected keys.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and anything seeded that depends on it downstream) reproducible.
    for i, file in enumerate(sorted(os.listdir(folderpath))):
        if verbose:
            print(i, file)
        # Context manager closes the handle; explicit UTF-8 avoids depending
        # on the platform's default encoding.
        with open(os.path.join(folderpath, file), 'r', encoding = 'utf-8') as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank lines between records
                tweet = json.loads(line)  # parse each line once, not per field
                records.append((tweet['tweet'], label, tweet['username'],
                                tweet['name'], tweet['date'], tweet['time']))
    df = pd.DataFrame(records,
                      columns = ['text', 'label', 'username', 'name', 'date', 'time'])
    if verbose:
        print("\n")
    return df

def load_facebook(folderpath, label, verbose = True):
    """Load facebook JSON files into a pandas dataframe with a numeric label.

    Every file in ``folderpath`` is parsed as one JSON document whose
    ``posts`` entry is a list of post objects. Post-level fields
    (``message``, ``date``, ``postUrl``, ``link``) and account-level fields
    (``handle``, ``id``, ``name``, ``pageAdminTopCountry``, ``pageCategory``)
    are read with ``.get`` and are therefore ``None`` when absent. Posts
    without a message are dropped; the index is not reset afterwards.

    Parameters
    ----------
    folderpath : str
        Directory containing the facebook JSON files.
    label : int
        Value written to the ``label`` column of every row.
    verbose : bool
        If true, print the running index and name of each file as it is read.

    Returns
    -------
    pandas.DataFrame
        Columns ``text, label, handle, ID, name, pageAdminTopCountry,
        pageCategory, date, postUrl, link``; files in sorted name order.

    Raises
    ------
    KeyError
        If a file has no ``posts`` entry or a post has no ``account`` entry.
    """
    records = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and anything seeded that depends on it downstream) reproducible.
    for i, file in enumerate(sorted(os.listdir(folderpath))):
        if verbose:
            print(i, file)
        # Context manager closes the handle; explicit UTF-8 avoids depending
        # on the platform's default encoding.
        with open(os.path.join(folderpath, file), encoding = 'utf-8') as handle:
            posts = json.load(handle)['posts']
        for post in posts:
            account = post['account']  # required; look it up once per post
            records.append((post.get('message'), label,
                            account.get('handle'), account.get('id'),
                            account.get('name'),
                            account.get('pageAdminTopCountry'),
                            account.get('pageCategory'),
                            post.get('date'), post.get('postUrl'),
                            post.get('link')))
    df = pd.DataFrame(records,
                      columns = ['text', 'label', 'handle', 'ID', 'name', 'pageAdminTopCountry', 'pageCategory', 'date', 'postUrl', 'link'])
    # Posts without any message text cannot be used for classification
    df = df.dropna(subset=['text'])
    if verbose:
        print("\n")
    return df

# Load data

## Twitter

In [4]:
# Contrarian side (label 1)
print('---Load the contrarian tweets---\n')
twitter_contrarian = load_tweets(
    '../data/twitter/contrarian',
    label = 1,
    verbose = False,
)
print(len(twitter_contrarian), "contrarian tweets loaded.")

# Convinced side (label 0)
print('\n\n---Load the convinced tweets---\n')
twitter_convinced = load_tweets(
    '../data/twitter/convinced',
    label = 0,
    verbose = False,
)
print(len(twitter_convinced), "convinced tweets loaded.\n\n")

# Stack both sides, tag the platform, then hold out whole twitter accounts
# (identified by "username") as the testing data.
print('---Split the data (test data: unseen twitter handles)---\n')
twitter = pd.concat([twitter_contrarian, twitter_convinced]).assign(type = "twitter")
twitter_train, twitter_valid, twitter_test = train_valid_test_split(
    twitter,
    "username",
)

---Load the contrarian tweets---

1966699 contrarian tweets loaded.


---Load the convinced tweets---

1573621 convinced tweets loaded.


---Split the data (test data: unseen twitter handles)---

Training data set with 2431705 posts created.
Validation data set with 810569 posts created.
Testing data set with 298046 posts created.


## Facebook

In [5]:
# Contrarian side (label 1)
print('---Load the contrarian facebook posts---\n')
facebook_contrarian = load_facebook(
    '../data/facebook/contrarian',
    label = 1,
    verbose = False,
)
print(len(facebook_contrarian), "contrarian facebook posts loaded.")

# Convinced side (label 0)
print('\n\n---Load the convinced facebook posts---\n')
facebook_convinced = load_facebook(
    '../data/facebook/convinced',
    label = 0,
    verbose = False,
)
print(len(facebook_convinced), "convinced facebook posts loaded.\n\n")

# Stack both sides, tag the platform, then hold out whole facebook accounts
# (identified by "handle") as the testing data.
print('---Split the data (test data: unseen facebook handles)---\n')
facebook = pd.concat([facebook_contrarian, facebook_convinced]).assign(type = "facebook")
facebook_train, facebook_valid, facebook_test = train_valid_test_split(
    facebook,
    "handle",
)

---Load the contrarian facebook posts---

425551 contrarian facebook posts loaded.


---Load the convinced facebook posts---

527951 convinced facebook posts loaded.


---Split the data (test data: unseen facebook handles)---

Training data set with 663504 posts created.
Validation data set with 221168 posts created.
Testing data set with 68830 posts created.


# Combine & export data

In [6]:
# Stack the twitter and facebook training posts, keeping only the shared
# modelling columns, and renumber the rows from 0.
train = pd.concat(
    [split[['text', 'label', 'type']] for split in (twitter_train, facebook_train)],
    ignore_index = True,
)
train

Unnamed: 0,text,label,type
0,Notley digs for scraps of good under a mountai...,1,twitter
1,Busted Bugs: German Wind Turbines Demolishing ...,1,twitter
2,Lol. Stayed till the bitter end 😡😕 https://t....,1,twitter
3,July 14 1954 was one of the hottest days on re...,1,twitter
4,Understanding the value of nature for business...,0,twitter
...,...,...,...
3095204,Kardashian used social media to demean and slu...,0,facebook
3095205,Freeze them first!,0,facebook
3095206,"""This is a crisis of radical proportions, and ...",0,facebook
3095207,A summary of the best coverage of climate chan...,0,facebook


In [8]:
# Stack the twitter and facebook validation posts, keeping only the shared
# modelling columns, and renumber the rows from 0.
valid = pd.concat(
    [split[['text', 'label', 'type']] for split in (twitter_valid, facebook_valid)],
    ignore_index = True,
)
valid

Unnamed: 0,text,label,type
0,The only way to make schools safe again is to ...,1,twitter
1,Big congrats to former CEI intern Andrew Gross...,1,twitter
2,Private enterprise versus free enterprise htt...,1,twitter
3,This mother is fighting woke activism in publi...,1,twitter
4,@nonkelpier @gryffroy @de_NVA LOL. Sure,1,twitter
...,...,...,...
1031732,The 2020 RNC kicked off with a fire hose of fa...,0,facebook
1031733,Lance Izumi's op-ed featured in Fort Worth Sta...,1,facebook
1031734,Lock China out of the capital markets!,1,facebook
1031735,'GRATEFUL TO BE SAFE': American ally shows tha...,1,facebook


In [7]:
# Stack the twitter and facebook held-out testing posts, keeping only the
# shared modelling columns, and renumber the rows from 0.
test = pd.concat(
    [split[['text', 'label', 'type']] for split in (twitter_test, facebook_test)],
    ignore_index = True,
)
test

Unnamed: 0,text,label,type
0,"The World Wants Greenland’s Minerals, but Gree...",1,twitter
1,Mayor Bronson defends use of Holocaust imagery...,1,twitter
2,"In Well-Vaccinated Maine, Covid-19 Still Fills...",1,twitter
3,"Australia set to announce ‘no jab, no pay’ rul...",1,twitter
4,Melbourne cases hit record despite two months ...,1,twitter
...,...,...,...
366871,NASA doesn’t study just the stars and planets;...,0,facebook
366872,"""He was playing, having a grand old time.""",0,facebook
366873,"Even by the standards of 90 million years ago,...",0,facebook
366874,"33 years ago today, a catastrophic nuclear acc...",0,facebook


In [9]:
# Platform shares among the contrarian (label 1) training posts
train.loc[train['label'] == 1, 'type'].value_counts(normalize=True)

twitter     0.824346
facebook    0.175654
Name: type, dtype: float64

In [10]:
# Platform shares among the convinced (label 0) training posts
train.loc[train['label'] == 0, 'type'].value_counts(normalize=True)

twitter     0.741749
facebook    0.258251
Name: type, dtype: float64

In [11]:
# Class balance of the training data (1 = contrarian, 0 = convinced)
train['label'].value_counts(normalize=True)

1    0.531326
0    0.468674
Name: label, dtype: float64

In [12]:
# Write each split to its own UTF-8 encoded CSV file, without the row index
for split_frame, csv_path in (
    (train, '../data/socialCARDStrain.csv'),
    (valid, '../data/socialCARDSvalid.csv'),
    (test, '../data/socialCARDStest.csv'),
):
    split_frame.to_csv(csv_path, index = False, encoding = 'utf-8')