In [1]:
## Author: Mirjam Nanko
## Date Created: 2022-02-24
## Email: m.nanko@exeter.ac.uk

# climatenewsbabyCARDS data preparation<br>
#### This script loads and combines the **convinced** (0) and **contrarian** (1) data sourced from various **blogs and newspapers**, subsets the data to documents mentioning the term *climate* or *global warming* and splits it into **training, validation and testing** data sets. 
#### To get some measure of external validity, the testing data set is a "pure" held out data set that only contains text from "unseen" sources, i.e. bloggers and newspaper articles that the classifier was not trained on.

# Packages & functions

In [2]:
import os
import random
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk

In [3]:
def train_valid_test_split(data, account_variable,
                           random_test_sample_size = .15, 
                           valid_sample_size = .25, 
                           random_seed = 1, random_state = 1, 
                           shuffle = True):
    """Split the data into training, validation and testing data set, with the testing 
    data being completely held-out data by unseen accounts"""
    
    # Set a random seed
    random.seed(random_seed)
    
    # Extract unique usernames for each side
    contrarian_accounts = data[data["label"]==1][account_variable].unique()
    convinced_accounts = data[data["label"]==0][account_variable].unique()
    
    # Create a randomn sample of users for the testing set for each side
    samplesize_contrarian = int(round(random_test_sample_size*len(contrarian_accounts),0))
    test_contrarian = random.sample(list(contrarian_accounts), samplesize_contrarian)
    samplesize_convinced = int(round(random_test_sample_size*len(convinced_accounts),0))
    test_convinced = random.sample(list(convinced_accounts), samplesize_convinced)
    
    # Separate the testing dataset
    trainvalid = pd.concat([data[~data[account_variable].isin(test_contrarian + test_convinced)]])
    test = pd.concat([data[data[account_variable].isin(test_contrarian + test_convinced)]])
    
    # Split training data into training and validation dataset
    train, valid = train_test_split(trainvalid, test_size=valid_sample_size, 
                                    random_state=random_state, shuffle=shuffle)
    print("Training data set with {} posts created.".format(len(train)))
    print("Validation data set with {} posts created.".format(len(valid)))    
    print("Testing data set with {} posts created.".format(len(test)))    

    return train, valid, test

def load_news(folderpath, label, verbose = True):
    """This function loads factive news csv files into a pandas dataframe and assigns them a
    numeric label."""
    temp = []
    for i, file in enumerate(os.listdir(folderpath)):
        if verbose == True:
            print(i, file)
        df_temp = pd.read_csv('/'.join([folderpath, file]), encoding = 'utf-8', lineterminator = '\n', low_memory = False)
        df_temp.drop(columns=['SC', 'CY', 'RE', 'PUB', 'NS', 'CR', 'IPD', 'IPC', 'CT', 'IN', 'RF', 'LA', 'CO', 'ET', 'ED', 'PG'], inplace = True)
        df_temp.rename(columns = {'AN':'accession_number','SE': 'section', 'HD': 'headline','WC':'wordcount', 'PD': 'date','SN': 'source',
                            'LP': 'lead', 'TD': 'text', 'BY':'author'}, inplace=True)
        df_temp["ID"] = df_temp['accession_number'].str.extract("Document (.+?)_\d*")
        df_temp["label"] = label
        temp.append(df_temp)
    df = pd.concat(temp, axis=0, ignore_index=True)
    if verbose == True:
        print("\n")
    return df

In [4]:
# Match the factiva search terms
factiva = "|".join(["climate change", "global warming", "greenhouse gas", "CO2", "carbon dioxide", "extreme weather", "fossil fuel.{1}",
                    "renewable energy", "carbon footprint", "carbon price", "carbon pricing", "carbon tax", "climate change and trade",
                    "climate change and cap", "carbon storage", "IPCC", "Paris Agreement", "sustainable energy", "Green New Deal",
                    "energy efficiency", "green energy", "nuclear power", "solar power", "wind power", "eco-friendly", "GHG", "green washing",
                     "(?:^(?=.*(?:COP12|COP13|COP14|COP15|COP16|COP17|COP18|COP19|COP20|COP21|COP22|COP23|COP24|COP25|COP26))(?=.*climate))",
                    "(?:^(?=.*(?:solution|policy|electr.{1}|energy|ozone|temperature|aerosols|UNFCCC|INDC|methane|G7 summit|sustainab.{1}))(?=.*(?:climate change|global warming|climate)))"]).lower()

# Load data

### Blogs, thinktanks and NGOs

In [5]:
# Load the data
print('---Load the blog posts (by blogs, thinktanks and NGOs)---')
blogs = pd.read_csv('../data/blogs/baby_content_cleanest.csv', encoding = 'utf-8', lineterminator='\n', 
                   usecols = ['org', 'date', 'title', 'url', 'org_type', 'org_side', 'text'])
blogs["label"] = [1 if i=="contrarian" else 0 for i in blogs.org_side]
print("\n", len(blogs[blogs.label == 1]), "contrarian blog posts loaded.")
print("\n", len(blogs[blogs.label == 0]), "convinced blog posts loaded.\n\n")

# Subset the data to articles matching the factiva search terms
print('---Subset the posts matching the factiva search terms---')
blogs = blogs.rename(columns={"org_type": "type"})
blogs['type'] = "blog/thinktank/ngo"
blogs = blogs[blogs['text'].str.lower().str.contains(factiva, regex=True)]
print("\n", len(blogs[blogs.label == 1]), "contrarian blog posts remain.")
print("\n", len(blogs[blogs.label == 0]), "convinced blog posts remain.\n\n")

# Split the data 
print('---Split the data (test data: unseen organisationss)---\n')
blogs_train, blogs_valid, blogs_test = train_valid_test_split(blogs, "org") # Here the held out data is unseen bloggers.

---Load the blog posts (by blogs, thinktanks and NGOs)---

 219535 contrarian blog posts loaded.

 110773 convinced blog posts loaded.


---Subset the posts matching the factiva search terms---

 111391 contrarian blog posts remain.

 60746 convinced blog posts remain.


---Split the data (test data: unseen organisationss)---

Training data set with 115993 posts created.
Validation data set with 38665 posts created.
Testing data set with 17479 posts created.


### Newspapers

In [20]:
# Load the data
print('---Load the contrarian news articles---\n')
# newspaper_contrarian = load_news("../data/newspapers/contrarian", label = 1, verbose = False)
# newspaper_contrarian = newspaper_contrarian.groupby(['headline', 'wordcount', 'date', 'source', 'lead', 'time_frame', 'ID', 'label'])['text'].apply(' '.join).reset_index()

print(len(newspaper_contrarian), "contrarian news articles loaded.")
print('\n\n---Load the convinced news articles---\n')
# newspaper_convinced = load_news("../data/newspapers/convinced", label = 0, verbose = False)
# newspaper_convinced = newspaper_convinced.groupby(['headline', 'wordcount', 'date', 'source', 'lead', 'time_frame', 'ID', 'label'])['text'].apply(' '.join).reset_index()
print(len(newspaper_convinced), "convinced news articles loaded.\n\n")

# Subset the data to articles matching the factiva search terms
print('---Subset the posts matching the factiva search terms---')
newspapers = pd.concat([newspaper_contrarian, newspaper_convinced])
# newspapers = newspapers.groupby(['headline', 'wordcount', 'date', 'source', 'lead', 'time_frame', 'ID', 'label'])['text'].apply(' '.join).reset_index()
newspapers[newspapers.select_dtypes(['object']).columns] = newspapers[newspapers.select_dtypes(['object']).columns].apply(lambda x: x.str.replace("(.\\'.{2}.\\'.{2})", "", regex = True))
newspapers['type'] = "newspaper"
newspapers = newspapers[newspapers['text'].str.lower().str.contains(factiva, regex=True)]
print(len(newspaper_contrarian), "contrarian news articles remain.")
print(len(newspaper_convinced), "convinced news articles remain.\n\n")

# Split the data
print('---Split the data (test data: unseen newspaper articles)---\n')
newspapers_train, newspapers_valid, newspapers_test = train_valid_test_split(newspapers, "ID") # Here the held out data is unseen newspaper articles.

---Load the contrarian news articles---

3489 contrarian news articles loaded.


---Load the convinced news articles---

5488 convinced news articles loaded.


---Subset the posts matching the factiva search terms---
3489 contrarian news articles remain.
5488 convinced news articles remain.


---Split the data (test data: unseen newspaper articles)---

Training data set with 5530 posts created.
Validation data set with 1844 posts created.
Testing data set with 1301 posts created.


# Combine & export data

In [21]:
train = pd.concat([blogs_train[['text', 'label', 'type']],
                  newspapers_train[['text', 'label', 'type']]]).reset_index(drop = True)
train

Unnamed: 0,text,label,type
0,"Campaigners say Obamas re-election, superstorm...",1,blog/thinktank/ngo
1,Whether or not the United States should ratify...,1,blog/thinktank/ngo
2,Global [fraud] spurs global protest - The Chin...,1,blog/thinktank/ngo
3,The last post is a complete report of the ful...,1,blog/thinktank/ngo
4,"As regular readers will be aware, I think the ...",0,blog/thinktank/ngo
...,...,...,...
121518,The group is calling for the nations medium an...,1,newspaper
121519,"On Wednesday, the European Union unveiled an a...",0,newspaper
121520,"On the one hand, global investment in renewabl...",0,newspaper
121521,The Greens are fixated at the window of opport...,1,newspaper


In [22]:
valid = pd.concat([blogs_valid[['text', 'label', 'type']],
                  newspapers_valid[['text', 'label', 'type']]]).reset_index(drop = True)
valid

Unnamed: 0,text,label,type
0,The American Petroleum Institute has launched ...,0,blog/thinktank/ngo
1,Here’s a quote related to the McShane and Wyn...,1,blog/thinktank/ngo
2,The House of Commons Energy and Climate Change...,1,blog/thinktank/ngo
3,News from our partners is coming in fast and f...,0,blog/thinktank/ngo
4,In a raft of articles on this blog and else...,1,blog/thinktank/ngo
...,...,...,...
40504,https://images.theconversation.com/files/27431...,0,newspaper
40505,Mr Bandt has today stood by his accusations th...,1,newspaper
40506,"In what had been a nonpolitical process, the U...",0,newspaper
40507,Those are just the most obvious of personal us...,0,newspaper


In [23]:
test = pd.concat([blogs_test[['text', 'label', 'type']],
                  newspapers_test[['text', 'label', 'type']]]).reset_index(drop = True)
test

Unnamed: 0,text,label,type
0,I've been fortunate to have met some people wi...,0,blog/thinktank/ngo
1,You may have heard that a draft of the Nationa...,0,blog/thinktank/ngo
2,Climate change will make the drought and flood...,0,blog/thinktank/ngo
3,If you own stocks or have money in a retiremen...,0,blog/thinktank/ngo
4,Investors and the global environment are at ri...,0,blog/thinktank/ngo
...,...,...,...
18775,But if anyone thought that the weather was inc...,0,newspaper
18776,"West, the owner of a houseboat timeshare compa...",0,newspaper
18777,"Four zoomers Patrick Baggaley, Freya Scott-Tu...",0,newspaper
18778,"On the 10th anniversary of the riot, community...",0,newspaper


In [10]:
train[['type', 'label']][train.label == 1].type.value_counts(normalize=True)

blog/thinktank/ngo    0.97259
newspaper             0.02741
Name: type, dtype: float64

In [11]:
train[['type', 'label']][train.label == 0].type.value_counts(normalize=True)

blog/thinktank/ngo    0.922733
newspaper             0.077267
Name: type, dtype: float64

In [12]:
train.label.value_counts(normalize=True)

1    0.637048
0    0.362952
Name: label, dtype: float64

In [13]:
# Save the data sets
train.to_csv('../data/climatenewsbabyCARDStrain.csv', index = False, encoding = 'utf-8')
valid.to_csv('../data/climatenewsbabyCARDSvalid.csv', index = False, encoding = 'utf-8')
test.to_csv('../data/climatenewsbabyCARDStest.csv', index = False, encoding = 'utf-8')