In [1]:
## Author: Mirjam Nanko
## Date Created: 2022-02-24
## Email: m.nanko@exeter.ac.uk

# climatearticlebabyCARDS data preparation<br>
#### This script loads and combines the **convinced** (0) and **contrarian** (1) data sourced from various **blogs and newspapers**, subsets the data to documents mentioning the term *climate* or *global warming* and splits it into **training, validation and testing** data sets. 
#### To get some measure of external validity, the testing data set is a "pure" held out data set that only contains text from "unseen" sources, i.e. bloggers and newspaper articles that the classifier was not trained on.

# Packages & functions

In [11]:
import os
import random
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk

ModuleNotFoundError: No module named 'sklearn'

In [None]:
def train_valid_test_split(data, account_variable,
                           random_test_sample_size = .15, 
                           valid_sample_size = .25, 
                           random_seed = 1, random_state = 1, 
                           shuffle = True):
    """Split the data into training, validation and testing data set, with the testing 
    data being completely held-out data by unseen accounts"""
    
    # Set a random seed
    random.seed(random_seed)
    
    # Extract unique usernames for each side
    contrarian_accounts = data[data["label"]==1][account_variable].unique()
    convinced_accounts = data[data["label"]==0][account_variable].unique()
    
    # Create a randomn sample of users for the testing set for each side
    samplesize_contrarian = int(round(random_test_sample_size*len(contrarian_accounts),0))
    test_contrarian = random.sample(list(contrarian_accounts), samplesize_contrarian)
    samplesize_convinced = int(round(random_test_sample_size*len(convinced_accounts),0))
    test_convinced = random.sample(list(convinced_accounts), samplesize_convinced)
    
    # Separate the testing dataset
    trainvalid = pd.concat([data[~data[account_variable].isin(test_contrarian + test_convinced)]])
    test = pd.concat([data[data[account_variable].isin(test_contrarian + test_convinced)]])
    
    # Split training data into training and validation dataset
    train, valid = train_test_split(trainvalid, test_size=valid_sample_size, 
                                    random_state=random_state, shuffle=shuffle)
    print("Training data set with {} posts created.".format(len(train)))
    print("Validation data set with {} posts created.".format(len(valid)))    
    print("Testing data set with {} posts created.".format(len(test)))    

    return train, valid, test

def load_news(folderpath, label, verbose = True):
    """This function loads factive news csv files into a pandas dataframe and assigns them a
    numeric label."""
    temp = []
    for i, file in enumerate(os.listdir(folderpath)):
        if verbose == True:
            print(i, file)
        df_temp = pd.read_csv('/'.join([folderpath, file]), encoding = 'utf-8', lineterminator = '\n', low_memory = False)
        df_temp.drop(columns=['SC', 'CY', 'RE', 'PUB', 'NS', 'CR', 'IPD', 'IPC', 'CT', 'IN', 'RF', 'LA', 'CO', 'ET', 'ED', 'PG'], inplace = True)
        df_temp.rename(columns = {'AN':'accession_number','SE': 'section', 'HD': 'headline','WC':'wordcount', 'PD': 'date','SN': 'source',
                            'LP': 'lead', 'TD': 'text', 'BY':'author'}, inplace=True)
        df_temp["ID"] = df_temp['accession_number'].str.extract("Document (.+?)_\d*")
        df_temp["label"] = label
        temp.append(df_temp)
    df = pd.concat(temp, axis=0, ignore_index=True)
    if verbose == True:
        print("\n")
    return df

In [139]:
# Match the factiva search terms
factiva = "|".join(["climate change", "global warming", "greenhouse gas", "CO2", "carbon dioxide", "extreme weather", "fossil fuel.{1}",
                    "renewable energy", "carbon footprint", "carbon price", "carbon pricing", "carbon tax", "climate change and trade",
                    "climate change and cap", "carbon storage", "IPCC", "Paris Agreement", "sustainable energy", "Green New Deal",
                    "energy efficiency", "green energy", "nuclear power", "solar power", "wind power", "eco-friendly", "GHG", "green washing",
                     "(?:^(?=.*(?:COP12|COP13|COP14|COP15|COP16|COP17|COP18|COP19|COP20|COP21|COP22|COP23|COP24|COP25|COP26))(?=.*climate))",
                    "(?:^(?=.*(?:solution|policy|electr.{1}|energy|ozone|temperature|aerosols|UNFCCC|INDC|methane|G7 summit|sustainab.{1}))(?=.*(?:climate change|global warming|climate)))"]).lower()

# Load data

### Blogs, thinktanks and NGOs

In [None]:
# Load the data
print('---Load the blog posts (by blogs, thinktanks and NGOs)---')
blogs = pd.read_csv('../data/blogs/baby_content_cleanest.csv', encoding = 'utf-8', lineterminator='\n', 
                   usecols = ['org', 'date', 'title', 'url', 'org_type', 'org_side', 'text'])
blogs["label"] = [1 if i=="contrarian" else 0 for i in blogs.org_side]
print("\n", len(blogs[blogs.label == 1]), "contrarian blog posts loaded.")
print("\n", len(blogs[blogs.label == 0]), "convinced blog posts loaded.\n\n")

# Subset the data to articles matching the factiva search terms
print('---Subset the posts matching the factiva search terms---')
blogs = blogs.rename(columns={"org_type": "type"})
blogs['type'] = "blog/thinktank/ngo"
blogs = blogs[blogs['text'].str.lower().str.contains(factiva, regex=True)]
print("\n", len(blogs[blogs.label == 1]), "contrarian blog posts remain.")
print("\n", len(blogs[blogs.label == 0]), "convinced blog posts remain.\n\n")

# Split the data 
print('---Split the data (test data: unseen organisationss)---\n')
blogs_train, blogs_valid, blogs_test = train_valid_test_split(blogs, "org") # Here the held out data is unseen bloggers.

---Load the blog posts (by blogs, thinktanks and NGOs)---

 219535 contrarian blog posts loaded.

 110773 convinced blog posts loaded.


---Subset the posts matching the factiva search terms---


### Newspapers

In [None]:
# Load the data
print('---Load the contrarian news articles---\n')
newspaper_contrarian =  load_news("../data/newspapers/contrarian", label = 1, verbose = False)
print(len(newspaper_contrarian), "contrarian news article sentences loaded.")
print('\n\n---Load the convinced news articles---\n')
newspaper_convinced =  load_news("../data/newspapers/convinced", label = 0, verbose = False)
print(len(newspaper_convinced), "convinced news article sentences loaded.\n\n")

# Subset the data to articles matching the factiva search terms
print('---Subset the posts matching the factiva search terms---')
newspapers = pd.concat([newspaper_contrarian, newspaper_convinced])
newspapers = newspapers.groupby(['headline', 'wordcount', 'date', 'source', 'lead', 'time_frame', 'ID', 'label'])['text'].apply(' '.join).reset_index()
newspapers[newspapers.select_dtypes(['object']).columns] = newspapers[newspapers.select_dtypes(['object']).columns].apply(lambda x: x.str.replace("(.\\'.{2}.\\'.{2})", ""))
newspapers['type'] = "newspaper"
newspapers = newspapers[newspapers['text'].str.lower().str.contains(factiva, regex=True)]
print(len(newspaper_contrarian), "contrarian news article sentences remain.")
print(len(newspaper_convinced), "convinced news article sentences remain.\n\n")

# Split the data
print('---Split the data (test data: unseen newspaper articles)---\n')
newspapers_train, newspapers_valid, newspapers_test = train_valid_test_split(newspapers, "ID") # Here the held out data is unseen newspaper articles.

# Combine & export data

In [8]:
train = pd.concat([blogs_train[['text', 'label', 'type']],
                  newspapers_train[['text', 'label', 'type']]]).reset_index(drop = True)
train

Unnamed: 0,text,label,type
0,The November issue of Environment & Climate Ne...,1,blog/thinktank/ngo
1,This article in the Guardian offers us quite a...,1,blog/thinktank/ngo
2,The University of East Anglia who were at the ...,1,blog/thinktank/ngo
3,"WASHINGTON (April 29, 2008) – Deputy Premier o...",0,blog/thinktank/ngo
4,Climate Change Weekly #83New data from the Nat...,1,blog/thinktank/ngo
...,...,...,...
379865,The three countries have been offered almost a...,0,newspaper
379866,"The US bank JP Morgan Chase , whose economists...",0,newspaper
379867,"The fund, previously known as the Sustainable ...",1,newspaper
379868,Human ingenuity has created a way of living th...,0,newspaper


In [9]:
valid = pd.concat([blogs_valid[['text', 'label', 'type']],
                  twitter_valid[['text', 'label', 'type']],
                  facebook_valid[['text', 'label', 'type']],
                  newspapers_valid[['text', 'label', 'type']]]).reset_index(drop = True)
valid

Unnamed: 0,text,label,type
0,From the Wall Street Journal : \n The letter ...,1,blog/thinktank/ngo
1,Oregon's Prius (hybrid) tax credit could go as...,1,blog/thinktank/ngo
2,"There's an African proverb which says ""when yo...",0,blog/thinktank/ngo
3,The News | News Around 300 coal miners and p...,1,blog/thinktank/ngo
4,March 4th was the birthday or deathday of at l...,1,blog/thinktank/ngo
...,...,...,...
126621,"Back in 2006, when Congress passed a $2,000 cr...",0,newspaper
126622,"The study, which was published in the Nature t...",0,newspaper
126623,Research funding is closely related to publish...,1,newspaper
126624,"Once the rod is spent, it is returned for proc...",1,newspaper


In [10]:
test = pd.concat([blogs_test[['text', 'label', 'type']],
                  twitter_test[['text', 'label', 'type']],
                  facebook_test[['text', 'label', 'type']],
                  newspapers_test[['text', 'label', 'type']]]).reset_index(drop = True)
test

Unnamed: 0,text,label,type
0,I've been fortunate to have met some people wi...,0,blog/thinktank/ngo
1,You may have heard that a draft of the Nationa...,0,blog/thinktank/ngo
2,Climate change will make the drought and flood...,0,blog/thinktank/ngo
3,If you own stocks or have money in a retiremen...,0,blog/thinktank/ngo
4,Investors and the global environment are at ri...,0,blog/thinktank/ngo
...,...,...,...
61137,The ACCR report raises the bar for super funds...,1,newspaper
61138,"And then theres the large, black Puma knapsack...",0,newspaper
61139,A major upgrade of the Vales Point coal-fired ...,1,newspaper
61140,Jane Caro and Diana are only contemporaries in...,1,newspaper


In [11]:
train[['type', 'label']][train.label == 1].type.value_counts(normalize=True)

blog/thinktank/ngo    0.441389
twitter               0.438138
facebook              0.110115
newspaper             0.010358
Name: type, dtype: float64

In [12]:
train[['type', 'label']][train.label == 0].type.value_counts(normalize=True)

twitter               0.678080
blog/thinktank/ngo    0.173496
facebook              0.133583
newspaper             0.014841
Name: type, dtype: float64

In [13]:
train.label.value_counts(normalize=True)

0    0.552181
1    0.447819
Name: label, dtype: float64

In [14]:
# Save the data sets
train.to_csv('../data/climatebabyCARDStrain.csv', index = False, encoding = 'utf-8')
valid.to_csv('../data/climatebabyCARDSvalid.csv', index = False, encoding = 'utf-8')
test.to_csv('../data/climatebabyCARDStest.csv', index = False, encoding = 'utf-8')