ACL2022 - Analysis and data processing of AllSides ARTICLE-LEVEL data

In [5]:
import jsonlines
from collections import Counter, defaultdict
import pandas as pd
import json
import random

from nltk import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from rouge import Rouge
rouge = Rouge()

from sklearn.model_selection import train_test_split


In [10]:
import re

porter = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

def preprocess_text(text, join_again=True):
    """Normalise ``text`` for lexical comparison.

    Steps, in order: rewrite "U.S." as "USA" (done before punctuation is
    stripped, so it ends up as "usa" rather than "us"), lowercase, delete every
    character that is neither a word character nor whitespace, tokenize with
    ``word_tokenize`` and lemmatize each token with ``wordnet_lemmatizer``.

    Returns the lemmas joined by single spaces when ``join_again`` is truthy,
    otherwise the list of lemmas.
    """
    normalised = re.sub(r'[^\w\s]', '', text.replace("U.S.", "USA").lower())
    lemmas = list(map(wordnet_lemmatizer.lemmatize, word_tokenize(normalised)))
    return " ".join(lemmas) if join_again else lemmas

def preprocess_texts(text_list):
    """Run ``preprocess_text`` (default arguments) on each item and return the results as a list."""
    return list(map(preprocess_text, text_list))
# print(preprocess_text(s))

In [11]:
def remove_dotdotdot(article_text):
    """Drop every sentence that contains ``...`` from ``article_text``.

    The text is split with ``sent_tokenize``; the surviving sentences are
    re-joined with single spaces.
    """
    kept_sentences = []
    for sentence in sent_tokenize(article_text):
        if '...' in sentence:
            continue
        kept_sentences.append(sentence)
    return " ".join(kept_sentences)

### 0. Load crawled data & preprocess (can be skipped if you use the file we provide)

In [12]:
# Read every record of the crawled JSON-lines file into a list.
# Alternative input without round-ups:
# with jsonlines.open('data/all_crawled_combined_filtered_LeftCenterRight_WITHOUT_Roundup.jsonl') as reader:
with jsonlines.open('data/all_crawled_combined_filtered_RoundupLeftCenterRight.jsonl') as reader:
    all_objs = list(reader)
len(all_objs)

FileNotFoundError: [Errno 2] No such file or directory: 'data/all_crawled_combined_filtered_RoundupLeftCenterRight.jsonl'

In [None]:
# Newer crawl used for the ACL 2022 ARR version: data/headline_11_2021-10-19_crawled_all.jsonl
with jsonlines.open('data/headline_11_2021-10-19_crawled_all.jsonl') as reader:
    new_objs = list(reader)
len(new_objs)

In [None]:
# Keep only records with a non-empty round-up and exactly three news
# articles (presumably one each for left/center/right).
filterd_new_objs = [
    record
    for record in new_objs
    if record['roundup'] != [] and len(record['news']) == 3
]
len(filterd_new_objs)

In [None]:
# Merge the newly crawled, filtered records into the main list (in place).
# NOTE: this cell is not idempotent - re-running it appends the same records again.
all_objs.extend(filterd_new_objs)

In [None]:
len(all_objs)

Filter by Rouge

In [None]:
# NOTE(review): `lefts`, `rights`, `centers`, `issues`, `topics` and `roundups` are not
# defined in any visible cell. They have to be parallel sequences (presumably derived
# from `all_objs`) created before this cell runs; on a fresh kernel it raises NameError.

# Minimum ROUGE-1 recall between the three concatenated article titles and the issue title.
ROUGE1_RECALL_MIN = 0.3
# Issue titles containing any of these substrings (case-insensitive) are dropped.
EXCLUDED_ISSUE_KEYWORDS = ("reaction", "follow-up", "perspective")
# Issue titles must have MORE than this many space-separated tokens.
MIN_ISSUE_TOKENS_EXCLUSIVE = 3

print("Before Filtering: ", len(lefts))

filtered_obj = []

for left, right, center, issue, topic, roundup in zip(lefts, rights, centers, issues, topics, roundups):
    combined_title = " ".join([left['newsTitle'], right['newsTitle'], center['newsTitle']])
    single_rouge_score = rouge.get_scores(preprocess_text(combined_title), preprocess_text(issue))
    rouge1_recall_score = single_rouge_score[0]['rouge-1']['r']

    # filter step 1: drop pairs whose ROUGE-1 recall is below the threshold.
    # Compared numerically: the previous string comparison on str(score)[:3]
    # mis-sorted values printed in scientific notation (e.g. '1e-' >= '0.3').
    if rouge1_recall_score < ROUGE1_RECALL_MIN:
        continue

    # filter step 2: keyword-based removal
    issue_lower = issue.lower()
    if any(keyword in issue_lower for keyword in EXCLUDED_ISSUE_KEYWORDS):
        continue

    # filter step 3: remove very short issue titles
    if len(issue.split(" ")) <= MIN_ISSUE_TOKENS_EXCLUSIVE:
        continue

    filtered_obj.append({
        "left": left,
        "right": right,
        "center": center,
        "issue": issue,
        "topic": topic,
        "roundup": roundup
    })

print("After Filtering: ", len(filtered_obj))


In [None]:
'''
    OBJ FORMAT:
    {
        "left": left, 
        "right": right,
        "center": center, 
        "issue": issue,
        "topic": topic, 
        "roundup": roundup
    }

'''
# Whitespace-delimited tokens used to detect round-up paragraphs that mention
# outlet polarity or coverage. Matching is done on whole lowercase tokens via
# FILTER_WORDS_set, so e.g. 'left-' only matches a token that is exactly 'left-',
# and the punctuation variants ('coverage,', 'coverage.') are listed explicitly.
FILTER_WORDS = (
    ['left-', 'right-', 'right-rated', 'left-rated']
    + ['coverage', 'coverage,', 'coverage.']
    + ['outlets', 'outlets,']
    + ['left-leaning', 'right-leaning', 'reporting']
)
FILTER_WORDS_set = {*FILTER_WORDS}

In [None]:
# True: drop non-leading round-up paragraphs that contain a FILTER_WORDS_set token
# (or consist only of a non-breaking space). False: keep every paragraph.
# NOTE: previously False silently discarded all non-leading paragraphs.
is_filtering_polarity_mention = True
# True: echo every non-leading paragraph that is kept.
do_print = False

PREPROCESSED_FILTERED_OBJ = []
for obj in filtered_obj:
    # Skip entries with no round-up or with more than three paragraphs.
    if len(obj['roundup']) == 0 or len(obj['roundup']) > 3:
        continue

    # The first paragraph is always kept; the rest are filtered individually.
    first_paragraph, *other_paragraphs = obj['roundup']
    updated_roundup = [first_paragraph]
    for paragraph in other_paragraphs:
        if is_filtering_polarity_mention:
            tokens = set(paragraph.lower().split())
            if tokens & FILTER_WORDS_set or paragraph == '\xa0':
                continue
        if do_print:
            print(paragraph, "\n")
        updated_roundup.append(paragraph)

    # Build a new dict instead of mutating the entry of `filtered_obj`, so the
    # input list is left untouched and the cell can be re-run safely.
    PREPROCESSED_FILTERED_OBJ.append(dict(obj, roundup=updated_roundup))
                    
#     print()
#     if idx % 2000 == 0 and idx > 1000:
#         break

In [None]:
len(PREPROCESSED_FILTERED_OBJ)

In [None]:
PREPROCESSED_FILTERED_OBJ[0]

## 1. Format and Save PREPROCESSED_FILTERED_OBJ

Instruction: To use your own custom dataset, replace `PREPROCESSED_FILTERED_OBJ` with your own list of objects in the same format.

In [None]:
# Load the provided pre-filtered dataset. This overwrites any
# PREPROCESSED_FILTERED_OBJ built by the cells of section 0.
# The file is decoded as UTF-8 explicitly so the result does not depend on
# the platform's default text encoding.
with open("data/acl2022_filtered_allsides_article.json", "r", encoding="utf-8") as infile:
    PREPROCESSED_FILTERED_OBJ = json.load(infile)

### Version 1: standard format

In [None]:
'''
    ######### Format ########
    
    SOURCE: body_text_1 [SEP] body_text_2 [SEP] body_text_3
    
    TARGET: target_text
    
'''

In [None]:
def create_source_target_with_processing(objs, phase, append=False, seed=None):
    """Write the standard-format source/target files for one data split.

    For every object, the left/right/center ``newBody`` lists are joined with
    spaces, passed through ``remove_dotdotdot``, shuffled into a random order
    and joined with `` [SEP] `` to form one source line. The target line is the
    space-joined ``roundup``. Newlines are removed so that each example
    occupies exactly one line. The shuffled orientation order (e.g.
    ``R [SEP] L [SEP] C``) is written to a separate, line-aligned file.

    Args:
        objs: iterable of dicts with ``left``/``right``/``center`` (each having
            a ``newBody`` list of strings) and ``roundup`` (list of strings).
        phase: split name used in the output file names (e.g. ``'train'``).
        append: if False (default) existing output files are overwritten, so
            re-running the notebook does not duplicate examples. Pass True to
            get the previous behaviour of appending to existing files.
        seed: optional seed for the per-example shuffle. ``None`` (default)
            uses the global ``random`` state, as before.

    Output files:
        data/acl2022_lrc_roundup_random_order/<phase>.source
        data/acl2022_lrc_roundup_random_order/<phase>.target
        data/acl2022_lrc_roundup_random_order.source_order.<phase>.txt
    """
    import os

    target_path = 'data/acl2022_lrc_roundup_random_order/{}.target'.format(phase)
    source_path = 'data/acl2022_lrc_roundup_random_order/{}.source'.format(phase)
    order_path = 'data/acl2022_lrc_roundup_random_order.source_order.{}.txt'.format(phase)

    # Opening the files fails if the output directory is missing.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    mode = "a" if append else "w"

    # Open each file once for the whole split instead of once per example.
    with open(target_path, mode, encoding="utf-8") as target_file, \
            open(source_path, mode, encoding="utf-8") as source_file, \
            open(order_path, mode, encoding="utf-8") as order_file:
        for obj in objs:
            all_bodies = [
                (label, remove_dotdotdot(" ".join(obj[side]['newBody'])))
                for label, side in (('L', 'left'), ('R', 'right'), ('C', 'center'))
            ]
            # Shuffle the order of political orientations in the source document.
            shuffle(all_bodies)

            source = " [SEP] ".join(body for _, body in all_bodies).replace("\n", "")
            source_order_string = " [SEP] ".join(label for label, _ in all_bodies)
            target = " ".join(obj['roundup']).replace("\n", "")

            target_file.write(target + "\n")
            source_file.write(source + "\n")
            order_file.write(source_order_string + "\n")


In [None]:
# for allsides
# split all_objs into train/val/test
article_train, article_not_train = train_test_split(PREPROCESSED_FILTERED_OBJ, test_size=0.2, random_state=42)
article_val, article_test = train_test_split(article_not_train, test_size=0.5, random_state=42)

In [None]:
# Write the standard-format source/target/order files for each split.
create_source_target_with_processing(article_train, 'train')

create_source_target_with_processing(article_val, 'val')

create_source_target_with_processing(article_test, 'test')

### Version 2: probe format (TITLE=> title_content. ARTICLE=> article_content)

In [14]:
'''
    ######### Format ########
    
    SOURCE: TITLE=> title_text_1. ARTICLE=> body_text_1. [SEP] 
            TITLE=> title_text_2. ARTICLE=> body_text_2. [SEP] 
            TITLE=> title_text_3. ARTICLE=> body_text_3.
    
    TARGET: TITLE=> target_title. ARTICLE=> target_body_text.
    
'''

In [None]:
def create_source_target_with_processing_probe_format(objs, phase, append=False, seed=None):
    """Write the probe-format source/target files for one data split.

    Each article is rendered as ``TITLE=> <newsTitle>. ARTICLE=> <body>``, where
    the body is the space-joined ``newBody`` passed through ``remove_dotdotdot``.
    The three articles are shuffled and joined with `` [SEP] `` to form one
    source line. The target line is
    ``TITLE=> <issue>. ARTICLE=> <space-joined roundup>``. Newlines are removed
    from both lines so that each example occupies exactly one line.

    NOTE: unlike ``create_source_target_with_processing``, the shuffled
    orientation order is not saved anywhere.

    Args:
        objs: iterable of dicts with ``left``/``right``/``center`` (each having
            ``newsTitle`` and a ``newBody`` list of strings), ``issue`` and
            ``roundup`` (list of strings).
        phase: split name used in the output file names (e.g. ``'train'``).
        append: if False (default) existing output files are overwritten, so
            re-running the notebook does not duplicate examples. Pass True to
            get the previous behaviour of appending to existing files.
        seed: optional seed for the per-example shuffle. ``None`` (default)
            uses the global ``random`` state, as before.

    Output files:
        data/lrc_roundup_random_order_probe/<phase>.source
        data/lrc_roundup_random_order_probe/<phase>.target
    """
    import os

    target_path = 'data/lrc_roundup_random_order_probe/{}.target'.format(phase)
    source_path = 'data/lrc_roundup_random_order_probe/{}.source'.format(phase)

    # Opening the files fails if the output directory is missing.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    mode = "a" if append else "w"

    # Open each file once for the whole split instead of once per example.
    with open(target_path, mode, encoding="utf-8") as target_file, \
            open(source_path, mode, encoding="utf-8") as source_file:
        for obj in objs:
            articles = [
                (obj[side]['newsTitle'], remove_dotdotdot(" ".join(obj[side]['newBody'])))
                for side in ('left', 'right', 'center')
            ]
            shuffle(articles)

            source = " [SEP] ".join(
                "TITLE=> {}. ARTICLE=> {}".format(title, body) for title, body in articles
            ).replace("\n", "")

            roundup = " ".join(obj['roundup'])
            # Strip newlines from the whole target (issue title included);
            # a newline in the issue would otherwise misalign source and target.
            target = "TITLE=> {}. ARTICLE=> {}".format(obj['issue'], roundup).replace("\n", "")

            target_file.write(target + "\n")
            source_file.write(source + "\n")



In [None]:
# for allsides
# split all_objs into train/val/test
article_train, article_not_train = train_test_split(PREPROCESSED_FILTERED_OBJ, test_size=0.2, random_state=42)
article_val, article_test = train_test_split(article_not_train, test_size=0.5, random_state=42)

In [None]:
# Write the probe-format source/target files for each split.
create_source_target_with_processing_probe_format(article_train, 'train')

create_source_target_with_processing_probe_format(article_val, 'val')

create_source_target_with_processing_probe_format(article_test, 'test')