In [4]:
import re
import json
from pathlib import Path
from collections import Counter, OrderedDict

import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Input locations (relative to this notebook's working directory).
# Directory that is globbed for one `*.txt` file per article.
ARTICLE_TEXTS_DIR = "../../data/processed/articles/validation//texts/"
# JSON file with per-article labels, loaded as a dict keyed by article id.
ARTICLE_LABELS = "../../data/processed/articles/validation/labels/labels.json"
# Text file read line by line; each line is treated as one article summary.
ARTICLE_SUMMARIES = "../../notebooks/poc/Text summarization with PreSumm/results/results.validation.txt"

# JSON file with the stock matches per article.
ARTICLE_STOCK_MATCHES = "../../data/test/user_study/temp/article_matches_v1.validation.json"
# Output directory: one `<article_id>.json` file is written here per article.
ARTICLE_OUTPUT_DIR = "../../data/test/user_study/articles/validation"

### Sorting out article paths

In [6]:
def get_article_id(path):
    """Return the article id encoded in a file name.

    The id is the part of the file name that precedes the first dot,
    e.g. ``oos_0.txt`` -> ``"oos_0"``.

    Args:
        path: A ``pathlib.Path``-like object exposing a ``name`` attribute.

    Returns:
        str: The file name up to (not including) its first ``"."``.
    """
    article_id, _, _ = path.name.partition(".")
    return article_id

# Collect every article text file, ordered by article id.
# NOTE(review): the ids are compared as strings, so e.g. "oos_10" sorts
# before "oos_2" -- confirm this still matches the order of the other inputs
# if the data set grows beyond ten articles.
text_files = Path(ARTICLE_TEXTS_DIR).glob("*.txt")
article_paths = sorted(text_files, key=get_article_id)

len(article_paths)

10

### ETL article texts

In [7]:
def header_ish(line):
    """Heuristically decide whether a line of text looks like a sub-heading.

    A line is considered heading-like when it starts with a letter, does not
    begin with "Now" or "And", and does not end with ":", ".", a closing
    curly quote or a straight double quote.

    Args:
        line (str): A single line of text.

    Returns:
        bool: ``True`` if the line looks like a heading, ``False`` otherwise
        (including for an empty line).
    """
    # An empty line has no first character to inspect and is never a heading.
    if not line or not line[0].isalpha():
        return False
    # Test the prefix of the whole line; a single character can never start
    # with a three-letter word.
    if line.startswith(("Now", "And")):
        return False
    # Lines ending like a sentence, a quotation or a list intro are body text.
    if line.endswith((":", ".", "”", "\"")):
        return False
    return True


def htmlize_raw_text(path, consider_ltc_below=15):
    """Convert a raw article text file into a list of HTML-tagged lines.

    Blank lines are dropped. The first line of the file becomes an ``<h3>``
    title. From the fourth line of the file onwards, a line with fewer than
    ``consider_ltc_below`` space-separated tokens that ``header_ish`` accepts
    becomes an ``<h6>`` sub-heading. Every other line becomes a ``<p>``.

    Args:
        path: Path of the UTF-8 encoded article text file.
        consider_ltc_below (int): Line token count (``ltc``) below which a
            line is considered as a sub-heading candidate.

    Returns:
        list[str]: The non-empty lines of the file, each wrapped in one tag.
    """
    html_lines = []
    with open(path, "rt", encoding="utf-8") as article_file:
        for line_no, raw_line in enumerate(article_file):
            text = raw_line.replace("\n", "").strip()
            if not text:
                # Blank lines are skipped but still count towards `line_no`.
                continue

            token_count = len(text.split(" "))
            if line_no == 0:
                opening, closing = "<h3>", "</h3>"
            elif line_no > 2 and token_count < consider_ltc_below and header_ish(text):
                opening, closing = "<h6>", "</h6>"
            else:
                opening, closing = "<p>", "</p>"

            html_lines.append(opening + text + closing)
    return html_lines

In [8]:
# Lines with fewer tokens than this are candidates for sub-headings.
consider_ltc_below = 15  # `ltc` stands for line token count

# Map article id -> list of HTML lines. The threshold is passed explicitly so
# that changing it above actually affects the conversion.
articles = dict()
for path in article_paths:
    articles[get_article_id(path)] = htmlize_raw_text(
        path, consider_ltc_below=consider_ltc_below
    )

articles.keys()

dict_keys(['oos_0', 'oos_1', 'oos_2', 'oos_3', 'oos_4', 'oos_5', 'oos_6', 'oos_7', 'oos_8', 'oos_9'])

In [9]:
# Spot-check the HTML lines generated for the article with id "oos_0".
articles["oos_0"]

['<h3>Bezos’ Blue Origin loses NASA lawsuit over SpaceX $2.9 billion lunar lander contract</h3>',
 '<p>KEY POINTS</p>',
 '<p>1. The U.S. Court of Federal Claims ruled against Jeff Bezos’ Blue Origin in the company’s lawsuit against NASA over a $2.9 billion lunar lander contract awarded to SpaceX.</p>',
 '<p>2. Federal judge Richard Hertling sided with the defense in his ruling, completing a months-long battle.</p>',
 '<p>3. A Blue Origin spokesperson said in a statement that the company’s lawsuit “highlighted the important safety issues with the Human Landing System procurement process that must still be addressed.”</p>',
 '<p>4. Musk, in a tweet replying to CNBC’s report on the ruling, posted a photo from the 2012 movie “Dredd.”</p>',
 '<p>The U.S. Court of Federal Claims ruled against Jeff Bezos’ Blue Origin on Thursday in the company’s lawsuit versus NASA over a lucrative astronaut lunar lander contract awarded to Elon Musk’s SpaceX earlier this year.</p>',
 '<p>Federal Judge Richar

### ETL article labels

In [10]:
def read_json_file(filename):
    """Load and return the parsed contents of a JSON file.

    The file is decoded as UTF-8 explicitly, matching the other readers in
    this notebook, so that non-ASCII characters are read the same way on
    every platform.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (e.g. a dict or a list).
    """
    with open(filename, "r", encoding="utf-8") as fp:
        return json.load(fp)

In [11]:
# Load the per-article labels and list their keys to check which articles
# are covered.
article_labels = read_json_file(ARTICLE_LABELS)
article_labels.keys()

dict_keys(['oos_O', 'oos_1', 'oos_2', 'oos_3', 'oos_4', 'oos_5', 'oos_6', 'oos_7', 'oos_8', 'oos_9'])

### ETL article summaries

In [12]:
def read_txt_lines(filename):
    """Read a UTF-8 text file and return its lines.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: One entry per line, each keeping its trailing newline.
    """
    with open(filename, "rt", encoding="utf-8") as text_file:
        return text_file.readlines()
    
    
def replace_tokens(summary, token_pairs):
    """Apply a sequence of substring replacements to a summary.

    The replacements are applied one after another in the given order, so a
    later pair operates on the result of the earlier ones.

    Args:
        summary (str): The text to clean up.
        token_pairs: Iterable of ``(old, new)`` pairs.

    Returns:
        str: The text after all replacements have been applied.
    """
    cleaned = summary
    for pair in token_pairs:
        target, substitute = pair
        cleaned = cleaned.replace(target, substitute)
    return cleaned

In [13]:
# Clean-up substitutions, applied in this order to every raw summary line:
# "<q>" markers become ". ", then ".." and double spaces are each replaced
# once (single pass), and finally newline characters are removed.
token_pairs = [
    ("<q>", ". "),
    ("..", "."),
    ("  ", " "),
    ("\n", ""),
]
summaries_raw = read_txt_lines(ARTICLE_SUMMARIES)
summaries = list(
    map(lambda raw_summary: replace_tokens(raw_summary, token_pairs), summaries_raw)
)

# Compare one raw summary with its cleaned-up version.
print(summaries_raw[3], "\n")
print(summaries[3])

The Reddit revolt: GameStop and the impact of social media on institutional investors<q>The Reddit revolution in the US has drawn attention to the potential power that a growing force of retail investors can wield in stock markets when equipped by social media.<q>Amateur investors have increasingly engaged with retail platforms in the last year, partly due to the pandemic leaving them idol at home, but also due to the newfound onslaught of information through social media and access to the market through retail brokerages and platforms such as Robinhood.
 

The Reddit revolt: GameStop and the impact of social media on institutional investors. The Reddit revolution in the US has drawn attention to the potential power that a growing force of retail investors can wield in stock markets when equipped by social media. Amateur investors have increasingly engaged with retail platforms in the last year, partly due to the pandemic leaving them idol at home, but also due to the newfound onslaugh

### ETL article stock matches

In [14]:
# NOTE(review): this repeats the `read_json_file` definition from the
# "ETL article labels" section; it is kept identical so that either cell
# yields the same function.
def read_json_file(filename):
    """Load and return the parsed contents of a JSON file.

    The file is decoded as UTF-8 explicitly, matching the other readers in
    this notebook, so that non-ASCII characters are read the same way on
    every platform.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (e.g. a dict or a list).
    """
    with open(filename, "r", encoding="utf-8") as fp:
        return json.load(fp)

In [15]:
# Load the stock matches per article and display them for inspection.
# NOTE(review): the displayed keys look like positional indices stored as
# strings ('0', ...) rather than article ids -- confirm against the file.
article_stock_matches = read_json_file(ARTICLE_STOCK_MATCHES)
article_stock_matches

{'0': [{'index': 731,
   'stock_name': 'Virgin Galactic',
   'ticker_symbol': 'SPCE',
   'sector': 'Electronic Technology',
   'industry': 'Aerospace & Defense',
   'comment': "Virgin Galactic (VG) is an American spaceflight company founded by Richard Branson and his British Virgin Group retains an 18% stake through Virgin Investments Limited. It is headquartered in California, USA, and operates from New Mexico. The company is developing commercial spacecraft and aims to provide suborbital spaceflights to space tourists. Virgin Galactic's suborbital spacecraft are air launched from beneath a carrier airplane known as White Knight Two. Virgin Galactic‘s maiden spaceflight occurred in 2018 with its VSS Unity spaceship. Branson had originally hoped to see a maiden spaceflight by 2010, but the date was delayed for several years, primarily due to the October 2014 crash of VSS Enterprise.",
   'score': 0.7157517413857688},
  {'index': 434,
   'stock_name': 'Lockheed Martin',
   'ticker_symbo

### Gathering and joining data

In [21]:
# The four sources are joined purely by position (i-th article text, i-th
# summary, i-th label entry, i-th stock match entry). `zip` stops at the
# shortest input, so fail loudly instead of silently dropping articles when
# another source has fewer entries than there are article texts.
source_sizes = {
    "summaries": len(summaries),
    "article_labels": len(article_labels),
    "article_stock_matches": len(article_stock_matches),
}
too_short = {name: size for name, size in source_sizes.items() if size < len(articles)}
if too_short:
    raise ValueError(
        f"Cannot join {len(articles)} articles: sources with fewer entries: {too_short}"
    )

article_data = dict()
for article_contents, summary, (key, labels), stocks in (zip(articles.values(), summaries, 
                                                  article_labels.items(), article_stock_matches.values())):
    # The output key is the key used in the labels file, not the id derived
    # from the article text file name.
    idx = key
    article_data[idx] = dict(labels)  # copy so the loaded labels stay untouched
    article_data[idx]["contents"] = article_contents
    article_data[idx]["summary"] = summary
    article_data[idx]["stock_matches"] = stocks

In [24]:
# Spot-check one joined record.
# NOTE(review): the key here is 'oos_O' (capital letter O), as listed among
# the labels keys, whereas the article text ids use 'oos_0' (digit zero) --
# confirm whether the labels file contains a typo.
article_data['oos_O']

{'title': 'Bezos’ Blue Origin loses NASA lawsuit over SpaceX $2.9 billion lunar lander contract',
 'url': 'https://www.cnbc.com/2021/11/04/bezos-blue-origin-loses-lawsuit-against-nasa-over-spacex-lunar-lander.html',
 'type': 'out of sample',
 'contents': ['<h3>Bezos’ Blue Origin loses NASA lawsuit over SpaceX $2.9 billion lunar lander contract</h3>',
  '<p>KEY POINTS</p>',
  '<p>1. The U.S. Court of Federal Claims ruled against Jeff Bezos’ Blue Origin in the company’s lawsuit against NASA over a $2.9 billion lunar lander contract awarded to SpaceX.</p>',
  '<p>2. Federal judge Richard Hertling sided with the defense in his ruling, completing a months-long battle.</p>',
  '<p>3. A Blue Origin spokesperson said in a statement that the company’s lawsuit “highlighted the important safety issues with the Human Landing System procurement process that must still be addressed.”</p>',
  '<p>4. Musk, in a tweet replying to CNBC’s report on the ruling, posted a photo from the 2012 movie “Dredd.”<

In [28]:
# Write one JSON file per article into the output directory.
output_path = Path(ARTICLE_OUTPUT_DIR)
# Create missing parent directories too, so a fresh checkout can run this cell.
output_path.mkdir(parents=True, exist_ok=True)

for article_id, data in article_data.items():
    output_file = output_path / f"{article_id}.json"
    # `ensure_ascii=False` emits non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of relying on the platform default.
    with open(output_file, "w", encoding="utf-8") as fp:
        json.dump(data, fp, ensure_ascii=False, indent="\t")