In [32]:
import re
import json
from pathlib import Path
from collections import Counter, OrderedDict

import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
ARTICLE_TEXTS_DIR = "../../data/processed/articles/test/texts/"
ARTICLE_LABELS = "../../data/processed/articles/test/labels/labels.jsonl"
ARTICLE_SUMMARIES = "../../notebooks/poc/Text summarization with PreSumm/results/bertsumext_cnndm.extractive.default.txt"
ARTICLE_STOCK_MATCHES = "../../data/test/user_study/temp/article_matches_v2.json"
ARTICLE_OUTPUT_DIR = "../../data/test/user_study/articles"

ARTICLES_TO_IGNORE = [2, 13, 16, 18, 21, 25, 30, 31, 33, 38, 40, 42, 45, 47, 48, 49]

### Sorting out article paths

In [34]:
def get_article_id(path):
    return int(path.name.split(".")[0])


article_paths = sorted(list(Path(ARTICLE_TEXTS_DIR).glob("*.txt")), 
                       key=get_article_id)

article_paths = [path for path in article_paths 
                 if get_article_id(path) not in ARTICLES_TO_IGNORE]

len(article_paths)

36

### ETL article texts

In [35]:
def header_ish(line):
    if not line[0].isalpha():
        return False
    if line[0].startswith("Now") or line[0].startswith("And"):
        return False
    if line[-1].endswith(":") or line[-1].endswith(".") or line[-1].endswith("”") or line[-1].endswith("\""):
        return False
    return True


def htmlize_raw_text(path, consider_ltc_below=15):
    article_html = list()
    with open(path, "rt", encoding="utf-8") as fp:   
        for j, line in enumerate(fp):
            line = line.replace("\n", "").strip()
            tokens = line.split(" ")
            if tokens != [""]:
                if j == 0:
                    line = "<h3>" + line + "</h3>"
                elif j not in [0, 1, 2] and len(tokens) < consider_ltc_below and header_ish(line):
                    line = "<h6>" + line + "</h6>"
                else:
                    line = "<p>" + line + "</p>"

                article_html.append(line)
    return article_html

In [22]:
consider_ltc_below = 15  # `ltc` stands for line token count

articles = dict()
for path in article_paths:
    if get_article_id(path) not in ARTICLES_TO_IGNORE:            
        articles[get_article_id(path)] = htmlize_raw_text(path)

In [23]:
articles.keys()

dict_keys([0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 19, 20, 22, 23, 24, 26, 27, 28, 29, 32, 34, 35, 36, 37, 39, 41, 43, 44, 46, 50, 51])

### ETL article labels

In [24]:
def read_jsonl_file(filename):
    objects = list()
    with open(filename, "r", encoding="utf-8") as fp:
        for line in fp:
            obj = json.loads(line)
            objects.append(obj)
    return objects


def filter_keys(labels, keys):
    labels_filtered = list()
    for label in labels:
        filtered_dict = {k:v for k,v in label.items() if k in keys}
        labels_filtered.append(filtered_dict)
    return labels_filtered

In [25]:
relevant_keys = ["doc_id", "title", "url", "company", "ticker_top"]
article_labels = filter_keys(read_jsonl_file(ARTICLE_LABELS), relevant_keys)
article_labels = [label for label in article_labels
                  if label["doc_id"] not in ARTICLES_TO_IGNORE]

article_labels

[{'doc_id': 0,
  'title': 'AT&T CFO Pascal Desroches Updates Shareholders',
  'company': 'AT&T',
  'url': 'https://finance.yahoo.com/news/t-cfo-pascal-desroches-updates-003500023.html',
  'ticker_top': ['T']},
 {'doc_id': 1,
  'title': 'Ford to build $11.4 billion mega campuses for electric car production',
  'company': 'Ford',
  'url': 'https://finance.yahoo.com/news/ford-to-build-two-114-billion-mega-campuses-for-electric-car-production-230016248.html',
  'ticker_top': ['F']},
 {'doc_id': 3,
  'title': 'NIO: Consumers Will Soon Realize NIO Has What Other EVs Are Missing, BaaS',
  'company': 'NIO',
  'url': 'https://seekingalpha.com/article/4457311-nio-has-what-other-evs-are-missing-baas',
  'ticker_top': ['NIO']},
 {'doc_id': 4,
  'title': 'EV Competition Has Arrived: Does Tesla Have The Answer?',
  'company': 'Tesla',
  'url': 'https://seekingalpha.com/article/4457328-ev-competition-has-arrived-does-tesla-have-answer',
  'ticker_top': ['TSLA']},
 {'doc_id': 5,
  'title': 'Better Bee

### ETL article summaries

In [26]:
def read_txt_lines(filename):
    with open(filename, "rt", encoding="utf-8") as fp:
        return [line for line in fp]
    
    
def replace_tokens(summary, token_pairs):
    for (old, new) in token_pairs:
        summary = summary.replace(old, new)
    return summary

In [27]:
token_pairs = [("<q>", ". "), ("..", "."), ("  ", " "), ("\n", "")]
summaries_raw = read_txt_lines(ARTICLE_SUMMARIES)
summaries = [replace_tokens(summary, token_pairs) for summary in summaries_raw]
summaries = [summary for i, summary in enumerate(summaries)
             if i not in ARTICLES_TO_IGNORE]

print(summaries_raw[3], "\n")
print(summaries[3])

summaries

Their battery as a service technology solves a lot of short and long-term problems with EVs and cars in general.<q>NIO is a leading player in the Chinese electric vehicle market and their unique technology grants it several long-term advantages over competitors.<q>China has been on a legislative tear recently, hurting many domestic companies. However, NIO's mission lines up with many Chinese goals and is likely to enjoy preferential treatment.
 

Tesla trades at a high 20x P/S and is overvalued. Summary. EV Competition Has Arrived: Does Tesla Have The Answer?


['AT&T CFO Pascal Desroches Updates Shareholders. DALLAS, September 15, 2021--(BUSINESS WIRE)--Pascal Desroches, senior executive vice president and chief financial officer of AT&T Inc.* (NYSE:T), spoke today at the Bank of America Media, Communications & Entertainment Conference, where he provided an update to shareholders. AT&T also continues to experience healthy demand for HBO Max in both domestic and international markets. The company recently announced its plans to expand into six European countries next month with plans to launch in at least 14 additional territories in Europe in 2022. As previously indicated, AT&T expects most of the subscriber growth in the second half of the year to come from outside the United States due to the strategic decision to cease offering HBO Max as a subscription on Amazon Channels. The company anticipates this decision will likely impact total HBO Max / HBO domestic subscribers and net additions in the third quarter. However, Desroches reiterated 

### ETL article stock matches

In [28]:
def read_json_file(filename):
    with open(filename, "r") as fp:
        return json.load(fp)

In [29]:
article_stock_matches = read_json_file(ARTICLE_STOCK_MATCHES)
article_stock_matches

{'0': [{'index': 439,
   'stock_name': 'Lumen Technologies',
   'ticker_symbol': 'LUMN',
   'sector': 'Communications',
   'industry': 'Specialty Telecommunications',
   'comment': 'Lumen Technologies (formerly CenturyLink) is an American telecommunications company headquartered in Monroe, Louisiana, that offers communications, network services, security, cloud solutions, voice, and managed services. The company is a member of the S&P 500 index and the Fortune 500. Its communications services include local and long-distance voice, broadband, Multi-Protocol Label Switching (MPLS), private line (including special access), Ethernet, hosting (including cloud hosting and managed hosting), data integration, video, network, public access, Voice over Internet Protocol (VoIP), information technology, and other ancillary services. Lumen also serves global enterprise customers across North America, Latin America, EMEA (Europe, Middle East, and Africa), and Asia Pacific.',
   'score': 0.7538793208

### Gathering and joining data

In [36]:
article_data = dict()
for article_contents, summary, labels, stocks in (zip(articles.values(), summaries, 
                                                  article_labels, article_stock_matches.values())):
    idx = labels["doc_id"]
    article_data[idx] = dict(labels)
    article_data[idx]["contents"] = article_contents
    article_data[idx]["summary"] = summary
    article_data[idx]["stock_matches"] = stocks

In [37]:
output_path = Path(ARTICLE_OUTPUT_DIR)

for article_id, data in article_data.items():
    output_file = output_path / f"{article_id}.json"
    with open(output_file, "w") as fp:
        json.dump(data, fp, ensure_ascii=False, indent="\t")