In [1]:
import json
from pathlib import Path
from functools import reduce

import pandas as pd

from IPython.display import display

In [2]:
ARTICLES_DIR = "../../data/test/user_study/articles/"
STOCKS_PATH = "../../data/processed/stocks/revolut.2021-07-05.complete.stocks.jsonl"
RATINGS_PATH = "../../data/temp/dataclips_gdnjjptpwsyunzmhlqhqsodmmqda.json"

###  Sorting out paths

In [3]:
def key(path):
    stem = path.stem
    if stem.startswith("oos"):
        return int(stem[-1])
    else:
        return int(stem)

In [4]:
# Sorting out paths
article_paths = sorted(list(Path(ARTICLES_DIR).glob("*.json")), key=key)
article_paths

[PosixPath('../../data/test/user_study/articles/0.json'),
 PosixPath('../../data/test/user_study/articles/oos_0.json'),
 PosixPath('../../data/test/user_study/articles/1.json'),
 PosixPath('../../data/test/user_study/articles/oos_1.json'),
 PosixPath('../../data/test/user_study/articles/oos_2.json'),
 PosixPath('../../data/test/user_study/articles/3.json'),
 PosixPath('../../data/test/user_study/articles/oos_3.json'),
 PosixPath('../../data/test/user_study/articles/4.json'),
 PosixPath('../../data/test/user_study/articles/oos_4.json'),
 PosixPath('../../data/test/user_study/articles/oos_5.json'),
 PosixPath('../../data/test/user_study/articles/5.json'),
 PosixPath('../../data/test/user_study/articles/oos_6.json'),
 PosixPath('../../data/test/user_study/articles/6.json'),
 PosixPath('../../data/test/user_study/articles/oos_7.json'),
 PosixPath('../../data/test/user_study/articles/7.json'),
 PosixPath('../../data/test/user_study/articles/8.json'),
 PosixPath('../../data/test/user_study/a

### Inverting articles dict for relevant articles

In [5]:
def read_json(path):
    with open(path, "r") as fp:
        return json.load(fp)

In [6]:
articles = dict()
stocks = dict()

for path in article_paths:
    
    article = read_json(path)
    articles[article["doc_id"]] = article
    
    for stock in article["stock_matches"]:
        ticker = stock["ticker_symbol"]
        score = stock.pop("score")
        
        if ticker not in stocks.keys():    
            stocks[ticker] = stock
            stocks[ticker]["relevant_articles"] = list()
            
        article_match = {k:v for k,v in article.items() if k in ["doc_id", "title", "url", "summary"]}
        article_match["score"] = score
        stocks[ticker]["relevant_articles"].append(article_match)
        
        

In [7]:
for key, values in stocks.items():
    print("{}\t{}".format(key,len(values["relevant_articles"])))

LUMN	1
VG	1
CMCSA	1
CABO	1
VIACA	1
ROKU	1
ATUS	1
VZ	1
CHTR	1
GRPN	1
SPCE	2
LMT	2
WORK	2
KTOS	1
LI	4
PTRA	3
CVNA	1
NRG	5
SO	2
DTE	1
ARVL	1
F	1
GLW	1
AEP	2
CNP	1
TW	2
CME	1
HES	3
NMR	2
MPC	2
AR	2
NOV	2
KKR	2
YPF	2
EQNR	3
BEP	2
RIO	3
TECK	3
COP	2
CVE	2
ESI	1
SRE	2
BHP	2
PCG	1
XPEV	4
NIO	3
TSLA	2
BLK	1
GME	1
NDAQ	5
BABA	5
GS	1
COIN	1
MS	1
UBS	1
IBKR	1
FB	2
TM	2
XLNX	5
HIMX	3
DIDI	3
NAVI	1
BNTX	3
NVAX	2
GSK	1
TEVA	1
SNY	1
DXC	1
AZN	2
ABBV	1
ARCT	1
MRNA	3
BUD	1
ABEV	2
KDP	1
KHC	1
STZ	1
HPE	1
YETI	1
MAR	2
YUMC	1
VFC	2
ARNC	1
HAL	1
OLN	1
H	2
TEAM	1
RCL	2
NCLH	2
CCL	3
NOC	1
HEI	1
SAVE	1
JBLU	1
AMC	1
LEVI	1
RL	2
REAL	1
URBN	1
GM	1
IP	1
BP	1
GGB	1
EDU	3
TME	3
TAL	1
CHKP	1
QCOM	2
NTAP	2
ZM	1
MCFE	1
WDC	3
VMW	1
PANW	1
PINS	1
FCEL	2
RSG	1
PLUG	1
CHPT	1
AMD	4
ZS	2
YY	5
STX	2
DQ	1
TSM	4
SWKS	2
NVDA	2
INTC	2
GPRO	1
ANET	1
MCHP	2
SPWR	1
TXN	1
CDNS	1
ON	1
ZNGA	1
ATVI	1
SPOT	1
AAPL	1
MA	1
ADBE	1
RBLX	1
PYPL	1
VLO	1
FANG	1
CLDR	2
BIP	2
RUN	1
PBR	1
SCCO	3
PBF	2
VALE	3
SID	3
OVV	2
SOFI	2
MDB	1
BSBR	3
SSNC	2


In [8]:
print(len(stocks))

246


In [9]:
test_articles_top_tickers = reduce(set.union, [set(article["ticker_top"]) for article in articles.values() 
                                               if "ticker_top" in article.keys()])

missed_top_tickers = test_articles_top_tickers - set(stocks.keys())
print("Top tickers that are not in any article", missed_top_tickers)

Top tickers that are not in any article {'PFE', 'FCX', 'XOM', 'T', 'X', 'BBD', 'CVX'}


### Getting top stocks that were not in any article

In [10]:
def read_jsonl(path, missed_tickers):
    with open(path, "r") as fp:
        for i, line in enumerate(fp):
            stock = json.loads(line)
            if stock["ticker_symbol"] in missed_tickers:
                stock_ = dict()
                stock_["index"] = i
                stock_.update({k:v for k,v in stock.items() 
                               if k in ["stock_name", "ticker_symbol", "sector", "industry", "comment"]})
                stock_["relevant_articles"] = []
                yield {stock_["ticker_symbol"]: stock_}

In [11]:
for stock_ in read_jsonl(STOCKS_PATH, missed_tickers=missed_top_tickers):
    stocks.update(stock_)
    
print(len(stocks))

253


### Setting aside stocks with at least two relavant articles

In [12]:
stocks_df = pd.DataFrame.from_dict(stocks, orient="index")
stocks_df["article_count"] = stocks_df.relevant_articles.apply(len)

set_aside_stocks = (stocks_df[(stocks_df.article_count > 2) | 
                              (stocks_df.ticker_symbol.isin(test_articles_top_tickers))]
                        .to_dict(orient="index"))

print(len(set_aside_stocks))
print(set_aside_stocks.keys())

56
dict_keys(['LI', 'PTRA', 'NRG', 'F', 'HES', 'EQNR', 'RIO', 'TECK', 'XPEV', 'NIO', 'TSLA', 'NDAQ', 'BABA', 'XLNX', 'HIMX', 'DIDI', 'BNTX', 'MRNA', 'ABEV', 'NCLH', 'CCL', 'AMC', 'EDU', 'TME', 'WDC', 'PLUG', 'AMD', 'YY', 'TSM', 'NVDA', 'INTC', 'AAPL', 'PBR', 'SCCO', 'VALE', 'SID', 'SOFI', 'BSBR', 'C', 'ITUB', 'EVR', 'ET', 'KSS', 'M', 'JD', 'VIPS', 'IQ', 'UBER', 'AAL', 'T', 'BBD', 'CVX', 'XOM', 'FCX', 'PFE', 'X'])


### Adding in stocks that have at least one evaluation rating

In [13]:
rated_stocks = read_json(RATINGS_PATH)
rated_stocks = [rated_stocks["values"][i][1] for i in range(len(rated_stocks["values"]))]

In [14]:
rated_stocks

['HES',
 'AAPL',
 'BABA',
 'ABEV',
 'VIPS',
 'ATVI',
 'NIO',
 'NOV',
 'UBER',
 'DIDI',
 'AMD',
 'AMC',
 'AAL',
 'CCL',
 'C',
 'KHC',
 'T',
 'ET',
 'VFC',
 'ROKU',
 'KDP',
 'ESI',
 'BSBR',
 'KHC',
 'T',
 'X',
 'NVDA',
 'TSLA',
 'AMC',
 'KHC',
 'CVX',
 'PFE',
 'SPCE',
 'INTC',
 'BBD',
 'MRNA',
 'VALE',
 'TSM',
 'NDAQ',
 'BABA',
 'XOM',
 'F',
 'FCX',
 'IQ',
 'ITUB',
 'KSS']

### Adding rated stocks to set aside stocks list

In [15]:
missing_stocks = set(rated_stocks) - set(set_aside_stocks.keys())
missing_stocks

{'ATVI', 'ESI', 'KDP', 'KHC', 'NOV', 'ROKU', 'SPCE', 'VFC'}

In [16]:
for ticker in missing_stocks:
    set_aside_stocks.update(dict({ticker: stocks[ticker]}))

In [17]:
set_aside_stocks.keys()

dict_keys(['LI', 'PTRA', 'NRG', 'F', 'HES', 'EQNR', 'RIO', 'TECK', 'XPEV', 'NIO', 'TSLA', 'NDAQ', 'BABA', 'XLNX', 'HIMX', 'DIDI', 'BNTX', 'MRNA', 'ABEV', 'NCLH', 'CCL', 'AMC', 'EDU', 'TME', 'WDC', 'PLUG', 'AMD', 'YY', 'TSM', 'NVDA', 'INTC', 'AAPL', 'PBR', 'SCCO', 'VALE', 'SID', 'SOFI', 'BSBR', 'C', 'ITUB', 'EVR', 'ET', 'KSS', 'M', 'JD', 'VIPS', 'IQ', 'UBER', 'AAL', 'T', 'BBD', 'CVX', 'XOM', 'FCX', 'PFE', 'X', 'ROKU', 'VFC', 'KDP', 'ATVI', 'SPCE', 'NOV', 'KHC', 'ESI'])

In [18]:
len(set_aside_stocks)

64

### Choosing representative stocks from sectors and industries  that are not set aside yet

In [19]:
set_aside_sectors_industries = [(stock["sector"], stock["industry"]) for stock in set_aside_stocks.values()]
print(len(set_aside_sectors_industries))
print(len(set(set_aside_sectors_industries)))

64
37


In [20]:
sectors_industries = stocks_df.groupby(["sector", "industry"])
remaining_serctors_indsturies = set(sectors_industries.groups.keys()) - set(set_aside_sectors_industries)
remaining_serctors_indsturies

{('Commercial Services', 'Advertising/Marketing Services'),
 ('Commercial Services', 'Financial Publishing/Services'),
 ('Communications', 'Specialty Telecommunications'),
 ('Communications', 'Telecommunications Equipment'),
 ('Communications', 'Wireless Telecommunications'),
 ('Consumer Durables', 'Homebuilding'),
 ('Consumer Non-Durables', 'Household/Personal Care'),
 ('Consumer Services', 'Broadcasting'),
 ('Consumer Services', 'Cable/Satellite TV'),
 ('Consumer Services', 'Restaurants'),
 ('Electronic Technology', 'Computer Communications'),
 ('Electronic Technology', 'Computer Processing Hardware'),
 ('Electronic Technology', 'Electronic Equipment/Instruments'),
 ('Energy Minerals', 'Oil Refining/Marketing'),
 ('Finance', 'Investment Managers'),
 ('Finance', 'Life/Health Insurance'),
 ('Finance', 'Real Estate Development'),
 ('Finance', 'Real Estate Investment Trusts'),
 ('Health Technology', 'Medical Specialties'),
 ('Health Technology', 'Pharmaceuticals: Generic'),
 ('Industrial

In [21]:
unrepresented_sectors_industries = []

for key, group in sectors_industries:
    if key in remaining_serctors_indsturies:
        group = group.sort_values("article_count", ascending=False)
        print(key)
        display(group)
        top_match = group[group.article_count > 1].head(1)
        if top_match is not None:
            top_match.pop("article_count")
            set_aside_stocks.update(top_match.to_dict(orient="index"))
            unrepresented_sectors_industries.append(key)


('Commercial Services', 'Advertising/Marketing Services')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
GRPN,326,Groupon,GRPN,Commercial Services,Advertising/Marketing Services,Groupon is an American global e-commerce marke...,"[{'doc_id': 0, 'title': 'AT&T CFO Pascal Desro...",1


('Commercial Services', 'Financial Publishing/Services')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
MCO,476,Moody’s,MCO,Commercial Services,Financial Publishing/Services,"Moody's Corporation, often referred to as Mood...","[{'doc_id': 23, 'title': 'Citigroup floats $23...",1


('Communications', 'Specialty Telecommunications')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
LUMN,439,Lumen Technologies,LUMN,Communications,Specialty Telecommunications,Lumen Technologies (formerly CenturyLink) is a...,"[{'doc_id': 0, 'title': 'AT&T CFO Pascal Desro...",1
VG,734,Vonage Holdings,VG,Communications,Specialty Telecommunications,"Vonage (/ˈvɒnɪdʒ/, legal name Vonage Holdings ...","[{'doc_id': 0, 'title': 'AT&T CFO Pascal Desro...",1


('Communications', 'Telecommunications Equipment')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
CHL,161,China Mobile,CHL,Communications,Telecommunications Equipment,China Mobile is the trade name of both China M...,"[{'doc_id': 39, 'title': 'Alibaba stock tumble...",2


('Communications', 'Wireless Telecommunications')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
MBT,469,Mobile TeleSystems PJSC,MBT,Communications,Wireless Telecommunications,"MTS (Russian: Мобильные ТелеСистемы, МТС, ""Mob...","[{'doc_id': 39, 'title': 'Alibaba stock tumble...",1


('Consumer Durables', 'Homebuilding')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
LEN,425,Lennar,LEN,Consumer Durables,Homebuilding,Lennar Corporation is a home construction and ...,"[{'doc_id': 23, 'title': 'Citigroup floats $23...",1


('Consumer Non-Durables', 'Household/Personal Care')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
COTY,201,Coty,COTY,Consumer Non-Durables,Household/Personal Care,Coty Inc. is an American multinational beauty ...,"[{'doc_id': 36, 'title': 'Kohls CEO talks Amaz...",2
EL,270,Estee Lauder,EL,Consumer Non-Durables,Household/Personal Care,The Estée Lauder Companies Inc. (/ˈɛsteɪ ˈlɔːd...,"[{'doc_id': 37, 'title': 'Macy’s And Kohl’s De...",1


('Consumer Services', 'Broadcasting')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
VIACA,727,ViacomCBS,VIACA,Consumer Services,Broadcasting,ViacomCBS Inc. is an American diversified mult...,"[{'doc_id': 0, 'title': 'AT&T CFO Pascal Desro...",1


('Consumer Services', 'Cable/Satellite TV')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
CMCSA,183,Comcast,CMCSA,Consumer Services,Cable/Satellite TV,Comcast Corporation (formerly registered as Co...,"[{'doc_id': 0, 'title': 'AT&T CFO Pascal Desro...",1
CABO,126,Cable One,CABO,Consumer Services,Cable/Satellite TV,"Cable One, Inc. (NYSE: CABO) is an American br...","[{'doc_id': 0, 'title': 'AT&T CFO Pascal Desro...",1
ATUS,31,Altice USA,ATUS,Consumer Services,Cable/Satellite TV,"Altice USA, Inc., commonly known as Altice, is...","[{'doc_id': 0, 'title': 'AT&T CFO Pascal Desro...",1
CHTR,152,Charter Communications,CHTR,Consumer Services,Cable/Satellite TV,"Charter Communications, Inc., is an American t...","[{'doc_id': 0, 'title': 'AT&T CFO Pascal Desro...",1
NFLX,490,Netflix,NFLX,Consumer Services,Cable/Satellite TV,"Netflix, Inc. is an American over-the-top cont...","[{'doc_id': 43, 'title': 'iQIYI romantic drama...",1


('Consumer Services', 'Restaurants')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
YUMC,768,Yum!,YUMC,Consumer Services,Restaurants,"Yum China Holdings, Inc. (Chinese: 百胜中国; pinyi...","[{'doc_id': 'oos_6', 'title': 'When the Taliba...",1
DRI,215,Darden Restaurants,DRI,Consumer Services,Restaurants,"Darden Restaurants, Inc. is an American multi-...","[{'doc_id': 41, 'title': 'Why Vipshop Stock Cr...",1


('Electronic Technology', 'Computer Communications')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
PANW,538,Palo Alto Networks,PANW,Electronic Technology,Computer Communications,"Palo Alto Networks, Inc. (NYSE: PANW) is an Am...","[{'doc_id': 'oos_9', 'title': 'The privacy par...",1
ANET,66,Arista Networks,ANET,Electronic Technology,Computer Communications,Arista Networks (formerly Arastra) is an Ameri...,"[{'doc_id': 12, 'title': 'New Benchmark Leak R...",1


('Electronic Technology', 'Computer Processing Hardware')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
HPE,343,Hewlett Packard,HPE,Electronic Technology,Computer Processing Hardware,The Hewlett Packard Enterprise Company (HPE) i...,"[{'doc_id': 'oos_6', 'title': 'When the Taliba...",1


('Electronic Technology', 'Electronic Equipment/Instruments')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
FCEL,303,FuelCell Energy,FCEL,Electronic Technology,Electronic Equipment/Instruments,"FuelCell Energy, Inc. is an American fuel cell...","[{'doc_id': 10, 'title': 'Fresno County Will S...",2


('Energy Minerals', 'Oil Refining/Marketing')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
MPC,444,Marathon Petroleum,MPC,Energy Minerals,Oil Refining/Marketing,Marathon Petroleum Corporation is an American ...,"[{'doc_id': 'oos_1', 'title': 'Oil recovers fr...",2
PBF,544,PBF Energy,PBF,Energy Minerals,Oil Refining/Marketing,PBF Energy Inc. is a petroleum refiner and sup...,"[{'doc_id': 20, 'title': 'Petrobras (PBR) Aims...",2
VLO,717,Valero,VLO,Energy Minerals,Oil Refining/Marketing,Valero Energy Corporation is a Fortune 500 int...,"[{'doc_id': 19, 'title': 'Why Chevron And Exxo...",1


('Finance', 'Investment Managers')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
KKR,413,KKR & Co,KKR,Finance,Investment Managers,KKR & Co. Inc. (formerly known as Kohlberg Kra...,"[{'doc_id': 'oos_1', 'title': 'Oil recovers fr...",2
BLK,106,BlackRock,BLK,Finance,Investment Managers,"BlackRock, Inc. is an American multinational i...","[{'doc_id': 'oos_3', 'title': 'The Reddit revo...",1
IVZ,385,Invesco,IVZ,Finance,Investment Managers,Invesco Ltd. is an American independent invest...,"[{'doc_id': 26, 'title': 'Itaú Unibanco Holdin...",1


('Finance', 'Life/Health Insurance')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
MET,461,Metlife,MET,Finance,Life/Health Insurance,"MetLife, Inc. is the holding corporation for t...","[{'doc_id': 23, 'title': 'Citigroup floats $23...",1


('Finance', 'Real Estate Development')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
HGV,345,Hilton Grand Vacations,HGV,Finance,Real Estate Development,Hilton Grand Vacations Inc. is based in Orland...,"[{'doc_id': 51, 'title': 'American Airlines vs...",1


('Finance', 'Real Estate Investment Trusts')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
INVH,387,Invitation Homes,INVH,Finance,Real Estate Investment Trusts,Invitation Homes Inc. is the largest owner of ...,"[{'doc_id': 23, 'title': 'Citigroup floats $23...",1
SKT,663,Tanger Factory Outlet Centers,SKT,Finance,Real Estate Investment Trusts,"Tanger Factory Outlet Centers, Inc. (/ˈtæŋər/ ...","[{'doc_id': 36, 'title': 'Kohls CEO talks Amaz...",1


('Health Technology', 'Medical Specialties')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
INO,374,Inovio Pharmaceuticals In,INO,Health Technology,Medical Specialties,Inovio Pharmaceuticals is an American biotechn...,"[{'doc_id': 27, 'title': 'Moderna CEO Predicts...",2
EW,250,Edwards Lifesciences,EW,Health Technology,Medical Specialties,Edwards Lifesciences is an American medical te...,"[{'doc_id': 28, 'title': 'US panel backs COVID...",1


('Health Technology', 'Pharmaceuticals: Generic')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
TEVA,677,Teva Pharmaceutical Industries,TEVA,Health Technology,Pharmaceuticals: Generic,"Teva Pharmaceutical Industries Ltd., also know...","[{'doc_id': 'oos_5', 'title': 'COVID-19: What ...",1
ZTS,776,Zoetis,ZTS,Health Technology,Pharmaceuticals: Generic,Zoetis Inc. (/zō-EH-tis/) is an American drug ...,"[{'doc_id': 27, 'title': 'Moderna CEO Predicts...",1


('Industrial Services', 'Environmental Services')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
RSG,586,Republic Services,RSG,Industrial Services,Environmental Services,"Republic Services, Inc is the second largest p...","[{'doc_id': 10, 'title': 'Fresno County Will S...",1


('Non-Energy Minerals', 'Aluminum')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
ARNC,64,Arconic,ARNC,Non-Energy Minerals,Aluminum,Arconic Corporation is an American industrial ...,"[{'doc_id': 'oos_6', 'title': 'When the Taliba...",1


('Non-Energy Minerals', 'Precious Metals')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
IAG,364,Iamgold Corp,IAG,Non-Energy Minerals,Precious Metals,Iamgold Corporation (formerly Iamgold Internat...,"[{'doc_id': 32, 'title': 'Freeport-McMoRan - A...",2
AUY,764,Yamana Gold,AUY,Non-Energy Minerals,Precious Metals,Inc. is a Canadian company that owns and opera...,"[{'doc_id': 32, 'title': 'Freeport-McMoRan - A...",2
NEM,496,Newmont Mining,NEM,Non-Energy Minerals,Precious Metals,"Newmont, based in Greenwood Village, Colorado,...","[{'doc_id': 32, 'title': 'Freeport-McMoRan - A...",2
KGC,412,Kinross Gold Corporation,KGC,Non-Energy Minerals,Precious Metals,Kinross Gold Corporation is a Canadian-based g...,"[{'doc_id': 32, 'title': 'Freeport-McMoRan - A...",2
AG,288,First Majestic Silver Corp,AG,Non-Energy Minerals,Precious Metals,First Majestic Silver Corp. is a Canadian silv...,"[{'doc_id': 32, 'title': 'Freeport-McMoRan - A...",2
EGO,252,Eldorado Gold Corporation,EGO,Non-Energy Minerals,Precious Metals,Eldorado Gold Corporation is a Canadian compan...,"[{'doc_id': 35, 'title': 'Mining Firm Responsi...",1


('Process Industries', 'Chemicals: Specialty')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
DQ,214,Daqo New Energy Corp,DQ,Process Industries,Chemicals: Specialty,Daqo New Energy Corp. is a Chinese company eng...,"[{'doc_id': 11, 'title': 'AMD CEO Sees Chip Sh...",1


('Process Industries', 'Containers/Packaging')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
IP,381,International Paper Company,IP,Process Industries,Containers/Packaging,The International Paper Company (NYSE: IP) is ...,"[{'doc_id': 'oos_8', 'title': 'How Fast Fashio...",1


('Process Industries', 'Industrial Specialties')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
OLN,523,Olin,OLN,Process Industries,Industrial Specialties,Olin Corporation is an American manufacturer o...,"[{'doc_id': 'oos_6', 'title': 'When the Taliba...",1


('Producer Manufacturing', 'Electrical Products')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
SPWR,652,SunPower,SPWR,Producer Manufacturing,Electrical Products,SunPower Corporation (NASDAQ:SPWR) is an Ameri...,"[{'doc_id': 15, 'title': 'Can India’s proposed...",1


('Producer Manufacturing', 'Miscellaneous Manufacturing')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
YETI,765,YETI Holdings Inc,YETI,Producer Manufacturing,Miscellaneous Manufacturing,YETI is an American outdoor manufacturer compa...,"[{'doc_id': 'oos_6', 'title': 'When the Taliba...",1


('Retail Trade', 'Apparel/Footwear Retail')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
RL,577,Ralph Lauren,RL,Retail Trade,Apparel/Footwear Retail,Ralph Lauren Corporation is an American public...,"[{'doc_id': 'oos_8', 'title': 'How Fast Fashio...",2
URBN,714,Urban Outfitters,URBN,Retail Trade,Apparel/Footwear Retail,"Urban Outfitters, Inc. (URBN) is a multination...","[{'doc_id': 'oos_8', 'title': 'How Fast Fashio...",1
GPS,306,GAP,GPS,Retail Trade,Apparel/Footwear Retail,"The Gap, Inc., commonly known as Gap Inc. or G...","[{'doc_id': 37, 'title': 'Macy’s And Kohl’s De...",1
ANF,6,Abercrombie & Fitch,ANF,Retail Trade,Apparel/Footwear Retail,Abercrombie & Fitch (A&F) is an American lifes...,"[{'doc_id': 37, 'title': 'Macy’s And Kohl’s De...",1
CHS,160,Chico’s FAS,CHS,Retail Trade,Apparel/Footwear Retail,Chico's FAS is an American women's clothing an...,"[{'doc_id': 37, 'title': 'Macy’s And Kohl’s De...",1


('Retail Trade', 'Drugstore Chains')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
RAD,592,Rite Aid,RAD,Retail Trade,Drugstore Chains,Rite Aid Corporation is an American drugstore ...,"[{'doc_id': 36, 'title': 'Kohls CEO talks Amaz...",1


('Retail Trade', 'Electronics/Appliance Stores')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
GME,305,GameStop,GME,Retail Trade,Electronics/Appliance Stores,"GameStop Corp. is an American video game, cons...","[{'doc_id': 'oos_3', 'title': 'The Reddit revo...",1


('Retail Trade', 'Specialty Stores')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
CVNA,140,Carvana Co.,CVNA,Retail Trade,Specialty Stores,"Carvana, based in Tempe, Arizona, is a leading...","[{'doc_id': 1, 'title': 'Ford to build $11.4 b...",1
CHPT,150,ChargePoint Holdings,CHPT,Retail Trade,Specialty Stores,ChargePoint (formerly Coulomb Technologies) is...,"[{'doc_id': 10, 'title': 'Fresno County Will S...",1
ULTA,702,Ulta Beauty,ULTA,Retail Trade,Specialty Stores,"Ulta Beauty, Inc., formerly known as Ulta Salo...","[{'doc_id': 36, 'title': 'Kohls CEO talks Amaz...",1
TGT,666,Target,TGT,Retail Trade,Specialty Stores,Target Corporation is an American retail corpo...,"[{'doc_id': 37, 'title': 'Macy’s And Kohl’s De...",1


('Technology Services', 'Data Processing Services')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
DXC,246,DXC Technology,DXC,Technology Services,Data Processing Services,DXC Technology is an American multinational co...,"[{'doc_id': 'oos_5', 'title': 'COVID-19: What ...",1
PYPL,543,PayPal,PYPL,Technology Services,Data Processing Services,"PayPal Holdings, Inc. is an American multinati...","[{'doc_id': 17, 'title': 'In a huge blow, judg...",1
FISV,292,Fiserv,FISV,Technology Services,Data Processing Services,"Fiserv, Inc. (/faɪˈsərv/) is an American multi...","[{'doc_id': 26, 'title': 'Itaú Unibanco Holdin...",1


('Technology Services', 'Information Technology Services')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
SSNC,641,SS&C Technologies Holdings,SSNC,Technology Services,Information Technology Services,"SS&C Technologies Holdings, Inc. (known as SS&...","[{'doc_id': 22, 'title': 'Will SoFi Technologi...",2
BOX,114,Box,BOX,Technology Services,Information Technology Services,"Box, Inc. (formerly Box.net), is an American i...","[{'doc_id': 22, 'title': 'Will SoFi Technologi...",2
MOMO,471,Momo,MOMO,Technology Services,Information Technology Services,Momo (Chinese: 陌陌; pinyin: mò mò) is a free so...,"[{'doc_id': 39, 'title': 'Alibaba stock tumble...",2
VMW,733,VMware,VMW,Technology Services,Information Technology Services,"VMware, Inc. is an American cloud computing an...","[{'doc_id': 'oos_9', 'title': 'The privacy par...",1
SQSP,640,Squarespace,SQSP,Technology Services,Information Technology Services,"Squarespace, Inc. is an American website build...","[{'doc_id': 22, 'title': 'Will SoFi Technologi...",1
WDAY,757,Workday,WDAY,Technology Services,Information Technology Services,"Workday, Inc., is an American on‑demand (cloud...","[{'doc_id': 36, 'title': 'Kohls CEO talks Amaz...",1


('Transportation', 'Railroads')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
UNP,706,Union Pacific,UNP,Transportation,Railroads,The Union Pacific Corporation (Union Pacific) ...,"[{'doc_id': 34, 'title': 'US Steel Output Spik...",1


('Utilities', 'Alternative Power Generation')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
BIP,121,Brookfield Infrastructure Partners,BIP,Utilities,Alternative Power Generation,Brookfield Infrastructure Partners L.P. is a p...,"[{'doc_id': 19, 'title': 'Why Chevron And Exxo...",2
RUN,653,Sunrun,RUN,Utilities,Alternative Power Generation,Sunrun Inc. is an American provider of residen...,"[{'doc_id': 19, 'title': 'Why Chevron And Exxo...",1


('Utilities', 'Gas Distributors')


Unnamed: 0,index,stock_name,ticker_symbol,sector,industry,comment,relevant_articles,article_count
SRE,611,Sempra Energy,SRE,Utilities,Gas Distributors,Sempra is a North American energy infrastructu...,"[{'doc_id': 'oos_2', 'title': 'COP26 Climate S...",2


In [22]:
print(len(set_aside_stocks))
set_aside_stocks.keys()

75


dict_keys(['LI', 'PTRA', 'NRG', 'F', 'HES', 'EQNR', 'RIO', 'TECK', 'XPEV', 'NIO', 'TSLA', 'NDAQ', 'BABA', 'XLNX', 'HIMX', 'DIDI', 'BNTX', 'MRNA', 'ABEV', 'NCLH', 'CCL', 'AMC', 'EDU', 'TME', 'WDC', 'PLUG', 'AMD', 'YY', 'TSM', 'NVDA', 'INTC', 'AAPL', 'PBR', 'SCCO', 'VALE', 'SID', 'SOFI', 'BSBR', 'C', 'ITUB', 'EVR', 'ET', 'KSS', 'M', 'JD', 'VIPS', 'IQ', 'UBER', 'AAL', 'T', 'BBD', 'CVX', 'XOM', 'FCX', 'PFE', 'X', 'ROKU', 'VFC', 'KDP', 'ATVI', 'SPCE', 'NOV', 'KHC', 'ESI', 'CHL', 'COTY', 'FCEL', 'MPC', 'KKR', 'INO', 'IAG', 'RL', 'SSNC', 'BIP', 'SRE'])

In [23]:
for pairing in unrepresented_sectors_industries:
    print(pairing)

('Commercial Services', 'Advertising/Marketing Services')
('Commercial Services', 'Financial Publishing/Services')
('Communications', 'Specialty Telecommunications')
('Communications', 'Telecommunications Equipment')
('Communications', 'Wireless Telecommunications')
('Consumer Durables', 'Homebuilding')
('Consumer Non-Durables', 'Household/Personal Care')
('Consumer Services', 'Broadcasting')
('Consumer Services', 'Cable/Satellite TV')
('Consumer Services', 'Restaurants')
('Electronic Technology', 'Computer Communications')
('Electronic Technology', 'Computer Processing Hardware')
('Electronic Technology', 'Electronic Equipment/Instruments')
('Energy Minerals', 'Oil Refining/Marketing')
('Finance', 'Investment Managers')
('Finance', 'Life/Health Insurance')
('Finance', 'Real Estate Development')
('Finance', 'Real Estate Investment Trusts')
('Health Technology', 'Medical Specialties')
('Health Technology', 'Pharmaceuticals: Generic')
('Industrial Services', 'Environmental Services')
('N

### Adding irrelevant articles

In [24]:
set_aside_stocks.keys()

dict_keys(['LI', 'PTRA', 'NRG', 'F', 'HES', 'EQNR', 'RIO', 'TECK', 'XPEV', 'NIO', 'TSLA', 'NDAQ', 'BABA', 'XLNX', 'HIMX', 'DIDI', 'BNTX', 'MRNA', 'ABEV', 'NCLH', 'CCL', 'AMC', 'EDU', 'TME', 'WDC', 'PLUG', 'AMD', 'YY', 'TSM', 'NVDA', 'INTC', 'AAPL', 'PBR', 'SCCO', 'VALE', 'SID', 'SOFI', 'BSBR', 'C', 'ITUB', 'EVR', 'ET', 'KSS', 'M', 'JD', 'VIPS', 'IQ', 'UBER', 'AAL', 'T', 'BBD', 'CVX', 'XOM', 'FCX', 'PFE', 'X', 'ROKU', 'VFC', 'KDP', 'ATVI', 'SPCE', 'NOV', 'KHC', 'ESI', 'CHL', 'COTY', 'FCEL', 'MPC', 'KKR', 'INO', 'IAG', 'RL', 'SSNC', 'BIP', 'SRE'])

In [25]:
for _, stock in set_aside_stocks.items():
    stock["irrelevant_articles"] = list()
    ticker_symbol = stock["ticker_symbol"]
    for _, article in articles.items():
        revelant_tickers = {_["ticker_symbol"] for _ in article["stock_matches"]}
        if ticker_symbol not in revelant_tickers:
            article_ = {k:v for k, v in article.items() if k in ["doc_id", "title", "url", "summary"]}
            stock["irrelevant_articles"].append(article_)

### Ranking article matches

In [26]:
for _, stock in set_aside_stocks.items():
    stock["relevant_articles"] = sorted(stock["relevant_articles"], 
                                        key=lambda x: x["score"], 
                                        reverse=True)

### Saving to file

In [27]:
with open("../../data/test/user_study/stocks/stocks.v3.json", "w") as fp:
    json.dump(set_aside_stocks, fp, ensure_ascii=False, indent="\t")

In [28]:
len(set_aside_stocks)

75