In [1]:
import json
from string import Template
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook
from urllib.error import HTTPError
from SPARQLWrapper import SPARQLWrapper, JSON

from IPython.display import display

In [2]:
def read_jsonl_data(filename):
    objects = list()
    with open(filename, "r", encoding="utf8") as fp:
        for line in fp:
            obj = json.loads(line)
            objects.append(obj)
    return objects


stocks_path = Path("../../data/raw/stocks//revolut.2021-07-05.jsonl")
stocks_sample_path = Path("../../data/samples/sample_stocks/revolut.top50.2021-08-19T17:50:12.jsonl")

stocks = read_jsonl_data(stocks_path)
stocks_sample = read_jsonl_data(stocks_sample_path)

In [3]:
stocks_in_sample = set(stock["ticker"] for stock in stocks_sample 
                       if stock["ticker"] != "TIGR")

print(stocks_in_sample)

{'NIO', 'C', 'MU', 'X', 'AMD', 'BABA', 'GM', 'AAL', 'OXY', 'MSFT', 'BBD', 'DIDI', 'TSM', 'FCX', 'EDU', 'BAC', 'XOM', 'MRNA', 'NVDA', 'VIPS', 'VALE', 'ITUB', 'PFE', 'CSCO', 'TSLA', 'AA', 'NCLH', 'IQ', 'PLTR', 'F', 'JD', 'KSS', 'SOFI', 'ET', 'INTC', 'CLF', 'SLB', 'TME', 'PBR', 'ABEV', 'UBER', 'PLUG', 'AMC', 'BP', 'M', 'AAPL', 'CVX', 'T', 'CCL'}


In [4]:
# stocks = [stock for stock in stocks
#           if not stock["ticker"] in stocks_in_sample]

print(len(stocks))

915


In [5]:
def make_clickable(link):
    if link is not np.NaN:
        text = link.split('=')[0]
        return f'<a target="_blank" href="{link}">{text}</a>'
    else: 
        return link

    
def mask(df):
    return (df.company_name.notna() 
            & df.wiki_page_id.notna())

In [6]:
QUERY = Template("""
SELECT *  
WHERE {
        ?endpoint dbp:google|dbp:yahoo|dbp:nasdaq "$ticker"@en; 
                  foaf:name ?company_name;
                  dbo:wikiPageID ?wiki_page_id
        OPTIONAL {
            ?endpoint rdfs:comment ?comment 
            FILTER (lang(?comment) = "en")
        }
        FILTER (lang(?company_name ) = "en")  
      } LIMIT 1
""")

sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)

for stock in tqdm_notebook(stocks):
    sparql.setQuery(QUERY.substitute(ticker=stock["ticker"])) 
    
    try:
        response = sparql.query().convert()
        vars_ = response["head"]["vars"]
        bindings = response["results"]["bindings"]
        
    except HTTPError as e:
        print(f"{stock['ticker']}: {e}")
        bindings = None
        
    if bindings:
        binding = bindings[0]
        for var in vars_:
            stock[var] = binding[var]["value"] if var in binding else None
            

stocks_df = pd.DataFrame(stocks)

  0%|          | 0/915 [00:00<?, ?it/s]

In [7]:
info_captured = mask(stocks_df)

with pd.option_context("max_rows", None):
    display(stocks_df[info_captured].style.format({'endpoint': make_clickable}))

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
0,21st Century Fox,FOXA,Consumer Services,Broadcasting,http://dbpedia.org/resource/Fox_Corporation,Fox Corporation,58734970,"Fox Corporation is an American mass media company operated and owned by media mogul Rupert Murdoch and headquartered at 1211 Avenue of the Americas in New York City. The company was formed in 2019 as a result of the acquisition of 21st Century Fox by The Walt Disney Company; the assets that were not acquired by Disney were spun off from 21st Century Fox as the new Fox Corp., and its stock began trading on January 1, 2019. The company is incorporated in Delaware."
1,23andMe Holding Co,ME,Finance,Financial Conglomerates,http://dbpedia.org/resource/23andMe,"23andMe, Inc.",11979083,"23andMe, Inc. is a publicly held personal genomics and biotechnology company based in Sunnyvale, California. It is best known for providing a direct-to-consumer genetic testing service in which customers provide a saliva sample that is laboratory analysed, using single nucleotide polymorphism genotyping, to generate reports relating to the customer's ancestry and genetic predispositions to health-related topics. The company's name is derived from the fact that there are 23 pairs of chromosomes in a wildtype human cell."
2,2U,TWOU,Technology Services,Packaged Software,http://dbpedia.org/resource/2U_(company),"2U, Inc.",37371846,"2U, Inc. (formerly 2tor Inc.) is an American educational technology company that contracts with non-profit colleges and universities to offer online degree programs. The company supplies its client institutions with a cloud-based software-as-a-service platform, coursework design, infrastructure support, and capital."
3,3M,MMM,Producer Manufacturing,Industrial Conglomerates,http://dbpedia.org/resource/3M,3M Company,7664801,"The 3M Company is an American multinational conglomerate corporation operating in the fields of industry, worker safety, US health care, and consumer goods. The company produces over 60,000 products under several brands, including adhesives, abrasives, laminates, passive fire protection, personal protective equipment, window films, paint protection films, dental and orthodontic products, electrical and electronic connecting and insulating materials, medical products, car-care products, electronic circuits, healthcare software and optical films. It is based in Maplewood, a suburb of Saint Paul, Minnesota."
4,Abbott Labs,ABT,Health Technology,Medical Specialties,http://dbpedia.org/resource/Abbott_Laboratories,Abbott Laboratories,488730,"Abbott Laboratories is an American multinational medical devices and health care company with headquarters in Abbott Park, Illinois, United States. The company was founded by Chicago physician Wallace Calvin Abbott in 1888 to formulate known drugs; today, it sells medical devices, diagnostics, branded generic medicines and nutritional products. It split off its research-based pharmaceuticals business into AbbVie in 2013."
5,AbbVie,ABBV,Health Technology,Pharmaceuticals: Major,http://dbpedia.org/resource/AbbVie,AbbVie Inc.,37665564,AbbVie is an American publicly traded biopharmaceutical company founded in 2013. It originated as a spin-off of Abbott Laboratories.
6,Abercrombie & Fitch,ANF,Retail Trade,Apparel/Footwear Retail,http://dbpedia.org/resource/Abercrombie_&_Fitch,Abercrombie & Fitch Co.,17255339,"Abercrombie & Fitch (A&F) is an American lifestyle retailer that focuses on casual wear. Its headquarters are in New Albany, Ohio. The company operates three other offshoot brands: Abercrombie Kids, Hollister Co., and Gilly Hicks As of February 2020, the company operated 854 stores across all brands."
7,Accenture PLC,ACN,Technology Services,Information Technology Services,http://dbpedia.org/resource/Accenture,Accenture plc,299134,"Accenture plc is an Irish-based multinational company that provides consulting and professional services. A Fortune Global 500 company, it reported revenues of $44.33 billion in 2020 and had 569,000 employees. In 2015, the company had about 150,000 employees in India, 48,000 in the US, and 50,000 in the Philippines. Accenture's current clients include 91 of the Fortune Global 100 and more than three-quarters of the Fortune Global 500. On 11 July 2019, Accenture appointed Julie Sweet as its new chief executive officer. She accepted her office on 1 September 2019."
9,Activision Blizzard,ATVI,Consumer Durables,Recreational Products,http://dbpedia.org/resource/Activision_Blizzard,"Activision Blizzard, Inc.",14527195,"Activision Blizzard, Inc. is an American video game holding company based in Santa Monica, California. The company was founded in July 2008 through the merger of Activision, Inc. (the publicly traded parent company of Activision Publishing) and Vivendi Games. The company is traded on the NASDAQ stock exchange under the ticker symbol ATVI, and since 2015 has been one of the stocks that make up the S&P 500. Activision Blizzard currently includes five business units: Activision Publishing, Blizzard Entertainment, King, Major League Gaming, and Activision Blizzard Studios."
11,Advance Auto Parts,AAP,Retail Trade,Specialty Stores,http://dbpedia.org/resource/Advance_Auto_Parts,"Advance Auto Parts, Inc.",9401450,"Advance Auto Parts, Inc. (Advance) is an American automotive aftermarket parts provider. Headquartered in Raleigh, North Carolina, it serves both professional installer and do-it-yourself (DIY) customers. As of July 13, 2019, Advance operated 4,912 stores and 150 Worldpac branches in the United States and Canada. The Company also serves 1,250 independently owned Carquest branded stores across these locations in addition to Mexico, the Bahamas, Turks and Caicos and British Virgin Islands. The company's stores and branches offer a broad selection of brand name, original equipment manufacturer (OEM) and private label automotive replacement parts, accessories, batteries and maintenance items for domestic and imported cars, vans, sport utility vehicles and light and heavy duty trucks."


In [8]:
print(len(stocks_df[info_captured]), len(stocks_df[~info_captured]))

531 384


In [9]:
stocks_df[stocks_df.ticker.isin(stocks_in_sample) & info_captured]

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
12,Advanced Micro Devices,AMD,Electronic Technology,Semiconductors,http://dbpedia.org/resource/Advanced_Micro_Dev...,"Advanced Micro Devices, Inc.",2400,"Advanced Micro Devices, Inc. (AMD) is an Ameri..."
25,Alcoa,AA,Non-Energy Minerals,Aluminum,http://dbpedia.org/resource/Alcoa,Alcoa Corporation,479944,Alcoa Corporation (a portmanteau of Aluminum C...
27,Alibaba,BABA,Retail Trade,Internet Retail,http://dbpedia.org/resource/Alibaba_Group,Alibaba Group Holding Limited,2430273,"Alibaba Group Holding Limited, also known as A..."
40,AMC,AMC,Consumer Services,Movies/Entertainment,http://dbpedia.org/resource/AMC_Theatres,"AMC Entertainment Holdings, Inc.",563739,"AMC Entertainment Holdings, Inc. (d/b/a AMC Th..."
42,American Airlines,AAL,Transportation,Airlines,http://dbpedia.org/resource/American_Airlines_...,American Airlines Group Inc.,38521579,American Airlines Group Inc. is an American pu...
70,Apple,AAPL,Electronic Technology,Telecommunications Equipment,http://dbpedia.org/resource/Apple_Inc.,Apple Inc.,856,Apple Inc. is an American multinational techno...
86,AT&T,T,Communications,Major Telecommunications,http://dbpedia.org/resource/AT&T,AT&T Inc.,17555269,AT&T Inc. is an American multinational conglom...
108,Bank of America,BAC,Finance,Major Banks,http://dbpedia.org/resource/Bank_of_America,Bank of America,347756,The Bank of America Corporation (simply referr...
137,BP plc,BP,Energy Minerals,Integrated Oil,http://dbpedia.org/resource/BP,BP plc,18998720,"BP plc (official styling BP p.l.c., formerly T..."
166,Carnival,CCL,Consumer Services,Hotels/Resorts/Cruise lines,http://dbpedia.org/resource/Carnival_Corporati...,Carnival Corporation & plc,428998,Carnival Corporation & plc is a British-Americ...


In [10]:
print(len(stocks_df[stocks_df.ticker.isin(stocks_in_sample) & info_captured]))
print(len(stocks_df[stocks_df.ticker.isin(stocks_in_sample) & ~info_captured]))

34
15


In [11]:
stocks_df.loc[[55, 154, 304, 716, 849]]

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
55,Analog Devices,ADI,Electronic Technology,Semiconductors,http://dbpedia.org/resource/Analog_Devices,"Analog Devices, Inc.",644341,"Analog Devices, Inc. (ADI), also known simply ..."
154,Cable One,CABO,Consumer Services,Cable/Satellite TV,http://dbpedia.org/resource/Cable_One,Sparklight,851290,"Cable One, Inc. (NYSE: CABO) is an American br..."
304,Embraer,ERJ,Electronic Technology,Aerospace & Defense,http://dbpedia.org/resource/Embraer,,182047,Embraer S.A. (Portuguese pronunciation: [ẽmbɾa...
716,Sea Limited,SE,Technology Services,Internet Software/Services,http://dbpedia.org/resource/Spectra_Energy,Spectra Energy Corporation,13131365,"Spectra Energy Corp, headquartered in Houston,..."
849,Veeva Systems,VEEV,Technology Services,Packaged Software,http://dbpedia.org/resource/Veeva_Systems,,50146755,Veeva Systems Inc. is an American cloud-comput...


In [12]:
# Minor corrections
stocks_df.loc[55, "endpoint"] = "https://dbpedia.org/resource/Analog_Devices"
stocks_df.loc[55, "company_name"] = "Analog Devices, Inc."
stocks_df.loc[55, "wiki_page_id"] = 644341
stocks_df.loc[55, "comment"] = "Analog Devices, Inc. (ADI), also known simply as Analog, is an American multinational semiconductor company specializing in data conversion, signal processing and power management technology, headquartered in Wilmington, Massachusetts. In 2012, Analog Devices led the worldwide data converter market with a 48.5% share, according to analyst firm Databeans.The company manufactures analog, mixed-signal and digital signal processing (DSP) integrated circuits (ICs) used in electronic equipment."

stocks_df.loc[154, "company_name"] = "CableOne"
stocks_df.loc[304, "company_name"] = "Embraer"
stocks_df.loc[849, "company_name"] = "Veeva Systems Inc."

In [13]:
stocks_incomplete_df = pd.concat([stocks_df.iloc[[716]], 
                                  stocks_df[~info_captured]]) \
                         .sort_index()

stocks_complete_df = stocks_df.iloc[list(set(stocks_df.index) 
                                         - set(stocks_incomplete_df.index))]

In [14]:
QUERY = Template("""
SELECT *  
WHERE { ?endpoint dbp:google|dbp:yahoo|dbp:nasdaq "$ticker"@en; 
                  dbp:name ?company_name;
                  dbo:wikiPageID ?wiki_page_id
        OPTIONAL {
            ?endpoint rdfs:comment ?comment 
            FILTER (lang(?comment) = "en")
        }
        FILTER (lang(?company_name ) = "en")           
      } LIMIT 1
""")

sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)

_stocks = stocks_incomplete_df.to_dict(orient="index")

for i, stock in tqdm_notebook(_stocks.items()):
    sparql.setQuery(QUERY.substitute(ticker=stock["ticker"]))
    response = sparql.query().convert()
    vars_ = response["head"]["vars"]
    bindings = response["results"]["bindings"]
    if bindings:
        binding = bindings[0]
        for var in vars_:
            stock[var] = binding[var]["value"] if var in binding else None

stocks_incomplete_df = pd.DataFrame.from_dict(_stocks, orient="index")

  0%|          | 0/385 [00:00<?, ?it/s]

In [15]:
info_captured = mask(stocks_incomplete_df)

with pd.option_context("max_rows", None):
    display(stocks_incomplete_df[info_captured].style.format({'endpoint': make_clickable}))

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
10,Adobe,ADBE,Technology Services,Packaged Software,http://dbpedia.org/resource/Adobe_Inc.,Adobe Inc.,1955,"Adobe Inc. (/əˈdoʊbiː/ ə-DOH-bee), originally called Adobe Systems Incorporated, is an American multinational computer software company. Incorporated in Delaware and headquartered in San Jose, California, it has historically specialized in software for the creation and publication of a wide range of content, including graphics, photography, illustration, animation, multimedia/video, motion pictures and print. The company has expanded into digital marketing management software. Adobe has millions of users worldwide. Flagship products include: Photoshop image editing software, Adobe Illustrator vector-based illustration software, Adobe Acrobat Reader and the Portable Document Format (PDF), plus a host of tools primarily for audio-visual content creation, editing and publishing. The company b"
97,AXA Equitable Holdings,EQH,Finance,Financial Conglomerates,http://dbpedia.org/resource/Equitable_Holdings,"Equitable Holdings, Inc.",11820603,"Equitable Holdings, Inc. (formerly The Equitable Life Assurance Society of the United States and AXA Equitable Life Insurance Company, and also known as The Equitable) is an American financial services and insurance company that was founded in 1859 by Henry Baldwin Hyde. In 1991, French insurance firm AXA acquired majority control of The Equitable."
116,Belden,BDC,Producer Manufacturing,Electrical Products,http://dbpedia.org/resource/Belden_(electronics_company),Belden Incorporated,31295131,"Belden Incorporated is an American manufacturer of networking, connectivity, and cable products. The company designs, manufactures, and markets signal transmission products for demanding applications. These products serve the industrial automation, enterprise, security, transportation, infrastructure, and residential markets. Belden is one of the largest U.S.-based manufacturers of high-speed electronic cables primarily used in industrial, enterprise, and broadcast markets."
129,Blackstone,BX,Finance,Investment Managers,http://dbpedia.org/resource/The_Blackstone_Group,Blackstone Inc.,885066,"Blackstone Inc. is an American alternative investment management company based in New York City. In 2019, Blackstone converted from a publicly traded partnership into a corporation. Blackstone's private equity business has been one of the largest investors in leveraged buyouts in the last three decades, while its real estate business has actively acquired commercial real estate. As of 2020, the company's total assets under management were approximately US$619 billion."
201,Cisco,CSCO,Technology Services,Information Technology Services,http://dbpedia.org/resource/Cisco_Systems,"Cisco Systems, Inc.",51746,"Cisco Systems, Inc. is an American multinational technology conglomerate headquartered in San Jose, California, in the center of Silicon Valley. Cisco develops, manufactures and sells networking hardware, software, telecommunications equipment and other high-technology services and products. Through its numerous acquired subsidiaries, such as OpenDNS, Webex, Jabber and Jasper, Cisco specializes in specific tech markets, such as the Internet of Things (IoT), domain security and energy management. On January 25, 2021, Cisco reincorporated in Delaware."
204,Citrix Systems,CTXS,Technology Services,Packaged Software,http://dbpedia.org/resource/Citrix_Systems,"Citrix Systems, Inc.",309948,"Citrix Systems, Inc. is an American multinational software company that provides server, application and desktop virtualization, networking, software as a service (SaaS), and cloud computing technologies. Citrix products are claimed to be in use by over 400,000 clients worldwide, including 99% of the Fortune 100, and 98% of the Fortune 500."
214,Coca-Cola,KO,Consumer Non-Durables,Beverages: Non-Alcoholic,http://dbpedia.org/resource/The_Coca-Cola_Company,The Coca-Cola Co,914869,"The Coca-Cola Company is an American multinational beverage corporation incorporated under Delaware's General Corporation Law and headquartered in Atlanta, Georgia. The Coca-Cola Company has interests in the manufacturing, retailing, and marketing of nonalcoholic beverage concentrates and syrups, and alcoholic beverages. The company produces Coca-Cola, the sugary drink for which its best known, invented in 1886 by pharmacist John Stith Pemberton. At the time, the product was made with coca leaves, which added an amount of cocaine to the drink, and with kola nuts, which added caffeine, so that the coca and the kola together provided a stimulative effect. This stimulative effect is the reason the drink was sold to the public as a healthy ""tonic,"" and the coca and the kola are also the sour"
237,Corning,GLW,Electronic Technology,Electronic Components,http://dbpedia.org/resource/Corning_Inc.,Corning Incorporated,342837,"Corning Incorporated is an American multinational technology company that specializes in specialty glass, ceramics, and related materials and technologies including advanced optics, primarily for industrial and scientific applications. The company was named Corning Glass Works until 1989. Corning divested its consumer product lines (including CorningWare and Visions Pyroceram-based cookware, Corelle Vitrelle tableware, and Pyrex glass bakeware) in 1998 by selling the Corning Consumer Products Company subsidiary (now known as Corelle Brands) to Borden, but still holds an interest of about 8 percent."
263,Deutsche Bank AG,DB,Finance,Major Banks,http://dbpedia.org/resource/Deutsche_Bank,Deutsche Bank AG,523937,"Deutsche Bank AG (German pronunciation: [ˈdɔʏtʃə ˈbaŋk ʔaːˈɡeː]) is a German multinational investment bank and financial services company headquartered in Frankfurt, Germany, and dual-listed on the Frankfurt Stock Exchange and the New York Stock Exchange. The company is a universal bank with four major divisions: Investment Bank, Corporate Bank, Private Bank and Asset Management (DWS). Its investment banking operations often command substantial deal flow."
305,Emerson,EMR,Producer Manufacturing,Electrical Products,http://dbpedia.org/resource/Emerson_Electric,Emerson Electric Co.,1175476,"Emerson Electric Co. is an American multinational corporation headquartered in Ferguson, Missouri. The Fortune 500 company manufactures products and provides engineering services for a wide range of industrial, commercial, and consumer markets.Emerson has approximately 83,500 employees and 200 manufacturing locations worldwide."


In [16]:
print(len(stocks_incomplete_df[info_captured]), len(stocks_incomplete_df[~info_captured]))

29 356


In [17]:
stocks_incomplete_df[stocks_incomplete_df.ticker.isin(stocks_in_sample) & info_captured]

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
201,Cisco,CSCO,Technology Services,Information Technology Services,http://dbpedia.org/resource/Cisco_Systems,"Cisco Systems, Inc.",51746,"Cisco Systems, Inc. is an American multination..."
448,Intel,INTC,Electronic Technology,Semiconductors,http://dbpedia.org/resource/Intel,Intel Corp.,14617,Intel Corporation is an American multinational...
554,Micron Technology,MU,Electronic Technology,Semiconductors,http://dbpedia.org/resource/Micron_Technology,"Micron Technology, Inc.",487445,"Micron Technology, Inc. is an American produce..."


In [18]:
stocks_incomplete_df[stocks_incomplete_df.ticker.isin(stocks_in_sample) & ~info_captured]

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
39,Ambev,ABEV,Consumer Non-Durables,Beverages: Alcoholic,,,,
105,Banco Bradesco S.A. – Preferred Shares,BBD,Finance,Major Banks,,,,
268,DiDi Global Inc.,DIDI,Technology Services,Packaged Software,,,,
307,Energy Transfer Equity,ET,Industrial Services,Oil & Gas Pipelines,,,,
463,iQIYI,IQ,Technology Services,Internet Software/Services,,,,
466,Itaú Unibanco,ITUB,Finance,Major Banks,,,,
583,New Oriental Education & Tech Group,EDU,Consumer Services,Other Consumer Services,,,,
649,Petroleo Brasileiro S.A.,PBR,Energy Minerals,Integrated Oil,,,,
738,SoFi Technologies,SOFI,Finance,Finance/Rental/Leasing,,,,
790,Tencent Music Entertainment Group,TME,Technology Services,Internet Software/Services,,,,


In [19]:
stocks_complete_df = pd.concat([stocks_complete_df, 
                                stocks_incomplete_df[info_captured]]) \
                         .sort_index()

stocks_incomplete_df = stocks_incomplete_df[~info_captured]

In [20]:
stocks_incomplete_df = (stocks_incomplete_df.append(stocks_complete_df.loc[716])                                
                                            .sort_index())

stocks_complete_df = stocks_complete_df.drop(index=716)

In [21]:
with pd.option_context("max_rows", None):
    display(stocks_complete_df)

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
0,21st Century Fox,FOXA,Consumer Services,Broadcasting,http://dbpedia.org/resource/Fox_Corporation,Fox Corporation,58734970,Fox Corporation is an American mass media comp...
1,23andMe Holding Co,ME,Finance,Financial Conglomerates,http://dbpedia.org/resource/23andMe,"23andMe, Inc.",11979083,"23andMe, Inc. is a publicly held personal geno..."
2,2U,TWOU,Technology Services,Packaged Software,http://dbpedia.org/resource/2U_(company),"2U, Inc.",37371846,"2U, Inc. (formerly 2tor Inc.) is an American e..."
3,3M,MMM,Producer Manufacturing,Industrial Conglomerates,http://dbpedia.org/resource/3M,3M Company,7664801,The 3M Company is an American multinational co...
4,Abbott Labs,ABT,Health Technology,Medical Specialties,http://dbpedia.org/resource/Abbott_Laboratories,Abbott Laboratories,488730,Abbott Laboratories is an American multination...
5,AbbVie,ABBV,Health Technology,Pharmaceuticals: Major,http://dbpedia.org/resource/AbbVie,AbbVie Inc.,37665564,AbbVie is an American publicly traded biopharm...
6,Abercrombie & Fitch,ANF,Retail Trade,Apparel/Footwear Retail,http://dbpedia.org/resource/Abercrombie_&_Fitch,Abercrombie & Fitch Co.,17255339,Abercrombie & Fitch (A&F) is an American lifes...
7,Accenture PLC,ACN,Technology Services,Information Technology Services,http://dbpedia.org/resource/Accenture,Accenture plc,299134,Accenture plc is an Irish-based multinational ...
9,Activision Blizzard,ATVI,Consumer Durables,Recreational Products,http://dbpedia.org/resource/Activision_Blizzard,"Activision Blizzard, Inc.",14527195,"Activision Blizzard, Inc. is an American video..."
10,Adobe,ADBE,Technology Services,Packaged Software,http://dbpedia.org/resource/Adobe_Inc.,Adobe Inc.,1955,"Adobe Inc. (/əˈdoʊbiː/ ə-DOH-bee), originally ..."


In [22]:
with pd.option_context("max_rows", None):
    display(stocks_incomplete_df)

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
8,ACE Convergence Acquisition Corp,ACEV,Finance,Financial Conglomerates,,,,
13,Affirm Holdings,AFRM,Technology Services,Internet Software/Services,,,,
16,Agilon Health,AGL,Health Services,Medical/Nursing Services,,,,
17,AGNC Investment,AGNC,Finance,Real Estate Investment Trusts,,,,
22,Ajax I,AJAX,Finance,Financial Conglomerates,,,,
26,Alexion Pharmaceuticals,ALXN,Health Technology,Biotechnology,,,,
29,Allot Communications,ALLT,Electronic Technology,Computer Communications,,,,
39,Ambev,ABEV,Consumer Non-Durables,Beverages: Alcoholic,,,,
41,Ameresco,AMRC,Industrial Services,Engineering & Construction,,,,
49,Americold Realty Trust,COLD,Finance,Real Estate Investment Trusts,,,,


In [23]:
QUERY = Template("""
SELECT *  
WHERE { ?endpoint dbp:tradedAs "NYSE:$ticker"@en;
                  dbp:name ?company_name;
                  dbo:wikiPageID ?wiki_page_id
        OPTIONAL {
            ?endpoint rdfs:comment ?comment 
            FILTER (lang(?comment) = "en")
        }
        FILTER (lang(?company_name ) = "en")           
      } LIMIT 1
""")

sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)

_stocks = stocks_incomplete_df.to_dict(orient="index")

for i, stock in tqdm_notebook(_stocks.items()):
    sparql.setQuery(QUERY.substitute(ticker=stock["ticker"]))
    response = sparql.query().convert()
    vars_ = response["head"]["vars"]
    bindings = response["results"]["bindings"]
    if bindings:
        binding = bindings[0]
        for var in vars_:
            stock[var] = binding[var]["value"] if var in binding else None

stocks_incomplete_df = pd.DataFrame.from_dict(_stocks, orient="index")

  0%|          | 0/357 [00:00<?, ?it/s]

In [24]:
info_captured = mask(stocks_incomplete_df)

with pd.option_context("max_rows", None):
    display(stocks_incomplete_df[info_captured].style.format({'endpoint': make_clickable}))

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
39,Ambev,ABEV,Consumer Non-Durables,Beverages: Alcoholic,http://dbpedia.org/resource/Ambev,Ambev S.A.,1303503,"Ambev, formally Companhia de Bebidas das Américas (""Americas' Beverage Company"" in English, hence the ""Ambev"" abbreviation), is a Brazilian brewing company now merged into Anheuser-Busch InBev. It was created on July 1, 1999, with the merger of two breweries, Brahma and Antarctica. The merger was approved by the board of directors of the Brazilian Administrative Council for Economic Defense (CADE) on March 30, 2000. The headquarters are in São Paulo, Brazil. It is one of the largest companies by market capitalization in Brazil and in the Southern hemisphere."
85,At Home Group,HOME,Retail Trade,Specialty Stores,http://dbpedia.org/resource/At_Home_(store),At Home Group Inc.,1094460,"At Home specializes in home decor products and is based in Plano, Texas and currently operates 225 stores in 40 states. The average store is 110,000 square feet. Each store offers up to 50,000 home products that range from furniture, mirrors, rugs, art and housewares to tabletop, patio and seasonal decor. In August 2016, the company went public."
105,Banco Bradesco S.A. – Preferred Shares,BBD,Finance,Major Banks,http://dbpedia.org/resource/Banco_Bradesco,Banco Bradesco S.A.,497670,"Banco Bradesco S.A. is a Brazilian financial services company headquartered in Osasco, in the state of São Paulo, Brazil. It is the third largest banking institution in Brazil, as well as the third largest in Latin America, and the seventy-ninth largest bank in the world. It is also one of fifty most valuable banks in the world. The bank is listed at the B3 in São Paulo, where it is part of the Índice Bovespa, in the New York Stock Exchange and in the Madrid Stock Exchange."
106,Banco Macro,BMA,Finance,Regional Banks,http://dbpedia.org/resource/Banco_Macro,Banco Macro S.A.,23272176,"Banco Macro is the second largest domestically-owned private bank in Argentina, and the sixth-largest by deposits and lending."
112,Bausch Health Companies,BHC,Health Technology,Pharmaceuticals: Other,http://dbpedia.org/resource/Bausch_Health,Bausch Health Companies Inc.,2350163,"Bausch Health Companies Inc. (formerly Valeant Pharmaceuticals) is a multinational specialty pharmaceutical company based in Laval, Quebec, Canada. It develops, manufactures and markets pharmaceutical products and branded generic drugs, primarily for skin diseases, gastrointestinal disorders, eye health and neurology. Bausch Health owns Bausch & Lomb, a supplier of eye health products."
114,BBVA Banco Francés,BBAR,Finance,Regional Banks,http://dbpedia.org/resource/BBVA_Argentina,BBVA Argentina,9008919,"BBVA Argentina, formerly BBVA Banco Francés, is a financial institution in Argentina."
138,BRF,BRFS,Consumer Non-Durables,Food: Meat/Fish/Dairy,http://dbpedia.org/resource/BRF_S.A.,BRF S.A.,22910400,"BRF S.A. is a Brazilian company. BRF is one of the biggest food companies in the world, with over 30 brands in its portfolio, among them Sadia, Perdigão, Qualy, Paty, Dánica and Bocatti. Its products are sold in over 150 countries, in all five continents. More than 100 thousand employees work at the company, which owns more than 50 factories in eight countries: Argentina, Brazil, United Arab Emirates, Netherlands, Malaysia, United Kingdom, Thailand and Turkey."
142,Brookfield Asset Management Inc.,BAM,Finance,Investment Managers,http://dbpedia.org/resource/Brookfield_Asset_Management,Brookfield Asset Management Inc.,832941,"Brookfield Asset Management Inc. is one of the world’s largest alternative asset management company with US$626bn of AUM. It focuses on direct control investments in real estate, renewable power, infrastructure, credit and private equity. The Company invests in distressed securities through Oaktree Capital, which it bought in 2019. The firm also launched a reinsurance business in 2020. Brookfield’s headquarters is located in Toronto, and it also has corporate offices in New York City, London, São Paulo, Mumbai, Shanghai, Dubai, and Sydney."
145,Brookfield Infrastructure Partners,BIP,Utilities,Alternative Power Generation,http://dbpedia.org/resource/Brookfield_Infrastructure_Partners,Brookfield Infrastructure Partners L.P.,28118881,"Brookfield Infrastructure Partners L.P. is a publicly traded limited partnership with corporate headquarters in Toronto, Canada, that engages in the acquisition and management of infrastructure assets on a global basis. Until a spin-off in January 2008, Brookfield Infrastructure was an operating unit of Brookfield Asset Management, which retains a 30 percent ownership and acts as the partnership's general manager. The company's assets carried a book value of US$21.3 billion, on December 31, 2016."
147,Brookfield Renewable Partners,BEP,Utilities,Electric Utilities,http://dbpedia.org/resource/Brookfield_Renewable_Partners,Brookfield Renewable Partners L.P.,28560454,"Brookfield Renewable Partners L.P. is a publicly traded limited partnership that owns and operates renewable power assets, with corporate headquarters in Toronto, Ontario, Canada. It is 60% owned by Brookfield Asset Management. As of the end of 2017, Brookfield Renewable owned over 200 hydroelectric plants, 100 wind farms, over 550 solar facilities, and four storage facilities, with approximately 16,400 MW of installed capacity."


In [25]:
print(len(stocks_incomplete_df[info_captured]), len(stocks_incomplete_df[~info_captured]))

113 244


In [26]:
with pd.option_context("max_rows", None):
    display(stocks_incomplete_df.loc[[367, 481, 739, 793, 876]].style.format({'endpoint': make_clickable}))

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
367,Gartner,IT,Commercial Services,Miscellaneous Commercial Services,http://dbpedia.org/resource/Gartner,"Gartner, Inc.",250363,"Gartner, Inc, officially known as Gartner, is a global research and advisory firm providing information, advice, and tools for leaders in IT, finance, HR, customer service and support, communications, legal and compliance, marketing, sales, and supply chain functions. Its headquarters are in Stamford, Connecticut, United States. The firm changed its name from Gartner Group, Inc to Gartner in 2000. It is a member of the S&P 500."
481,Just Eat Takeaway,GRUB,Retail Trade,Specialty Stores,http://dbpedia.org/resource/Grubhub,Grubhub Inc.,40439540,"Grubhub Inc. is an American online and mobile prepared food ordering and delivery platform owned by Just Eat Takeaway that connects diners with local restaurants. The company is based in Chicago, Illinois and was founded in 2004. Their slogan is ""grub what you love."" As of 2019, the company had 19.9 million active users and 115,000 associated restaurants across 3,200 cities and all 50 states in the United States. Grubhub Seamless went public in April 2014 and is traded on the New York Stock Exchange (NYSE) under the ticker symbol ""GRUB""."
739,Sogou,SOGO,Technology Services,Internet Software/Services,http://dbpedia.org/resource/Sogou,搜狗公司,5367330,"Sogou, Inc. (Chinese: 搜狗; pinyin: Sōugǒu; lit. 'Search-dog') is a Chinese technology company that offers a search engine. It is a subsidiary of Sohu, Inc. The offices of Sogou are located on the southeast corner of Tsinghua University in Beijing. Sogou also has offices in Chengdu co-located with Tencent's office building. In April 2018, Sogou established a R&D center in Guangzhou."
793,Teva Pharmaceutical Industries,TEVA,Health Technology,Pharmaceuticals: Generic,http://dbpedia.org/resource/Teva_Pharmaceuticals,,5460692,"Teva Pharmaceutical Industries Ltd., also known as Teva Pharmaceuticals, is an Israeli-American multinational pharmaceutical company with headquarters in Petah Tikva, Israel and Parsippany, New Jersey, United States. It specializes primarily in generic drugs, but other business interests include active pharmaceutical ingredients and, to a lesser extent, proprietary pharmaceuticals. In 2020, Teva Pharmaceuticals was the second largest generic drug manufacturer and the eighteenth largest pharmaceutical company in the world."
876,Weibo,WB,Technology Services,Internet Software/Services,http://dbpedia.org/resource/Wachovia,Wachovia,20598893,"Wachovia was a diversified financial services company based in Charlotte, North Carolina. Before its acquisition by Wells Fargo and Company in 2008, Wachovia was the fourth-largest bank holding company in the United States, based on total assets. Wachovia provided a broad range of banking, asset management, wealth management, and corporate and investment banking products and services. At its height, it was one of the largest providers of financial services in the United States, operating financial centers in 21 states and Washington, D.C., with locations from Connecticut to Florida and west to California. Wachovia provided global services through more than 40 offices around the world."


In [27]:
# Minor corrections
stocks_incomplete_df.loc[367, "endpoint"] = "https://dbpedia.org/resource/Gartner"
stocks_incomplete_df.loc[367, "company_name"] = "Gartner, Inc."
stocks_incomplete_df.loc[367, "wiki_page_id"] = 250363
stocks_incomplete_df.loc[367, "comment"] = "Gartner, Inc, officially known as Gartner, is a Stamford, Connecticut-based technology research and consulting company. The company's products and services include research, executive programs, consulting, and conferences. Gartner clients include large corporations, government agencies, technology companies, and the investment community. In 2018, the company reported that its client base consisted of over 12,000 organizations in over 100 countries."

stocks_incomplete_df.loc[481, "endpoint"] = "https://dbpedia.org/resource/Just_Eat_Takeaway"
stocks_incomplete_df.loc[481, "company_name"] = "Just Eat Takeaway.com NV"
stocks_incomplete_df.loc[481, "wiki_page_id"] = 37100246
stocks_incomplete_df.loc[481, "comment"] = "Just Eat Takeaway.com N.V., formerly Thuisbezorgd.nl, and Takeaway.com, is a British-Dutch dot-com company specialising in online food ordering and home delivery. Takeaway.com is an intermediary online portal between the customer and the restaurants, where customers can order food online from restaurants’ menus, and have it delivered by the restaurants directly to their home."

stocks_incomplete_df.loc[706, "company_name"] = "Royal Bank of Canada"

stocks_incomplete_df.loc[739, "company_name"] = "Sogou, Inc."

stocks_incomplete_df.loc[793, "company_name"] = "Teva Pharmaceutical Industries Ltd."

In [28]:
with pd.option_context("max_rows", None):
    display(stocks_incomplete_df[~info_captured])

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
8,ACE Convergence Acquisition Corp,ACEV,Finance,Financial Conglomerates,,,,
13,Affirm Holdings,AFRM,Technology Services,Internet Software/Services,,,,
16,Agilon Health,AGL,Health Services,Medical/Nursing Services,,,,
17,AGNC Investment,AGNC,Finance,Real Estate Investment Trusts,,,,
22,Ajax I,AJAX,Finance,Financial Conglomerates,,,,
26,Alexion Pharmaceuticals,ALXN,Health Technology,Biotechnology,,,,
29,Allot Communications,ALLT,Electronic Technology,Computer Communications,,,,
41,Ameresco,AMRC,Industrial Services,Engineering & Construction,,,,
49,Americold Realty Trust,COLD,Finance,Real Estate Investment Trusts,,,,
52,Amicus Therapeutics,FOLD,Health Technology,Biotechnology,,,,


In [29]:
stocks_complete_df = pd.concat([stocks_complete_df, 
                                stocks_incomplete_df[info_captured]]) \
                         .sort_index()

stocks_incomplete_df = stocks_incomplete_df[~info_captured]

In [30]:
stocks_incomplete_df = (stocks_incomplete_df.append(stocks_complete_df.loc[876])                                
                                            .sort_index())

stocks_complete_df = stocks_complete_df.drop(index=876)

In [31]:
with pd.option_context("max_rows", None):
    display(stocks_incomplete_df)

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
8,ACE Convergence Acquisition Corp,ACEV,Finance,Financial Conglomerates,,,,
13,Affirm Holdings,AFRM,Technology Services,Internet Software/Services,,,,
16,Agilon Health,AGL,Health Services,Medical/Nursing Services,,,,
17,AGNC Investment,AGNC,Finance,Real Estate Investment Trusts,,,,
22,Ajax I,AJAX,Finance,Financial Conglomerates,,,,
26,Alexion Pharmaceuticals,ALXN,Health Technology,Biotechnology,,,,
29,Allot Communications,ALLT,Electronic Technology,Computer Communications,,,,
41,Ameresco,AMRC,Industrial Services,Engineering & Construction,,,,
49,Americold Realty Trust,COLD,Finance,Real Estate Investment Trusts,,,,
52,Amicus Therapeutics,FOLD,Health Technology,Biotechnology,,,,


In [32]:
stocks_incomplete_df[stocks_incomplete_df.ticker.isin(stocks_in_sample) & ~info_captured]

  stocks_incomplete_df[stocks_incomplete_df.ticker.isin(stocks_in_sample) & ~info_captured]


Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
463,iQIYI,IQ,Technology Services,Internet Software/Services,,,,
649,Petroleo Brasileiro S.A.,PBR,Energy Minerals,Integrated Oil,,,,
738,SoFi Technologies,SOFI,Finance,Finance/Rental/Leasing,,,,


In [33]:
QUERY = Template("""
SELECT *  
WHERE { ?endpoint dbp:tradedAs "NASDAQ:$ticker"@en;
                  dbp:name ?company_name;
                  dbo:wikiPageID ?wiki_page_id
        OPTIONAL {
            ?endpoint rdfs:comment ?comment 
            FILTER (lang(?comment) = "en")
        }
        FILTER (lang(?company_name ) = "en")        
      } LIMIT 1
""")

sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)

_stocks = stocks_incomplete_df.to_dict(orient="index")

for i, stock in tqdm_notebook(_stocks.items()):
    sparql.setQuery(QUERY.substitute(ticker=stock["ticker"]))
    response = sparql.query().convert()
    vars_ = response["head"]["vars"]
    bindings = response["results"]["bindings"]
    if bindings:
        binding = bindings[0]
        for var in vars_:
            stock[var] = binding[var]["value"] if var in binding else None

stocks_incomplete_df = pd.DataFrame.from_dict(_stocks, orient="index")

  0%|          | 0/245 [00:00<?, ?it/s]

In [34]:
info_captured = mask(stocks_incomplete_df)

with pd.option_context("max_rows", None):
    display(stocks_incomplete_df[info_captured].style.format({'endpoint': make_clickable}))

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
13,Affirm Holdings,AFRM,Technology Services,Internet Software/Services,http://dbpedia.org/resource/Affirm_(company),Affirm,60320967,"Affirm is a publicly traded financial technology company headquartered in San Francisco, United States. Founded in 2012, the company operates as a financial lender of installment loans for consumers to use at the point of sale to finance a purchase."
26,Alexion Pharmaceuticals,ALXN,Health Technology,Biotechnology,http://dbpedia.org/resource/Alexion_Pharmaceuticals,Alexion Pharmaceuticals,31443674,"Alexion Pharmaceuticals, a subsidiary of AstraZeneca, is an American pharmaceutical company headquartered in Boston, Massachusetts that specializes in orphan drugs to treat rare diseases. Its products include eculizumab (Soliris) with $4.064 billion in 2020 revenues and ravulizumab (Ultomiris) with $1.076 billion in 2020 revenues, both used to treat the rare disorders of atypical hemolytic uremic syndrome (aHUS) and paroxysmal nocturnal hemoglobinuria (PNH); asfotase alfa (Strensiq) with $731 million in 2020 revenues, used to treat hypophosphatasia; sebelipase alfa (Kanuma) with $117 million in 2020 revenues, used to treat lysosomal acid lipase deficiency, and andexanet alfa (Andexxa) with $78 million in 2020 revenues, used to stop life threatening or uncontrollable bleeding in people who"
52,Amicus Therapeutics,FOLD,Health Technology,Biotechnology,http://dbpedia.org/resource/Amicus_Therapeutics,"Amicus Therapeutics, Inc.",44432921,"Amicus Therapeutics is a public American biopharmaceutical company based in Philadelphia, PA. The company went public in 2007 under the NASDAQ trading symbol FOLD. This followed a 2006 planned offering and subsequent withdrawal, which would have established the trading symbol as AMTX Prior to their IPO, Amicus was funded by a variety of venture capital firms including Radius Ventures, Canaan Partners and New Enterprise Associates."
57,ANGI Homeservices,ANGI,Technology Services,Internet Software/Services,http://dbpedia.org/resource/ANGI_Homeservices,Angi Inc.,4041255,"Angi Inc. is an internet services company formed in 2017 by the merger of Angie's List and HomeAdvisor. The company has its earliest roots in American home services website Angie's List, founded in 1995 as an online directory that allows users to read and publish crowd-sourced reviews of local businesses and contractors. In March 2019, Angi moved its corporate headquarters to Denver, Colorado."
69,Appian Corporation,APPN,Technology Services,Packaged Software,http://dbpedia.org/resource/Appian_Corporation,Appian Corporation,54247317,"Appian Corporation is a cloud computing and enterprise software company headquartered in McLean, Virginia, part of the Dulles Technology Corridor. The company sells a platform as a service (PaaS) for building enterprise software applications. It is focused on low-code development, business process management, and case management markets."
77,Arcturus Therapeutics Holdings Inc,ARCT,Health Technology,Pharmaceuticals: Major,http://dbpedia.org/resource/Arcturus_Therapeutics,Arcturus Therapeutics,48080633,"Arcturus Therapeutics is an American RNA medicines biotechnology company focused on the discovery, development and commercialization of therapeutics for rare diseases and infectious diseases. Arcturus has developed a novel, potent, and safe RNA therapeutics platform called LUNAR, a proprietary lipid-enabled delivery system for nucleic acid medicines including small interfering RNA (siRNA), messenger RNA (mRNA), gene editing RNA, DNA, antisense oligonucleotides (ASO), and microRNA."
81,Arrival Group,ARVL,Consumer Durables,Motor Vehicles,http://dbpedia.org/resource/Arrival_(company),Arrival Ltd,52345762,"Arrival Ltd is a British-American electric vehicle manufacturer headquartered in London, UK, of primarily lightweight commercial vehicles. In June 2020, Arrival announced a new passenger bus designed for coronavirus-era social distancing. R&D takes place at their facility in Banbury, Oxfordshire. In March 2020, Arrival acquired a new factory in Bicester with plans to be operational by 2021 and start production in 2022. In December 2020, Arrival established its North American headquarters in Charlotte, NC, USA."
84,AstraZeneca PLC,AZN,Health Technology,Pharmaceuticals: Major,http://dbpedia.org/resource/AstraZeneca,AstraZeneca plc,441872,"AstraZeneca plc (/ˌæstrəˈzɛnəkə/) is a British-Swedish multinational pharmaceutical and biotechnology company with its headquarters at the Cambridge Biomedical Campus in Cambridge, England. It has a portfolio of products for major diseases in areas including oncology, cardiovascular, gastrointestinal, infection, neuroscience, respiratory, and inflammation. It is perhaps best known for its involvement in developing the Oxford-AstraZeneca COVID-19 vaccine."
156,Cadence Design Systems,CDNS,Technology Services,Packaged Software,http://dbpedia.org/resource/Cadence_Design_Systems,"Cadence Design Systems, Inc.",11678999,"Cadence Design Systems, Inc. (stylized as cādence), headquartered in San Jose, California, is an American multinational computational software company, founded in 1988 by the merger of SDA Systems and ECAD, Inc. The company produces software, hardware and silicon structures for designing integrated circuits, systems on chips (SoCs) and printed circuit boards."
161,Canoo,GOEV,Consumer Durables,Motor Vehicles,http://dbpedia.org/resource/Canoo,Canoo Inc.,65364144,"Canoo is a startup American manufacturer of electric vehicles. The company plans to sell a minivan in the year 2022. The company's also plans to produce commercial electric vehicles such as vans for vehicle rental and ride sharing services. The company is located in Torrance, California, near Los Angeles, California."


In [35]:
print(len(stocks_incomplete_df[info_captured]), len(stocks_incomplete_df[~info_captured]))

62 183


In [36]:
# Minor corrections

stocks_incomplete_df.loc[502, "endpoint"] = "https://dbpedia.org/resource/Lattice_Semiconductor"
stocks_incomplete_df.loc[502, "company_name"] = "Lattice Semiconductor Corporation"
stocks_incomplete_df.loc[502, "wiki_page_id"] = 1081229
stocks_incomplete_df.loc[502, "comment"] = None

stocks_incomplete_df.loc[532, "endpoint"] = "https://dbpedia.org/resource/Marvell_Technology,_Inc."
stocks_incomplete_df.loc[532, "wiki_page_id"] = 5276522
stocks_incomplete_df.loc[532, "company_name"] = "Marvell Technology, Inc."
stocks_incomplete_df.loc[532, "comment"] = None

stocks_incomplete_df.loc[680, "company_name"] = "Qurate Retail, Inc."

In [37]:
stocks_complete_df = pd.concat([stocks_complete_df, 
                                stocks_incomplete_df[info_captured]]) \
                         .sort_index()

stocks_incomplete_df = stocks_incomplete_df[~info_captured]

In [38]:
with pd.option_context("max_rows", None):
    display(stocks_incomplete_df)

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
8,ACE Convergence Acquisition Corp,ACEV,Finance,Financial Conglomerates,,,,
16,Agilon Health,AGL,Health Services,Medical/Nursing Services,,,,
17,AGNC Investment,AGNC,Finance,Real Estate Investment Trusts,,,,
22,Ajax I,AJAX,Finance,Financial Conglomerates,,,,
29,Allot Communications,ALLT,Electronic Technology,Computer Communications,,,,
41,Ameresco,AMRC,Industrial Services,Engineering & Construction,,,,
49,Americold Realty Trust,COLD,Finance,Real Estate Investment Trusts,,,,
53,Amneal Pharmaceuticals,AMRX,Health Technology,Pharmaceuticals: Major,,,,
62,Antero Midstream Partners LP,AM,Industrial Services,Oil & Gas Pipelines,,,,
67,Apollo Commercial Real Estate Finance,ARI,Finance,Real Estate Investment Trusts,,,,


In [39]:
stocks_incomplete_df[stocks_incomplete_df.ticker.isin(stocks_in_sample) & ~info_captured]

  stocks_incomplete_df[stocks_incomplete_df.ticker.isin(stocks_in_sample) & ~info_captured]


Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
649,Petroleo Brasileiro S.A.,PBR,Energy Minerals,Integrated Oil,,,,


In [40]:
QUERY = Template("""
SELECT *  
WHERE {
       ?endpoint a dbo:Company ;
                rdfs:label     ?label ;
                dbp:name       ?company_name ;
                dbo:wikiPageID ?wiki_page_id 
       OPTIONAL {
            ?endpoint rdfs:comment ?comment 
            FILTER (lang(?comment) = "en")
       }
       FILTER (regex(?label, "^$pattern", "i") && (lang(?company_name ) = "en"))  
                    
} LIMIT 1
""")

sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)

_stocks = stocks_incomplete_df.to_dict(orient="index")

for i, stock in tqdm_notebook(_stocks.items()):
    sparql.setQuery(QUERY.substitute(pattern=stock["name"])) 
    response = sparql.query().convert()
    bindings = response["results"]["bindings"]
    if bindings:
        print(stock["ticker"], len(bindings))
        binding = bindings[0]
        for var in ("endpoint", "company_name", "wiki_page_id", "comment"):
            stock[var] = binding[var]["value"] if var in binding else None
                
stocks_incomplete_df = pd.DataFrame.from_dict(_stocks, orient="index")

  0%|          | 0/183 [00:00<?, ?it/s]

ATR 1
AXTA 1
BLDP 1
BSBR 1
BRK.B 1
CSIQ 1
CARR 1
CHKAQ 1
CHL 1
COIN 1
BVN 1
CPNG 1
BAP 1
DISCK 1
DOCU 1
EPAM 1
FLT 1
HEI 1
HUN 1
IBN 1
JBHT 1
JKHY 1
JAZZ 1
LB 1
LVS 1
LPL 1
MXIM 1
MDB 1
MORN 1
NBIX 1
ODP 1
OKTA 1
OGN 1
PTEN 1
PBF 1
QLYS 1
RXT 1
RES 1
SBH 1
SEB 1
SQSP 1
SU 1
RUN 1
TRGP 1
TER 1
TIMB 1
UAA 1
VER 1
ZEN 1


In [41]:
info_captured = mask(stocks_incomplete_df)

with pd.option_context("max_rows", None):
    display(stocks_incomplete_df[info_captured].style.format({'endpoint': make_clickable}))

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
73,AptarGroup,ATR,Process Industries,Containers/Packaging,http://dbpedia.org/resource/AptarGroup,"AptarGroup, Inc.",61519578,"AptarGroup, Inc., also known as Aptar, is a United States-based global manufacturer of consumer dispensing packaging and drug delivery devices. The group has manufacturing operations in 18 countries."
98,Axalta Coating Systems,AXTA,Process Industries,Industrial Specialties,http://dbpedia.org/resource/Axalta,Axalta Coating Systems Ltd.,41274029,"Axalta Coating Systems, Ltd., also known as simply Axalta, is an American company specializing in coatings in a wide variety of industrial applications, materials and sectors, such as for example automotive paints. The company is based in Philadelphia, PA, and incorporated in Bermuda. Axalta develops and manufactures coatings for light and commercial vehicles, industrial, and refinish applications. The firm does business in 130 countries, has nearly 13,000 employees, and has more than 100,000 customers."
104,Ballard Power Systems,BLDP,Producer Manufacturing,Electrical Products,http://dbpedia.org/resource/Ballard_Power_Systems,Ballard Power Systems Inc.,233422,"Ballard Power Systems Inc. is a developer and manufacturer of proton exchange membrane (PEM) fuel cell products for markets such as heavy-duty motive (consisting of bus and tram applications), portable power, material handling as well as engineering services. Ballard has designed and shipped over 400 MW of fuel cell products to date."
107,Banco Santander (Brasil),BSBR,Finance,Regional Banks,http://dbpedia.org/resource/Santander_Brasil,Banco Santander S.A.,24232108,"Banco Santander (Brasil) S.A. is the Brazilian subsidiary of the Spanish Santander Group, headquartered in São Paulo, Brazil. It is the fifth largest banking institution in Brazil, as well as the fifth largest in Latin America, and the largest division of the group outside Europe, accounting for around 30% of its financial results globally by 2019. The bank is listed at the B3 in São Paulo, and at NYSE though ADRs."
117,Berkshire Hathaway,BRK.B,Finance,Multi-Line Insurance,http://dbpedia.org/resource/Berkshire_Hathaway_Assurance,Berkshire Hathaway Assurance,15031934,"Berkshire Hathaway Assurance is a bond insurance company created by Berkshire Hathaway, Inc. in December 2007."
158,Canadian Solar,CSIQ,Producer Manufacturing,Electrical Products,http://dbpedia.org/resource/Canadian_Solar,Canadian Solar Inc.,31697245,Canadian Solar Inc. is a publicly traded company that manufactures solar PV modules and runs large scale solar projects.
167,Carrier Global Corp,CARR,Producer Manufacturing,Industrial Machinery,http://dbpedia.org/resource/Carrier_Global,Carrier Global Corporation,2728694,"Carrier Global Corporation is an American multinational home appliances corporation based in Palm Beach Gardens, Florida. Carrier was founded in 1915 as an independent company manufacturing and distributing heating, ventilating and air conditioning (HVAC) systems, and has since expanded to include manufacturing commercial refrigeration and foodservice equipment, and fire and security technologies. As of 2020, it was an $18.6 billion company with over 53,000 employees serving customers in 160 countries on six continents."
188,Chesapeake Energy,CHKAQ,Energy Minerals,Integrated Oil,http://dbpedia.org/resource/Chesapeake_Energy,Chesapeake Energy Corporation,1942813,"Chesapeake Energy Corporation is an American energy company engaged in hydrocarbon exploration. It is headquartered in Oklahoma City. The company is named after the founder's love for the Chesapeake Bay region. The company is ranked 373rd on the Fortune 500. According to a 2017 study, it was the 90th most polluting company in the world, being responsible for 0.1% of global industrial greenhouse gas emissions from 1988 to 2015."
193,China Mobile,CHL,Communications,Telecommunications Equipment,http://dbpedia.org/resource/China_Mobile,China Mobile Limited,375479,"China Mobile is the trade name of both China Mobile Limited (Chinese: 中国移动有限公司; pinyin: Zhōngguó Yídòng Yǒu Xiàn Gōngsī) and its ultimate controlling shareholder, China Mobile Communications Group Co., Ltd. (Chinese: 中国移动通信集团有限公司; pinyin: Zhōngguó Yídòng Tōngxìn Jítuán Gōngsī, formerly known as China Mobile Communications Corporation, ""CMCC""), a Chinese state-owned company. China Mobile Limited provides mobile voice and multimedia services through its nationwide mobile telecommunications network across mainland China and Hong Kong."
219,Coinbase,COIN,Technology Services,Packaged Software,http://dbpedia.org/resource/Coinbase,"Coinbase Global, Inc.",39596725,"Coinbase Global, Inc., branded Coinbase, is an American company that operates a cryptocurrency exchange platform. Coinbase operates remote-first, and lacks an official physical headquarters. The company was founded in 2012 by Brian Armstrong and Fred Ehrsam, and as of March 2021 was the largest cryptocurrency exchange in the United States by trading volume. On April 14, 2021, Coinbase went public on the Nasdaq exchange via a direct listing."


In [42]:
print(len(stocks_incomplete_df[info_captured]), len(stocks_incomplete_df[~info_captured]))

49 134


In [43]:
with pd.option_context("max_rows", None):
    display(stocks_incomplete_df.loc[[499, 566, 618]].style.format({'endpoint': make_clickable}))

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
499,L Brands,LB,Retail Trade,Apparel/Footwear Retail,"http://dbpedia.org/resource/Bath_&_Body_Works,_Inc.","Bath & Body Works, Inc.",557811,
566,Morningstar,MORN,Finance,Investment Managers,http://dbpedia.org/resource/Morningstar_Farms,Morningstar Farms,3157347,"Morningstar Farms (stylized as MorningStar Farms) is a division of the Kellogg Company that produces vegan and vegetarian food. Many of their offerings are plant-based variations of traditionally meat products, including some that are vegan. Their products include meatless chicken nuggets, popcorn chicken, corn dogs, breakfast sausage, burgers, hot dogs, bacon, and pizza snack rolls with vegan cheese. Some, but not all products are vegan; Morningstar Farms announced all products would be vegan by 2021."
618,Okta,OKTA,Technology Services,Packaged Software,http://dbpedia.org/resource/Cholesterol_side-chain_cleavage_enzyme,cholesterol monooxygenase,6796314,"Cholesterol side-chain cleavage enzyme is commonly referred to as P450scc, where ""scc"" is an acronym for side-chain cleavage. P450scc is a mitochondrial enzyme that catalyzes conversion of cholesterol to pregnenolone. This is the first reaction in the process of steroidogenesis in all mammalian tissues that specialize in the production of various steroid hormones. P450scc is a member of the cytochrome P450 superfamily of enzymes (family 11, subfamily A, polypeptide 1). The gene name is CYP11A1."


In [44]:
stocks_complete_df = pd.concat([stocks_complete_df, 
                                stocks_incomplete_df[info_captured]]) \
                         .sort_index()

stocks_incomplete_df = stocks_incomplete_df[~info_captured]

In [45]:
stocks_incomplete_df = (stocks_incomplete_df.append(stocks_complete_df.loc[[499, 566, 618]])                                
                                            .sort_index())

stocks_complete_df = stocks_complete_df.drop(index=[499, 566, 618])

In [46]:
stocks_incomplete_df.loc[[499, 566, 618]]

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
499,L Brands,LB,Retail Trade,Apparel/Footwear Retail,"http://dbpedia.org/resource/Bath_&_Body_Works,...","Bath & Body Works, Inc.",557811,
566,Morningstar,MORN,Finance,Investment Managers,http://dbpedia.org/resource/Morningstar_Farms,Morningstar Farms,3157347,Morningstar Farms (stylized as MorningStar Far...
618,Okta,OKTA,Technology Services,Packaged Software,http://dbpedia.org/resource/Cholesterol_side-c...,cholesterol monooxygenase,6796314,Cholesterol side-chain cleavage enzyme is comm...


In [47]:
len(stocks_complete_df)

778

In [48]:
len(stocks_incomplete_df)

137

In [49]:
stocks_incomplete_df[stocks_incomplete_df.ticker.isin(stocks_in_sample)]

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
649,Petroleo Brasileiro S.A.,PBR,Energy Minerals,Integrated Oil,,,,


In [50]:
stocks_complete_df[stocks_complete_df.ticker.isin(stocks_in_sample)]

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
12,Advanced Micro Devices,AMD,Electronic Technology,Semiconductors,http://dbpedia.org/resource/Advanced_Micro_Dev...,"Advanced Micro Devices, Inc.",2400,"Advanced Micro Devices, Inc. (AMD) is an Ameri..."
25,Alcoa,AA,Non-Energy Minerals,Aluminum,http://dbpedia.org/resource/Alcoa,Alcoa Corporation,479944,Alcoa Corporation (a portmanteau of Aluminum C...
27,Alibaba,BABA,Retail Trade,Internet Retail,http://dbpedia.org/resource/Alibaba_Group,Alibaba Group Holding Limited,2430273,"Alibaba Group Holding Limited, also known as A..."
39,Ambev,ABEV,Consumer Non-Durables,Beverages: Alcoholic,http://dbpedia.org/resource/Ambev,Ambev S.A.,1303503,"Ambev, formally Companhia de Bebidas das Améri..."
40,AMC,AMC,Consumer Services,Movies/Entertainment,http://dbpedia.org/resource/AMC_Theatres,"AMC Entertainment Holdings, Inc.",563739,"AMC Entertainment Holdings, Inc. (d/b/a AMC Th..."
42,American Airlines,AAL,Transportation,Airlines,http://dbpedia.org/resource/American_Airlines_...,American Airlines Group Inc.,38521579,American Airlines Group Inc. is an American pu...
70,Apple,AAPL,Electronic Technology,Telecommunications Equipment,http://dbpedia.org/resource/Apple_Inc.,Apple Inc.,856,Apple Inc. is an American multinational techno...
86,AT&T,T,Communications,Major Telecommunications,http://dbpedia.org/resource/AT&T,AT&T Inc.,17555269,AT&T Inc. is an American multinational conglom...
105,Banco Bradesco S.A. – Preferred Shares,BBD,Finance,Major Banks,http://dbpedia.org/resource/Banco_Bradesco,Banco Bradesco S.A.,497670,Banco Bradesco S.A. is a Brazilian financial s...
108,Bank of America,BAC,Finance,Major Banks,http://dbpedia.org/resource/Bank_of_America,Bank of America,347756,The Bank of America Corporation (simply referr...


In [51]:
info_captured = mask(stocks_incomplete_df)

with pd.option_context("max_rows", None, "max_colwidth", 40):
    display(stocks_incomplete_df[info_captured])

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
499,L Brands,LB,Retail Trade,Apparel/Footwear Retail,http://dbpedia.org/resource/Bath_&_B...,"Bath & Body Works, Inc.",557811,
566,Morningstar,MORN,Finance,Investment Managers,http://dbpedia.org/resource/Mornings...,Morningstar Farms,3157347,Morningstar Farms (stylized as Morni...
618,Okta,OKTA,Technology Services,Packaged Software,http://dbpedia.org/resource/Choleste...,cholesterol monooxygenase,6796314,Cholesterol side-chain cleavage enzy...


In [52]:
len(stocks_complete_df)

778

In [53]:
# Re-setting dataframe
stocks_incomplete_df = stocks_df.iloc[list(set(stocks_df.index) 
                                               - set(stocks_complete_df.index))] \
                                .sort_index()

# Re-setting columns
for col in ["endpoint", "company_name", "wiki_page_id", "comment"]:
    stocks_incomplete_df[col] = None

print(len(stocks_incomplete_df))

with pd.option_context("max_rows", None, "max_colwidth", 40):
    display(stocks_incomplete_df)

137


Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
8,ACE Convergence Acquisition Corp,ACEV,Finance,Financial Conglomerates,,,,
16,Agilon Health,AGL,Health Services,Medical/Nursing Services,,,,
17,AGNC Investment,AGNC,Finance,Real Estate Investment Trusts,,,,
22,Ajax I,AJAX,Finance,Financial Conglomerates,,,,
29,Allot Communications,ALLT,Electronic Technology,Computer Communications,,,,
41,Ameresco,AMRC,Industrial Services,Engineering & Construction,,,,
49,Americold Realty Trust,COLD,Finance,Real Estate Investment Trusts,,,,
53,Amneal Pharmaceuticals,AMRX,Health Technology,Pharmaceuticals: Major,,,,
62,Antero Midstream Partners LP,AM,Industrial Services,Oil & Gas Pipelines,,,,
67,Apollo Commercial Real Estate Finance,ARI,Finance,Real Estate Investment Trusts,,,,


In [54]:
stocks_incomplete_df.loc[[401, 649]]

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
401,Harmony Gold Mining Company Limited,HMY,Non-Energy Minerals,Precious Metals,,,,
649,Petroleo Brasileiro S.A.,PBR,Energy Minerals,Integrated Oil,,,,


In [55]:
# Manual additions

stocks_incomplete_df.loc[401, "endpoint"] = "https://dbpedia.org/resource/Harmony_Gold_USA"
stocks_incomplete_df.loc[401, "company_name"] = "Harmony Gold USA, Inc."
stocks_incomplete_df.loc[401, "wiki_page_id"] = 1426546
stocks_incomplete_df.loc[401, "comment"] = "Harmony Gold USA, Inc. is an American film and television production company. It was founded in 1983 by Egyptian-born Frank Agrama and is managed by his daughter, Jehan F. Agrama. In 1976, Agrama sold broadcast rights from Paramount Pictures to the Mediaset media conglomerate. It is best known as the distributor of the controversial Shaka Zulu miniseries and for various anime series, notably Robotech. In addition to its distribution and production interests, Harmony Gold manages several real estate properties in the Southern California area. They also operate a screening room in Los Angeles."

stocks_incomplete_df.loc[649, "endpoint"] = "https://dbpedia.org/resource/Petrobras"
stocks_incomplete_df.loc[649, "company_name"] = "Petróleo Brasileiro S.A. — Petrobras"
stocks_incomplete_df.loc[649, "wiki_page_id"] = 1764358
stocks_incomplete_df.loc[649, "comment"] = "Petróleo Brasileiro S.A., better known by the acronym Petrobras (Portuguese pronunciation: [ˌpɛtɾoˈbɾas ]), is a state-owned Brazilian multinational corporation in the petroleum industry headquartered in Rio de Janeiro, Brazil. The company's name translates to Brazilian Petroleum Corporation — Petrobras.The company was ranked #120 in the most recent Fortune Global 500 list. In the 2020 Forbes Global 2000, Petrobras was ranked as the 70th -largest public company in the world."

stocks_incomplete_df.loc[[401, 649]]

Unnamed: 0,name,ticker,sector,industry,endpoint,company_name,wiki_page_id,comment
401,Harmony Gold Mining Company Limited,HMY,Non-Energy Minerals,Precious Metals,https://dbpedia.org/resource/Harmony_Gold_USA,"Harmony Gold USA, Inc.",1426546,"Harmony Gold USA, Inc. is an American film and..."
649,Petroleo Brasileiro S.A.,PBR,Energy Minerals,Integrated Oil,https://dbpedia.org/resource/Petrobras,Petróleo Brasileiro S.A. — Petrobras,1764358,"Petróleo Brasileiro S.A., better known by the ..."


In [56]:
stocks_complete_df = (stocks_complete_df.append(stocks_incomplete_df.loc[[401, 649]])                                
                                        .sort_index())

stocks_incomplete_df = stocks_incomplete_df.drop(index=[401, 649])

print(len(stocks_complete_df))
print(len(stocks_incomplete_df))

780
135


In [57]:
col_mappings = {"name":"stock_name", "ticker": "ticker_symbol", "endpoint": "dbpedia_endpoint"}

stocks_complete_df = stocks_complete_df.rename(columns=col_mappings)
stocks_incomplete_df = stocks_incomplete_df.rename(columns=col_mappings)

In [58]:
stocks_complete_df

Unnamed: 0,stock_name,ticker_symbol,sector,industry,dbpedia_endpoint,company_name,wiki_page_id,comment
0,21st Century Fox,FOXA,Consumer Services,Broadcasting,http://dbpedia.org/resource/Fox_Corporation,Fox Corporation,58734970,Fox Corporation is an American mass media comp...
1,23andMe Holding Co,ME,Finance,Financial Conglomerates,http://dbpedia.org/resource/23andMe,"23andMe, Inc.",11979083,"23andMe, Inc. is a publicly held personal geno..."
2,2U,TWOU,Technology Services,Packaged Software,http://dbpedia.org/resource/2U_(company),"2U, Inc.",37371846,"2U, Inc. (formerly 2tor Inc.) is an American e..."
3,3M,MMM,Producer Manufacturing,Industrial Conglomerates,http://dbpedia.org/resource/3M,3M Company,7664801,The 3M Company is an American multinational co...
4,Abbott Labs,ABT,Health Technology,Medical Specialties,http://dbpedia.org/resource/Abbott_Laboratories,Abbott Laboratories,488730,Abbott Laboratories is an American multination...
...,...,...,...,...,...,...,...,...
909,ZipRecruiter,ZIP,Technology Services,Internet Software/Services,http://dbpedia.org/resource/ZipRecruiter,ZipRecruiter,59966275,ZipRecruiter is an American employment marketp...
910,Zoetis,ZTS,Health Technology,Pharmaceuticals: Generic,http://dbpedia.org/resource/Zoetis,Zoetis Inc.,38395838,Zoetis Inc. (/zō-EH-tis/) is an American drug ...
911,Zoom,ZM,Technology Services,Packaged Software,http://dbpedia.org/resource/Zoom_Video_Communi...,"Zoom Video Communications, Inc.",43358530,"Zoom Video Communications, Inc. (stylized as z..."
912,Zscaler,ZS,Technology Services,Packaged Software,http://dbpedia.org/resource/Zscaler,"Zscaler, Inc.",65138718,Zscaler (/ˈziːˌskeɪlər/) is an American cloud-...


In [59]:
stocks_complete_df.wiki_page_id = stocks_complete_df.wiki_page_id.astype(dtype=int)

In [60]:
stocks_complete_df.to_json("../../data/processed/stocks/revolut.2021-07-05.complete.jsonl", 
                           orient="records", lines=True)
stocks_incomplete_df.to_json("../../data/processed/stocks/revolut.2021-07-05.incomplete.jsonl", 
                           orient="records", lines=True)