# Importing needed libraries

In [1]:
# Basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import date

In [26]:
# !pip install selenium

In [27]:
# For scraping
import pandas as pd
import numpy as np
from time import sleep
from random import randint
from bs4 import BeautifulSoup
import requests
from selenium import webdriver

# Loading the datasets

In [3]:
df_industries = pd.read_csv('./Raw Data/industries.csv', delimiter= ';')
df_companies = pd.read_csv('./Raw Data/us-companies.csv', delimiter= ';')
df_balance = pd.read_csv('./Raw Data/us-balance-quarterly.csv', delimiter= ';')
df_cashflow = pd.read_csv('./Raw Data/us-cashflow-quarterly.csv', delimiter= ';')
# df_shareprices = pd.read_csv('./Raw Data/us-shareprices-daily.csv', delimiter= ';')

# EDA & Cleaning

# Industries

In [4]:
display(df_industries.head())
display(df_industries.isna().sum())
display(df_industries.shape)

Unnamed: 0,IndustryId,Sector,Industry
0,100001,Industrials,Industrial Products
1,100002,Industrials,Business Services
2,100003,Industrials,Engineering & Construction
3,100004,Industrials,Waste Management
4,100005,Industrials,Industrial Distribution


IndustryId    0
Sector        0
Industry      0
dtype: int64

(74, 3)

In [28]:
# df_industries['Industry'].unique()

In [29]:
# df_industries['Sector'].unique()

In [7]:
# Defining the industry and sector according to an ID, all data seems to be complete and clean

# Companies

## First look

In [8]:
display(df_companies.head(3))
display(df_companies.isna().sum())
display(df_companies.shape)

Unnamed: 0,Ticker,SimFinId,Company Name,IndustryId
0,A,45846,AGILENT TECHNOLOGIES INC,106001.0
1,A18,1253413,Trip.com Group Ltd,
2,A21,1333027,Li Auto Inc.,


Ticker            0
SimFinId          0
Company Name      0
IndustryId      416
dtype: int64

(3144, 4)

In [9]:
# It seems that for some companies we do not have the indusry data for a lot of companies, we investiate.
df_companies[df_companies['IndustryId'].isna()].head(5)

Unnamed: 0,Ticker,SimFinId,Company Name,IndustryId
1,A18,1253413,Trip.com Group Ltd,
2,A21,1333027,Li Auto Inc.,
21,ABEO,953884,ABEONA THERAPEUTICS INC.,
23,ABIO,953886,"ARCA biopharma, Inc.",
34,ABX,1271527,BARRICK GOLD CORP,


In [10]:
# Apparently the missing values are ranging from a very diverse set of industries.
# I try to assign some of the missing values by deductingh from the company names. -->

## Tokenize
To better adress the different companies by name, I tokenize them

In [11]:
# We tokenize the company names, so we can search through all appearing words
from nltk.tokenize import word_tokenize

In [12]:
df_companies['tokens'] = df_companies['Company Name'].apply(word_tokenize)

In [13]:
df_companies.head(5)

Unnamed: 0,Ticker,SimFinId,Company Name,IndustryId,tokens
0,A,45846,AGILENT TECHNOLOGIES INC,106001.0,"[AGILENT, TECHNOLOGIES, INC]"
1,A18,1253413,Trip.com Group Ltd,,"[Trip.com, Group, Ltd]"
2,A21,1333027,Li Auto Inc.,,"[Li, Auto, Inc, .]"
3,AA,367153,Alcoa Corp,110004.0,"[Alcoa, Corp]"
4,AAC_delist,939324,"AAC Holdings, Inc.",106011.0,"[AAC, Holdings, ,, Inc, .]"


## Functions for df_companies

In [14]:
# A function that returns one if one of our keywords is found
def finder(tokens):
    for i in tokens:
        if i.lower() in keywords:
            return 1
    return 0   

In [15]:
# A function, that displays the rows which contain keywords an have no industry ID 
# It also return a list of the corresponding indexes.

def nan_list(df,keywords):
    # Finding the companies with keywords
    df['current_search'] = 0
    df['current_search'] = df['tokens'].apply(finder)
    
    # Displaying them if they have no industrty-id
    display(df[ (df['current_search']==1) & (df['IndustryId'].isna()) ])
    
    # Saving the indexes
    indexlist = list(df[ (df['current_search']==1) & (df['IndustryId'].isna()) ].index)
    
    return indexlist

In [16]:
def set_industryID(industry, df, indexlist):
    
    # Getting the ID according to industry
    id = df_industries[df_industries['Industry']==industry]['IndustryId'].unique()[0]  
         
    # Setting the id for the specified indexes
    for row in indexlist:
        df.iloc[row,3] = id
    
    return df    

## ETF's & Funds

Since ETFs Funds and REITs do not usually issue debt notes or they have to be evaluated completely different, I will ecxlude them from this analysis.

In [17]:
keywords = ['etf','etc','fund','reit','trust']
df_companies['current_search'] = 0
df_companies['current_search'] = df_companies['tokens'].apply(finder)
# display(df_companies[df_companies['current_search']==1])
len(df_companies[df_companies['current_search']==1])

52

In [18]:
df_companies.shape

(3144, 6)

In [19]:
# We drop the according rows
df_companies = df_companies[df_companies['current_search'] == 0]
df_companies.shape

(3092, 6)

## Scraping the missing data
Apparently nasdaq loads the sites content ad hoc using java scripts and normal get request won't work, therfore we use selenium to mimic a real browser access

In [None]:
from selenium import webdriver

In [None]:
from selenium.webdriver.common.keys import Keys
PATH = 'C:\Program Files (x86)\chromedriver.exe'
l=list()
obj={}
target_url = "https://www.nasdaq.com/market-activity/stocks/tsla"
driver=webdriver.Chrome(PATH)
driver.get(target_url)
html = driver.find_element_by_tag_name('html')
html.send_keys(Keys.END)
time.sleep(2)
resp = driver.page_source
driver.close()

In [22]:
# Example ABX, Barrick Gold
example_ticker = 'AAPL'

In [23]:
# I will scrape the missing data from https://nasdaq.com
base_url = 'https://www.nasdaq.com/market-activity/stocks/' + example_ticker

In [24]:
response = requests.get(base_url)
response.status_code

ConnectionError: ('Connection aborted.', TimeoutError(10060, 'Ein Verbindungsversuch ist fehlgeschlagen, da die Gegenstelle nach einer bestimmten Zeitspanne nicht richtig reagiert hat, oder die hergestellte Verbindung war fehlerhaft, da der verbundene Host nicht reagiert hat', None, 10060, None))

#### Mining sector

In [None]:
# We have to define the keywords outside of the function, it would be unnessesary complicated, to
# pass them as an argument during the .apply method

keywords = ['gold','silver','iron','copper','uran','mining','resources','metal','minerals']

In [None]:
indexlist = nan_list(df_companies,keywords)

In [None]:
df_companies = set_industryID('Metals & Mining',df_companies,indexlist)

In [None]:
indexlist = nan_list(df_companies,keywords)
display(df_companies.isna().sum())

In [None]:
# We eliminated some NaNs

#### Biotech and pharma
We repeat the whoile process with different industries and keywords, we will not be able to fill an NaNs but maybe a good bit

In [None]:
keywords = ['bio','life','sciences','pharma','pharmaceuticals','research','therapeutics','biopharma',
           'bioscience','biosciences','therapeutics','diagnostics','biomedicine']
indexlist = nan_list(df_companies,keywords)

In [None]:
df_companies = set_industryID('Medical Diagnostics & Research',df_companies,indexlist)
display(df_companies.isna().sum())

#### Automobiles

In [None]:
keywords = ['car','mobile','auto']
indexlist = nan_list(df_companies,keywords)

In [None]:
df_companies = set_industryID('Autos',df_companies,indexlist)
display(df_companies.isna().sum())

#### Financials

In [None]:
keywords = ['bank','financial','investment','bancorp','equity']
indexlist = nan_list(df_companies,keywords)

In [None]:
df_companies = set_industryID('Banks',df_companies,indexlist)
display(df_companies.isna().sum())

#### Oil & Gas

In [None]:
keywords = ['oil','petroleum','fuel','gas','lng']
indexlist = nan_list(df_companies,keywords)

In [None]:
df_companies = set_industryID('Oil & Gas - Midstream',df_companies,indexlist)
display(df_companies.isna().sum())

#### Energy

In [None]:
keywords = ['energy','electric','power']
indexlist = nan_list(df_companies,keywords)

In [None]:
df_companies = set_industryID('Utilities - Regulated',df_companies,indexlist)
display(df_companies.isna().sum())

#### REITs

In [None]:

indexlist = nan_list(df_companies,keywords)

In [None]:
df_companies = set_industryID('REITs',df_companies,indexlist)
display(df_companies.isna().sum())