# Importing needed libraries

In [1]:
# Basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import date

In [2]:
# !pip install selenium

In [3]:
# For scraping
import pandas as pd
import numpy as np
from time import sleep
from random import randint
from bs4 import BeautifulSoup
import requests
from selenium import webdriver

# Loading the datasets

In [4]:
df_industries = pd.read_csv('./Raw Data/industries.csv', delimiter= ';')
df_companies = pd.read_csv('./Raw Data/us-companies.csv', delimiter= ';')
df_balance = pd.read_csv('./Raw Data/us-balance-quarterly.csv', delimiter= ';')
df_cashflow = pd.read_csv('./Raw Data/us-cashflow-quarterly.csv', delimiter= ';')
# df_shareprices = pd.read_csv('./Raw Data/us-shareprices-daily.csv', delimiter= ';')

# EDA & Cleaning

# Industries

In [5]:
display(df_industries.head())
display(df_industries.isna().sum())
display(df_industries.shape)

Unnamed: 0,IndustryId,Sector,Industry
0,100001,Industrials,Industrial Products
1,100002,Industrials,Business Services
2,100003,Industrials,Engineering & Construction
3,100004,Industrials,Waste Management
4,100005,Industrials,Industrial Distribution


IndustryId    0
Sector        0
Industry      0
dtype: int64

(74, 3)

Defining the industry and sector according to an ID, all data seems to be complete and clean

# Companies

## First look

In [6]:
display(df_companies.head(3))
display(df_companies.isna().sum())
display(df_companies.shape)

Unnamed: 0,Ticker,SimFinId,Company Name,IndustryId
0,A,45846,AGILENT TECHNOLOGIES INC,106001.0
1,A18,1253413,Trip.com Group Ltd,
2,A21,1333027,Li Auto Inc.,


Ticker            0
SimFinId          0
Company Name      0
IndustryId      416
dtype: int64

(3144, 4)

## There is data missing for the industry ID's

In [7]:
# It seems that for some companies we do not have the indusry data for a lot of companies, we investiate.
df_companies[df_companies['IndustryId'].isna()].head(5)

Unnamed: 0,Ticker,SimFinId,Company Name,IndustryId
1,A18,1253413,Trip.com Group Ltd,
2,A21,1333027,Li Auto Inc.,
21,ABEO,953884,ABEONA THERAPEUTICS INC.,
23,ABIO,953886,"ARCA biopharma, Inc.",
34,ABX,1271527,BARRICK GOLD CORP,


Apparently the missing values are ranging from a very diverse set of industries.
I try to assign some of the missing values by deductingh from the company names.

## Tokenize
To better adress the different companies by name, I tokenize them

In [8]:
# We tokenize the company names, so we can search through all appearing words
from nltk.tokenize import word_tokenize

In [9]:
df_companies['tokens'] = df_companies['Company Name'].apply(word_tokenize)

In [10]:
df_companies.head(5)

Unnamed: 0,Ticker,SimFinId,Company Name,IndustryId,tokens
0,A,45846,AGILENT TECHNOLOGIES INC,106001.0,"[AGILENT, TECHNOLOGIES, INC]"
1,A18,1253413,Trip.com Group Ltd,,"[Trip.com, Group, Ltd]"
2,A21,1333027,Li Auto Inc.,,"[Li, Auto, Inc, .]"
3,AA,367153,Alcoa Corp,110004.0,"[Alcoa, Corp]"
4,AAC_delist,939324,"AAC Holdings, Inc.",106011.0,"[AAC, Holdings, ,, Inc, .]"


## Functions for df_companies

In [11]:
# A function that returns one if one of our keywords is found
def finder(tokens):
    for i in tokens:
        if i.lower() in keywords:
            return 1
    return 0   

In [12]:
# A function, that displays the rows which contain keywords an have no industry ID 
# It also return a list of the corresponding indexes.

def nan_list(df,keywords):
    # Finding the companies with keywords
    df['current_search'] = 0
    df['current_search'] = df['tokens'].apply(finder)
    
    # Displaying them if they have no industrty-id
    display(df[ (df['current_search']==1) & (df['IndustryId'].isna()) ])
    
    # Saving the indexes
    indexlist = list(df[ (df['current_search']==1) & (df['IndustryId'].isna()) ].index)
    
    return indexlist

In [13]:
def set_industryID(industry, df, indexlist):
    
    # Getting the ID according to industry
    id = df_industries[df_industries['Industry']==industry]['IndustryId'].unique()[0]  
         
    # Setting the id for the specified indexes
    for row in indexlist:
        df.iloc[row,3] = id
    
    return df    

## ETF's & Funds

Since ETFs Funds and REITs do not usually issue debt notes or they have to be evaluated completely different, I will ecxlude them from this analysis.

In [14]:
keywords = ['etf','etc','fund','reit','trust']
df_companies['current_search'] = 0
df_companies['current_search'] = df_companies['tokens'].apply(finder)
# display(df_companies[df_companies['current_search']==1])
len(df_companies[df_companies['current_search']==1])

52

In [15]:
droppers1 = list(df_companies[df_companies['current_search']==1]['Ticker'])

In [16]:
df_companies.shape

(3144, 6)

In [17]:
# We drop the according rows
df_companies = df_companies[df_companies['current_search'] == 0]
df_companies.shape

(3092, 6)

## Scraping the missing data
Apparently nasdaq loads the sites content ad hoc using java scripts and normal get request won't work, therfore we use selenium to mimic a real browser access

In [18]:
# Importing the webdriver
from selenium import webdriver
from selenium.webdriver.common.by import By

In [19]:
# To wait until the driver finds the expected element, since it takes some time to load
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

In [20]:
# Defning the path of the chromedriver
path = 'D:/Chromium/chromedriver'

In [21]:
# Setting up driver
# driver = webdriver.Chrome(path)

In [22]:
# Telling the driver to wait for up to 20 seconds if element is not found
# wait = WebDriverWait(driver, 20)

#### Create alist of company tickers with missing values

In [23]:
ticker_list = list(df_companies[df_companies['IndustryId'].isna()]['Ticker'])
len(ticker_list)

394

In [24]:
# Testing with wait function
# ticker = 'DOW'
# target_url = f"https://finance.yahoo.com/quote/%s/profile?p=%s" % (ticker,ticker)
# target_url

In [25]:
# driver.get(target_url)
# element = wait.until(EC.presence_of_element_located(('xpath', '//*[@id="Col1-0-Profile-Proxy"]/section/div[1]/div/div/p[2]/span[4]')))
# element.text

#### Scraping the missing data

Takes quite some time, so I saved the results as a pickle

In [26]:
# sectors = []
# for ticker in ticker_list:
#     # Setting url
#     target_url = f"https://finance.yahoo.com/quote/%s/profile?p=%s" % (ticker,ticker)
#     # Accessing the site
#     driver.get(target_url)    

#     # Getting the desired field:
#     try:
#         match = wait.until(EC.presence_of_element_located(('xpath', '//*[@id="Col1-0-Profile-Proxy"]/section/div[1]/div/div/p[2]/span[4]')))
#         sectors.append(match.text)
#         print(match.text)
#     except:
#         sectors.append('Not found')
#         print('Not found')
#     print(len(sectors))

# # Closing window
# driver.quit()

In [27]:
# # Saving Sectorlist
import pickle
# pickle.dump(sectors, open('sectors.p', 'wb'))

In [28]:
sectors = pickle.load(open('sectors.p','rb'))
# sectors

#### Assigning the scraped data to the company dataframe

In [29]:
# We translate the industry names to industry ID's
sectors_id = []
for i in range(len(sectors)):
    try:
        sectors_id.append(int(df_industries[df_industries['Industry'] == sectors[i]]['IndustryId']))
    except:
        # If the industry is not found, we append the Industry ID for 'Other'
        sectors_id.append('108004')

In [30]:
# We assign the industry names to a new column
df_companies.loc[df_companies['IndustryId'].isna(), ['IndustryId']] = sectors_id

In [31]:
# We got rid of all NaN's
df_companies['IndustryId'].isna().sum()

0

In [32]:
# We save the cleaned dataframe to our processed Data
df_companies.to_csv('Prepared Frames/company_data.csv', index=False)

#### Just keeping the important columns


In [33]:
df_companies = df_companies.drop(['SimFinId','tokens','current_search'], axis= 1)

# Balance sheets

In [34]:
df_balance = pd.read_csv('./Raw Data/us-balance-quarterly.csv', delimiter= ';')

In [35]:
display(df_balance.head(1))

Unnamed: 0,Ticker,SimFinId,Currency,Fiscal Year,Fiscal Period,Report Date,Publish Date,Restated Date,Shares (Basic),Shares (Diluted),...,Short Term Debt,Total Current Liabilities,Long Term Debt,Total Noncurrent Liabilities,Total Liabilities,Share Capital & Additional Paid-In Capital,Treasury Stock,Retained Earnings,Total Equity,Total Liabilities & Equity
0,A,45846,USD,2017,Q1,2017-01-31,2017-03-08,2017-03-08,322000000.0,326000000.0,...,190000000.0,1089000000,1802000000.0,2483000000.0,3572000000,5239000000.0,0.0,-453000000.0,4300000000.0,7872000000


## Dropping unneccessary columns

In [36]:
# We can already see, that there are several column that we won't need, so we drop them first:
# SimFinID : A site specific identifier
# Currency : We have everything in USD
# We will only need the Report Date if the dates
# The share count we decide to only used the Diluted since it better represents reality
# Treasury Stock is insignificant
# Total Liabilities and equity doesn't give us any information, that we don't have otheriwse
# Total Current Assets and Total assets are redundant

In [37]:
df_balance = df_balance.drop(['SimFinId','Currency','Publish Date','Restated Date','Shares (Basic)','Treasury Stock','Total Liabilities & Equity'], axis = 1)
df_balance.head(1)

Unnamed: 0,Ticker,Fiscal Year,Fiscal Period,Report Date,Shares (Diluted),"Cash, Cash Equivalents & Short Term Investments",Accounts & Notes Receivable,Inventories,Total Current Assets,"Property, Plant & Equipment, Net",...,Total Assets,Payables & Accruals,Short Term Debt,Total Current Liabilities,Long Term Debt,Total Noncurrent Liabilities,Total Liabilities,Share Capital & Additional Paid-In Capital,Retained Earnings,Total Equity
0,A,2017,Q1,2017-01-31,326000000.0,2241000000.0,653000000.0,551000000.0,3635000000.0,653000000.0,...,7872000000,268000000.0,190000000.0,1089000000,1802000000.0,2483000000.0,3572000000,5239000000.0,-453000000.0,4300000000.0


## Dropping funds and ETF's
Since we earlier decided to exclude funds and ETF's we will now exclud ethem here as well.

In [38]:
for ticker in droppers1:
    df_balance = df_balance[df_cashflow['Ticker'] != ticker]

  df_balance = df_balance[df_cashflow['Ticker'] != ticker]


## Checking for NaN's

In [39]:
df_balance.isna().sum()

Ticker                                                 0
Fiscal Year                                            0
Fiscal Period                                          0
Report Date                                            0
Shares (Diluted)                                     297
Cash, Cash Equivalents & Short Term Investments      143
Accounts & Notes Receivable                         4532
Inventories                                        14874
Total Current Assets                                   2
Property, Plant & Equipment, Net                     897
Long Term Investments & Receivables                31478
Other Long Term Assets                               441
Total Noncurrent Assets                               91
Total Assets                                           0
Payables & Accruals                                  146
Short Term Debt                                    16049
Total Current Liabilities                              0
Long Term Debt                 

We will have to check column by column

In [40]:
df_balance[df_balance['Shares (Diluted)'].isna()]['Ticker'].unique()

array(['AGI', 'ALTMS', 'AMHCU', 'APU', 'ARPO', 'AUDH', 'BCAB', 'BKH',
       'BLUD', 'BPL', 'CCFI', 'CCH', 'CERS', 'CHAQ', 'CK00015503',
       'CK00015844', 'CK00015847', 'CNNX', 'DWACU', 'ENBL', 'EQ', 'FLUG',
       'FSII', 'IIVI', 'INFOR', 'NKT.CO', 'PROG', 'QVC', 'SFY', 'SRNGU',
       'TIVC', 'TM-28', 'VPCC', 'VTYX'], dtype=object)

In [41]:
# We see that the NaN's without share counts are split up into just 36 companies, we have a look, whart they are:
companylist = list(df_balance[df_balance['Shares (Diluted)'].isna()]['Ticker'].unique())

In [42]:
namelist = []
for company in companylist:
    try:
        namelist.append(str(df_companies[df_companies['Ticker']==company]['Company Name']))
    except:
        namlist.append(str(company)+' not found in companyframe')
namelist

# My guess, that we here have Funds that we won't include in our analysis did not hold.

['95    Affinion Group, Inc.\nName: Company Name, dtype: object',
 '158    Alta Mesa Holdings, LP\nName: Company Name, dtype: object',
 '174    Jasper Therapeutics, Inc.\nName: Company Name, dtype: object',
 '222    AMERIGAS PARTNERS LP\nName: Company Name, dtype: object',
 '245    Aadi Bioscience, Inc.\nName: Company Name, dtype: object',
 '276    AURORA DIAGNOSTICS HOLDINGS LLC\nName: Company Name, dtype: object',
 '334    BioAtla, Inc.\nName: Company Name, dtype: object',
 '386    BLACK HILLS POWER INC\nName: Company Name, dtype: object',
 '403    IMMUCOR INC\nName: Company Name, dtype: object',
 '424    BUCKEYE PARTNERS, L.P.\nName: Company Name, dtype: object',
 '520    Community Choice Financial Inc.\nName: Company Name, dtype: object',
 '521    Cheniere Corpus Christi Holdings, LLC\nName: Company Name, dtype: object',
 '554    CERUS CORP\nName: Company Name, dtype: object',
 '572    Renovacor, Inc.\nName: Company Name, dtype: object',
 '608    BakerCorp International, Inc.\nName

In [43]:
# We check if all values are NAN for the share count
for company in companylist:
    print(len(df_balance[(df_balance['Ticker'] == company) & (df_balance['Shares (Diluted)'].isna())]) == len(df_balance[(df_balance['Ticker'] == company)]))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [44]:
# Since we have no information about the sharecount for these companies and there are a lot of other values missing for them,
# I decide to drop them since they are just 36 out of over 3000
print(df_balance.shape)
for company in companylist:
    df_balance = df_balance[~(df_balance['Ticker']==company)]
df_balance.shape

(42145, 23)


(41848, 23)

## Systematic filling with mean value

Since we have many columns with a lot of missing valkues, wich make up more than 30% of our dataframe, and filling it with arbitrary values would cause a lot of error, we will drop them for now.

In [45]:
df_balance = df_balance.drop(['Inventories','Long Term Investments & Receivables','Short Term Debt','Long Term Debt'],axis = 1)
df_balance.shape

(41848, 19)

In [46]:
df_balance.isna().sum()

Ticker                                                0
Fiscal Year                                           0
Fiscal Period                                         0
Report Date                                           0
Shares (Diluted)                                      0
Cash, Cash Equivalents & Short Term Investments     136
Accounts & Notes Receivable                        4456
Total Current Assets                                  2
Property, Plant & Equipment, Net                    858
Other Long Term Assets                              418
Total Noncurrent Assets                              89
Total Assets                                          0
Payables & Accruals                                 126
Total Current Liabilities                             0
Total Noncurrent Liabilities                        640
Total Liabilities                                     0
Share Capital & Additional Paid-In Capital          584
Retained Earnings                               

In [47]:
# The other columns, will fill the NaN's with the mean value for the corresponding company
def get_mean(company,column):
    return np.mean(df_balance[ (df_balance['Ticker']==company) & (df_balance[column].isna() == False) ][column])

In [48]:
# Columns to iterate thorugh
columns = ['Cash, Cash Equivalents & Short Term Investments','Accounts & Notes Receivable','Total Current Assets','Property, Plant & Equipment, Net',
          'Other Long Term Assets','Total Noncurrent Assets','Payables & Accruals','Total Noncurrent Liabilities','Share Capital & Additional Paid-In Capital',
          'Retained Earnings','Total Equity']

# Getting a list with all company tickers
companylist = list(df_balance['Ticker'].unique())

In [49]:
# setting mean for missing NaN
for column in columns:
    print(column)
    for company in companylist:
        df_balance.loc[ (df_balance[column].isna()) & (df_balance['Ticker']==company), [column]] = get_mean(company,column)

Cash, Cash Equivalents & Short Term Investments
Accounts & Notes Receivable
Total Current Assets
Property, Plant & Equipment, Net
Other Long Term Assets
Total Noncurrent Assets
Payables & Accruals
Total Noncurrent Liabilities
Share Capital & Additional Paid-In Capital
Retained Earnings
Total Equity


In [50]:
df_balance.isna().sum()

Ticker                                                0
Fiscal Year                                           0
Fiscal Period                                         0
Report Date                                           0
Shares (Diluted)                                      0
Cash, Cash Equivalents & Short Term Investments       4
Accounts & Notes Receivable                        2963
Total Current Assets                                  2
Property, Plant & Equipment, Net                    477
Other Long Term Assets                               54
Total Noncurrent Assets                              24
Total Assets                                          0
Payables & Accruals                                  60
Total Current Liabilities                             0
Total Noncurrent Liabilities                        111
Total Liabilities                                     0
Share Capital & Additional Paid-In Capital            8
Retained Earnings                               

## Dropping companies with not enough data

In [51]:
# Apparently there are coulumns where no mean could be calculated. We see wich comnpanies are affected.
affected = []
for column in columns:
    affected.append(df_balance[df_balance[column].isna()]['Ticker'].unique())
# Making a list of the single entries instead of an array
droppers2 = []
for i in range(len(affected)):
    for x in affected[i]:
        droppers2.append(x)
# Removing duplicates
droppers2 = list(set(droppers2))
len(droppers2)

306

There are 336 companies affected for which we cannot get a value. We will have to drop them for now.

In [52]:
for ticker in droppers2:
    df_balance = df_balance[df_balance['Ticker'] != ticker]

In [53]:
df_balance.shape

(37617, 19)

In [54]:
df_balance.isna().sum()

Ticker                                             0
Fiscal Year                                        0
Fiscal Period                                      0
Report Date                                        0
Shares (Diluted)                                   0
Cash, Cash Equivalents & Short Term Investments    0
Accounts & Notes Receivable                        0
Total Current Assets                               0
Property, Plant & Equipment, Net                   0
Other Long Term Assets                             0
Total Noncurrent Assets                            0
Total Assets                                       0
Payables & Accruals                                0
Total Current Liabilities                          0
Total Noncurrent Liabilities                       0
Total Liabilities                                  0
Share Capital & Additional Paid-In Capital         0
Retained Earnings                                  0
Total Equity                                  

In [55]:
# Saving:
df_balance.to_csv('Prepared Frames/company_data.csv', index=False)

In [56]:
# Saving the list of dropped companies for later:
tickers_to_drop = droppers1 + droppers2
len(tickers_to_drop)

358

# Cashflow Statements

In [57]:
df_cashflow = pd.read_csv('./Raw Data/us-cashflow-quarterly.csv', delimiter= ';')

In [58]:
display(df_cashflow.head(1))

Unnamed: 0,Ticker,SimFinId,Currency,Fiscal Year,Fiscal Period,Report Date,Publish Date,Restated Date,Shares (Basic),Shares (Diluted),...,Net Cash from Operating Activities,Change in Fixed Assets & Intangibles,Net Change in Long Term Investment,Net Cash from Acquisitions & Divestitures,Net Cash from Investing Activities,Dividends Paid,Cash from (Repayment of) Debt,Cash from (Repurchase of) Equity,Net Cash from Financing Activities,Net Change in Cash
0,A,45846,USD,2017,Q1,2017-01-31,2017-03-08,2018-03-06,322000000.0,326000000.0,...,116000000.0,-32000000.0,,-69000000.0,-101000000.0,-42000000.0,89000000.0,-93000000.0,-58000000.0,-48000000


In [59]:
df_cashflow.shape

(42894, 28)

## Dropping sorted out companies

In [60]:
# We already dropped some companies, that we cannot use from the balance sheet dataframe, we will drop them also here and then assess
# the situation
for ticker in tickers_to_drop:
    df_cashflow = df_cashflow[df_cashflow['Ticker'] != ticker]

In [61]:
df_cashflow.shape

(37914, 28)

## Dropping Unneccesary Columns

In [62]:
# We have again a lot of unneccessare columns, some of it even data, that we have in the balance sheet, so we drop them

In [63]:
to_drop = [
    'SimFinId','Currency','Report Date', 'Publish Date', 'Restated Date', 'Shares (Basic)',
    'Shares (Diluted)',
]

In [64]:
df_cashflow = df_cashflow.drop(to_drop, axis = 1)
print(df_cashflow.shape)
print(df_cashflow.columns)

(37914, 21)
Index(['Ticker', 'Fiscal Year', 'Fiscal Period', 'Net Income/Starting Line',
       'Depreciation & Amortization', 'Non-Cash Items',
       'Change in Working Capital', 'Change in Accounts Receivable',
       'Change in Inventories', 'Change in Accounts Payable',
       'Change in Other', 'Net Cash from Operating Activities',
       'Change in Fixed Assets & Intangibles',
       'Net Change in Long Term Investment',
       'Net Cash from Acquisitions & Divestitures',
       'Net Cash from Investing Activities', 'Dividends Paid',
       'Cash from (Repayment of) Debt', 'Cash from (Repurchase of) Equity',
       'Net Cash from Financing Activities', 'Net Change in Cash'],
      dtype='object')


## Checking NaNs

In [65]:
df_cashflow.isna().sum()

Ticker                                           0
Fiscal Year                                      0
Fiscal Period                                    0
Net Income/Starting Line                       460
Depreciation & Amortization                    838
Non-Cash Items                                 458
Change in Working Capital                      653
Change in Accounts Receivable                35480
Change in Inventories                        36150
Change in Accounts Payable                   35920
Change in Other                              35087
Net Cash from Operating Activities               2
Change in Fixed Assets & Intangibles           723
Net Change in Long Term Investment           24396
Net Cash from Acquisitions & Divestitures    20828
Net Cash from Investing Activities             198
Dividends Paid                               20970
Cash from (Repayment of) Debt                 6214
Cash from (Repurchase of) Equity              7111
Net Cash from Financing Activit

In [66]:
# We have some columns, that almost entirely consist out of NaNs, we drop them:
to_drop = [
    'Change in Accounts Receivable','Change in Inventories','Change in Accounts Payable', 'Change in Other'
]

In [67]:
df_cashflow = df_cashflow.drop(to_drop, axis = 1)
print(df_cashflow.shape)
print(df_cashflow.columns)

(37914, 17)
Index(['Ticker', 'Fiscal Year', 'Fiscal Period', 'Net Income/Starting Line',
       'Depreciation & Amortization', 'Non-Cash Items',
       'Change in Working Capital', 'Net Cash from Operating Activities',
       'Change in Fixed Assets & Intangibles',
       'Net Change in Long Term Investment',
       'Net Cash from Acquisitions & Divestitures',
       'Net Cash from Investing Activities', 'Dividends Paid',
       'Cash from (Repayment of) Debt', 'Cash from (Repurchase of) Equity',
       'Net Cash from Financing Activities', 'Net Change in Cash'],
      dtype='object')


In [68]:
df_cashflow.isna().sum()

Ticker                                           0
Fiscal Year                                      0
Fiscal Period                                    0
Net Income/Starting Line                       460
Depreciation & Amortization                    838
Non-Cash Items                                 458
Change in Working Capital                      653
Net Cash from Operating Activities               2
Change in Fixed Assets & Intangibles           723
Net Change in Long Term Investment           24396
Net Cash from Acquisitions & Divestitures    20828
Net Cash from Investing Activities             198
Dividends Paid                               20970
Cash from (Repayment of) Debt                 6214
Cash from (Repurchase of) Equity              7111
Net Cash from Financing Activities             217
Net Change in Cash                               0
dtype: int64

## Filling missing values with mean
We will try to implement the same strategy as before and fill the missing values with the means for the same company.

In [69]:
# The other columns, will fill the NaN's with the mean value for the corresponding company
def get_mean(company,column):
    return np.mean(df_cashflow[ (df_cashflow['Ticker']==company) & (df_cashflow[column].isna() == False) ][column])

In [70]:
# Columns to iterate thorugh
columns = ['Net Income/Starting Line','Depreciation & Amortization','Non-Cash Items','Change in Working Capital','Net Cash from Operating Activities',
          'Change in Fixed Assets & Intangibles','Net Change in Long Term Investment','Net Cash from Acquisitions & Divestitures',
          'Net Cash from Investing Activities','Dividends Paid','Cash from (Repayment of) Debt','Cash from (Repurchase of) Equity',
          'Net Cash from Financing Activities']

# Getting a list with all company tickers
companylist = list(df_cashflow['Ticker'].unique())

In [71]:
# setting mean for missing NaN
for column in columns:
    print(column)
    for company in companylist:
        df_cashflow.loc[ (df_cashflow[column].isna()) & (df_cashflow['Ticker']==company), [column]] = get_mean(company,column)

Net Income/Starting Line
Depreciation & Amortization
Non-Cash Items
Change in Working Capital
Net Cash from Operating Activities
Change in Fixed Assets & Intangibles
Net Change in Long Term Investment
Net Cash from Acquisitions & Divestitures
Net Cash from Investing Activities
Dividends Paid
Cash from (Repayment of) Debt
Cash from (Repurchase of) Equity
Net Cash from Financing Activities


In [72]:
df_cashflow.isna().sum()

Ticker                                           0
Fiscal Year                                      0
Fiscal Period                                    0
Net Income/Starting Line                       233
Depreciation & Amortization                    330
Non-Cash Items                                 210
Change in Working Capital                      228
Net Cash from Operating Activities               0
Change in Fixed Assets & Intangibles            78
Net Change in Long Term Investment           17751
Net Cash from Acquisitions & Divestitures     7829
Net Cash from Investing Activities               3
Dividends Paid                               17902
Cash from (Repayment of) Debt                 1604
Cash from (Repurchase of) Equity              1752
Net Cash from Financing Activities               2
Net Change in Cash                               0
dtype: int64

In [73]:
# We still have tweo columns wich make out nearle half our dataframe with NaNs, we drop them too.
df_cashflow = df_cashflow.drop(['Net Change in Long Term Investment','Dividends Paid'],axis = 1)
df_cashflow.isna().sum()

Ticker                                          0
Fiscal Year                                     0
Fiscal Period                                   0
Net Income/Starting Line                      233
Depreciation & Amortization                   330
Non-Cash Items                                210
Change in Working Capital                     228
Net Cash from Operating Activities              0
Change in Fixed Assets & Intangibles           78
Net Cash from Acquisitions & Divestitures    7829
Net Cash from Investing Activities              3
Cash from (Repayment of) Debt                1604
Cash from (Repurchase of) Equity             1752
Net Cash from Financing Activities              2
Net Change in Cash                              0
dtype: int64

In [74]:
# For the column Net Cash from Acquisitions & Divestitures, 0 is a plausible value, since not every company makes an aquisition in every 
# quarter or sells part of its company. so we fill the missing values with 0.
# The same goes for Cash from (Repayment of) Debt ,Cash from (Repurchase of) Equity and Net Cash from financing activities.
df_cashflow.loc[df_cashflow['Net Cash from Acquisitions & Divestitures'].isna(),['Net Cash from Acquisitions & Divestitures']] = 0
df_cashflow.loc[df_cashflow['Cash from (Repayment of) Debt'].isna(),['Cash from (Repayment of) Debt']] = 0
df_cashflow.loc[df_cashflow['Cash from (Repurchase of) Equity'].isna(),['Cash from (Repurchase of) Equity']] = 0
df_cashflow.loc[df_cashflow['Net Cash from Financing Activities'].isna(),['Net Cash from Financing Activities']] = 0
df_cashflow.loc[df_cashflow['Net Cash from Investing Activities'].isna(),['Net Cash from Investing Activities']] = 0
df_cashflow.isna().sum()

Ticker                                         0
Fiscal Year                                    0
Fiscal Period                                  0
Net Income/Starting Line                     233
Depreciation & Amortization                  330
Non-Cash Items                               210
Change in Working Capital                    228
Net Cash from Operating Activities             0
Change in Fixed Assets & Intangibles          78
Net Cash from Acquisitions & Divestitures      0
Net Cash from Investing Activities             0
Cash from (Repayment of) Debt                  0
Cash from (Repurchase of) Equity               0
Net Cash from Financing Activities             0
Net Change in Cash                             0
dtype: int64

In [75]:
# We look for which companies are making up the rest of the NaN
affected = []
columns = df_cashflow.columns
for column in columns:
    affected.append(df_cashflow[df_cashflow[column].isna()]['Ticker'].unique())
# Making a list of the single entries instead of an array
droppers3 = []
for i in range(len(affected)):
    for x in affected[i]:
        droppers3.append(x)
# Removing duplicates
droppers3 = list(set(droppers3))
len(droppers3)

36

In [76]:
# We can afford to loose these companies:
for ticker in droppers3:
    df_cashflow = df_cashflow[df_cashflow['Ticker'] != ticker]
df_cashflow.shape

(37499, 15)

In [77]:
# Since we have those companies still in the Balance dataframe we drop them there too:
for ticker in droppers3:
    df_balance = df_balance[df_balance['Ticker'] != ticker]
df_balance.shape

(37222, 19)

# Income Statements

In [78]:
df_income = pd.read_csv('./Raw Data/us-income-quarterly.csv', delimiter= ';')

In [79]:
df_income.columns

Index(['Ticker', 'SimFinId', 'Currency', 'Fiscal Year', 'Fiscal Period',
       'Report Date', 'Publish Date', 'Restated Date', 'Shares (Basic)',
       'Shares (Diluted)', 'Revenue', 'Cost of Revenue', 'Gross Profit',
       'Operating Expenses', 'Selling, General & Administrative',
       'Research & Development', 'Depreciation & Amortization',
       'Operating Income (Loss)', 'Non-Operating Income (Loss)',
       'Interest Expense, Net', 'Pretax Income (Loss), Adj.',
       'Abnormal Gains (Losses)', 'Pretax Income (Loss)',
       'Income Tax (Expense) Benefit, Net',
       'Income (Loss) from Continuing Operations',
       'Net Extraordinary Gains (Losses)', 'Net Income',
       'Net Income (Common)'],
      dtype='object')

## Dropping unneccessary columns
The same as above

In [80]:
to_drop = [
    'SimFinId','Currency','Report Date', 'Publish Date', 'Restated Date', 'Shares (Basic)',
    'Shares (Diluted)',
]

In [81]:
df_income = df_income.drop(to_drop,axis = 1)

## Dropping companies as before
Since we already excluded some companies from our analysis, there is no point in keeping them here.

In [82]:
droppers_all = droppers1+droppers2+droppers3

In [83]:
for ticker in droppers_all:
    df_income = df_income[df_income['Ticker'] != ticker]

## Searching for NaNs

In [84]:
df_income.isna().sum()

Ticker                                          0
Fiscal Year                                     0
Fiscal Period                                   0
Revenue                                       589
Cost of Revenue                              3047
Gross Profit                                 3071
Operating Expenses                             51
Selling, General & Administrative            1776
Research & Development                      21130
Depreciation & Amortization                 21117
Operating Income (Loss)                         0
Non-Operating Income (Loss)                   133
Interest Expense, Net                        3877
Pretax Income (Loss), Adj.                      0
Abnormal Gains (Losses)                     15984
Pretax Income (Loss)                            0
Income Tax (Expense) Benefit, Net            3042
Income (Loss) from Continuing Operations        0
Net Extraordinary Gains (Losses)            33173
Net Income                                      0


In [85]:
# We still have some NaNs to clean up, two columns make up more than 50% NaNs, we drop them.
df_income = df_income.drop(['Research & Development','Depreciation & Amortization'], axis = 1)

In [86]:
# Also some columns are for extraordinary items, we assume that since they are missing there were no extraordinary events
# and fill them with 0.
extras = ['Abnormal Gains (Losses)','Net Extraordinary Gains (Losses)']
for column in extras:
    df_income[column] = 0

In [87]:
# For the rest we again fill with the companywise mean
# The other columns, will fill the NaN's with the mean value for the corresponding company
def get_mean(company,column):
    return np.mean(df_income[ (df_income['Ticker']==company) & (df_income[column].isna() == False) ][column])

In [88]:
df_income.isna().sum()

Ticker                                         0
Fiscal Year                                    0
Fiscal Period                                  0
Revenue                                      589
Cost of Revenue                             3047
Gross Profit                                3071
Operating Expenses                            51
Selling, General & Administrative           1776
Operating Income (Loss)                        0
Non-Operating Income (Loss)                  133
Interest Expense, Net                       3877
Pretax Income (Loss), Adj.                     0
Abnormal Gains (Losses)                        0
Pretax Income (Loss)                           0
Income Tax (Expense) Benefit, Net           3042
Income (Loss) from Continuing Operations       0
Net Extraordinary Gains (Losses)               0
Net Income                                     0
Net Income (Common)                            0
dtype: int64

In [89]:
# Columns to iterate thorugh
columns = ['Revenue','Cost of Revenue','Gross Profit','Operating Expenses','Selling, General & Administrative',
           'Non-Operating Income (Loss)','Interest Expense, Net','Income Tax (Expense) Benefit, Net']

# Getting a list with all company tickers
companylist = list(df_income['Ticker'].unique())

In [90]:
# setting mean for missing NaN
for column in columns:
    print(column)
    for company in companylist:
        df_income.loc[ (df_income[column].isna()) & (df_income['Ticker']==company), [column]] = get_mean(company,column)

Revenue
Cost of Revenue
Gross Profit
Operating Expenses
Selling, General & Administrative
Non-Operating Income (Loss)
Interest Expense, Net
Income Tax (Expense) Benefit, Net


In [91]:
df_income.isna().sum()

Ticker                                         0
Fiscal Year                                    0
Fiscal Period                                  0
Revenue                                      125
Cost of Revenue                             1675
Gross Profit                                1678
Operating Expenses                            19
Selling, General & Administrative           1390
Operating Income (Loss)                        0
Non-Operating Income (Loss)                   21
Interest Expense, Net                       2430
Pretax Income (Loss), Adj.                     0
Abnormal Gains (Losses)                        0
Pretax Income (Loss)                           0
Income Tax (Expense) Benefit, Net           1440
Income (Loss) from Continuing Operations       0
Net Extraordinary Gains (Losses)               0
Net Income                                     0
Net Income (Common)                            0
dtype: int64

At this point I will not delete any more companies and see if there is still a problem after choosing the companies for wich we have rating data.

#### Saving our dataframes again:


In [92]:
df_industries.to_csv('Prepared Frames/industry_data.csv', index=False)
df_companies.to_csv('Prepared Frames/company_data.csv', index=False)
df_balance.to_csv('Prepared Frames/balance_data.csv', index=False)
df_cashflow.to_csv('Prepared Frames/cashflow_data.csv', index=False)
df_income.to_csv('Prepared Frames/income_data.csv', index=False)