In [None]:
# default_exp final

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
# nbdev function - ensures that changed libraries from the project are reloaded
%load_ext autoreload
%autoreload 2

# Final preparations

In [None]:
# imports
from bfh_mt_hs2020_sec_data.core import * 
from pathlib import Path
from typing import List, Tuple, Union, Set
import glob

import pandas as pd
import numpy as np

import shutil          # provides high level file operations
import time            # used to measure execution time
import os
import sys

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from yahoo_historical import Fetcher
import yfinance as yf

In [None]:
# Folder locations used by the cells below. Every folder ends with "/" because
# file names are appended by plain string concatenation.
all_data_local_folder = "./data/"           # intermediate CSV files read and written by this notebook
stock_data_folder = "D:/data/stocks/sec/"   # searched recursively for *_processed.csv files
training_set_folder = "D:/data/mt/"         # destination of the final training data

# Identifier columns of a company report (not referenced by the visible cells).
join_group = [
    "cik",
    "ticker",
    "adsh",
    "period",
    "filed",
    "form",
    "fp",
]

## 00_Tools

In [None]:
def load_sec_features():
    """Read ``07_all_features_complete_corrected.csv`` from ``all_data_local_folder``.

    Returns:
        The file content as a pandas DataFrame; the first line is used as header.
    """
    features_path = all_data_local_folder + "07_all_features_complete_corrected.csv"
    return pd.read_csv(features_path, header=0)

In [None]:
def read_additional_info() -> pd.DataFrame:
    """Read ``08_add_ticker_info.csv`` and keep only the rows without a message.

    Returns:
        The rows whose ``message`` column is NaN, with a fresh 0..n-1 index.
    """
    info_path = all_data_local_folder + "08_add_ticker_info.csv"
    raw_info = pd.read_csv(info_path, sep=',', encoding='utf-8', header=0)
    has_no_message = raw_info['message'].isna()
    return raw_info.loc[has_no_message].reset_index(drop=True)

In [None]:
def get_stock_data_files() -> List[str]:
    """Return the paths of all ``*_processed.csv`` files found recursively below ``stock_data_folder``."""
    search_pattern = stock_data_folder + "**/*_processed.csv"
    return glob.glob(search_pattern, recursive=True)

In [None]:
def get_ticker_from_filename(filename: str) -> str:
    """Extract the ticker symbol from the path of a ``<ticker>_processed.csv`` file.

    The ticker is taken from the file name itself rather than from fixed
    character offsets, so the result no longer depends on the length of
    ``stock_data_folder`` or on the file sitting exactly one single-character
    sub-folder below it.

    Args:
        filename: Path of a stock data file; both ``/`` and ``\\`` are accepted
            as path separators.

    Returns:
        The file name without its directory part and without the
        ``_processed.csv`` suffix. If the suffix is missing, the bare file
        name is returned unchanged.
    """
    suffix = "_processed.csv"
    # Paths produced by glob can mix "/" and "\\", so normalise before splitting.
    base_name = filename.replace("\\", "/").rsplit("/", 1)[-1]
    if base_name.endswith(suffix):
        return base_name[:-len(suffix)]
    return base_name

## 01_additional company information

### Sector and Industries
Important features are the industry and the sector a company works in. In order to use these during training, we have to one-hot encode them.
There are about 12 different sectors and about 142 different industries in the data.

In [None]:
# Load the additional company information, then print its row count and the
# number of distinct values in the sector and industry columns.
add_info = read_additional_info()
print("length of dataset:           ", len(add_info.index))
print("number of unique sectors:    ", add_info['sector'].unique().size)
print("number of unique industries: ", add_info['industry'].unique().size)

length of dataset:            2065
number of unique sectors:     12
number of unique industries:  142


In [None]:
# Remove spaces from the sector names so they can be used in column names.
# regex=False makes the literal replacement explicit; the default of `regex`
# differs between pandas versions.
add_info['sector'] = add_info.sector.str.replace(' ', '', regex=False)

# One indicator column per sector. With prefix 'sec_' and the default
# separator the columns are named 'sec__<sector>' (double underscore).
sector_dummies = pd.get_dummies(add_info.sector, prefix='sec_')
add_info = pd.concat([add_info, sector_dummies], axis=1)

It doesn't make sense to one-hot encode all 142 industries. We will only use industries which appear more than 20 times in the data (i.e. in roughly one percent of the data). Other industries will be set to "Other".

In [None]:
# Remove spaces, ampersands and em dashes from the industry names so they can
# be used in column names. regex=False makes the literal replacement explicit;
# the default of `regex` differs between pandas versions.
add_info['industry'] = (
    add_info.industry
    .str.replace(' ', '', regex=False)
    .str.replace('&', '', regex=False)
    .str.replace('—', '', regex=False)
)

In [None]:
# Count the rows per industry and keep the names that appear more than 20 times.
df_industries = add_info['industry'].value_counts()
df_relevant_industries = df_industries.loc[df_industries > 20].index.tolist()

# Every row whose industry is not in that list is relabelled "Other".
is_rare_industry = ~add_info['industry'].isin(df_relevant_industries)
add_info.loc[is_rare_industry, 'industry'] = "Other"

In [None]:
# One indicator column per remaining industry, named 'ind__<industry>'.
industry_dummies = pd.get_dummies(add_info['industry'], prefix='ind_')
add_info = pd.concat([add_info, industry_dummies], axis=1)
add_info.shape

(2065, 47)

### Market Capitalization
We will use the Market Capitalization to rank the companies and then create features like top 10, top 100, top 200, ...

In [None]:
# Rank the companies by market capitalization; descending, so rank 1 is the largest.
add_info['marketCap_rank'] = add_info['marketCap'].rank(ascending=False)

In [None]:
# One float indicator column per "top N by market capitalization" bucket:
# 1.0 if the company's rank is <= N, otherwise 0.0 (also for a missing rank,
# because the comparison is False for NaN).
for top_n in (10, 20, 30, 50, 100, 200, 500, 1000):
    add_info['mc_top' + str(top_n)] = (add_info.marketCap_rank <= top_n).astype(float)

add_info.shape

(2065, 56)

### Drop unnecessary columns and save

In [None]:
# The raw text columns and the helper rank are replaced by the indicator
# columns created above, so they are dropped before the features are saved.
obsolete_columns = ['message', 'marketCap_rank', 'sector', 'industry']
add_info = add_info.drop(columns=obsolete_columns)
add_info.to_csv(all_data_local_folder + '09_add_ticker_info_features.csv', header=True, index=False)

## 02_Company Reports

In [None]:
# Load the company-report features (07_all_features_complete_corrected.csv) used by the following checks.
df_sec = load_sec_features()

### Check Ticker and CIK in 07_all_features_complete_corrected.csv
It could be that the same ticker symbol is used for several CIKs. This could happen if a company is delisted or if companies are part of a holding. However, we have to ensure that the mapping between the ticker and the CIK we are using is unambiguous.

In [None]:
# Reduce the report table to its distinct (cik, ticker) combinations.
df_ciktik = df_sec.loc[:, ['cik', 'ticker']]
df_ciktik_unique = df_ciktik.drop_duplicates()

In [None]:
# For every ticker: the number of distinct CIKs it is combined with.
df_cik_to_tickers = df_ciktik_unique.ticker.value_counts()

# Distribution of those counts, i.e. how many tickers are used for 1, 2, ... CIKs.
# Ideally there is a single row with index 1. If there are more rows, we know
# that some tickers are used for several different CIKs.
df_cik_to_tickers.value_counts()

1    3050
Name: ticker, dtype: int64

In [None]:
# Tickers that are combined with more than one CIK.
used_mulitple_times = df_cik_to_tickers.loc[df_cik_to_tickers > 1].index.tolist()
used_mulitple_times

[]

In [None]:
# Show the (cik, ticker) rows of the ambiguous tickers, ordered by ticker.
is_ambiguous = df_ciktik_unique['ticker'].isin(used_mulitple_times)
df_ciktik_unique.loc[is_ambiguous].sort_values('ticker')

Unnamed: 0,cik,ticker


### add one hot_encoding for financial period

In [None]:
# One indicator column per value of the 'fp' column, named 'fp__<value>'.
fp_dummies = pd.get_dummies(df_sec['fp'], prefix='fp_')
df_sec = pd.concat([df_sec, fp_dummies], axis=1)
df_sec.shape

(94599, 58)

In [None]:
# Persist the report features including the new fp indicator columns (header row, no index column).
df_sec.to_csv(all_data_local_folder + '09_company_reports.csv', header=True, index=False)

## 05_Check completeness of data
There are now 3 datasets with features:
- 07_all_features_complete_corrected.csv contains comparable data from the business reports
- [ticker]_processed.csv contains the historical stock data
- 09_add_ticker_info_features.csv contains additional information for a company

for the training, we can only use the data of companies that are present in all 3 datasets 

In [None]:
# All stock data files, plus the ticker symbol derived from each file name.
sd_files = get_stock_data_files()
ticker_sd_files = list(map(get_ticker_from_filename, sd_files))

In [None]:
# Distinct tickers available in each of the three datasets.
set_ticker_add_info = set(add_info['ticker'].unique())
set_ticker_df_sec = set(df_sec['ticker'].unique())
set_ticker_sd = set(ticker_sd_files)

for ticker_set in (set_ticker_add_info, set_ticker_df_sec, set_ticker_sd):
    print(len(ticker_set))

2065
3050
2294


In [None]:
# Tickers for which all 3 datasets contain data.
intersected = set_ticker_add_info & set_ticker_df_sec & set_ticker_sd
len(intersected)

2053

### save/copy data of companies present in all 3 datasets to a special folder

In [None]:
# Keep only companies present in all 3 datasets, ordered by ticker, and save them.
info_in_all_datasets = add_info['ticker'].isin(intersected)
add_info_cleaned = add_info.loc[info_in_all_datasets].sort_values('ticker')
add_info_cleaned.to_csv(training_set_folder + "company_info.csv", header=True, index=False)

In [None]:
# Keep only reports of companies present in all 3 datasets, ordered by ticker and period, and save them.
reports_in_all_datasets = df_sec['ticker'].isin(intersected)
df_sec_cleaned = df_sec.loc[reports_in_all_datasets].sort_values(['ticker', 'period'])
df_sec_cleaned.to_csv(training_set_folder + "company_reports.csv", header=True, index=False)

In [None]:
import string  # only needed in this cell; os and shutil are imported at the top of the notebook

stocks_root = training_set_folder + "stocks/"

# Create one sub-folder per capital letter. exist_ok=True makes the cell safe to re-run.
for char in string.ascii_uppercase:
    os.makedirs(stocks_root + char, exist_ok=True)

# Copy the stock data of every company present in all 3 datasets to
# stocks/<first character of ticker>/<ticker>.csv
for file in sd_files:
    ticker = get_ticker_from_filename(file)
    if ticker in intersected:
        target_folder = stocks_root + ticker[0]
        # A ticker may start with a character other than A-Z; make sure its folder exists.
        os.makedirs(target_folder, exist_ok=True)
        shutil.copy(file, target_folder + "/" + ticker + ".csv")

## 06_Combine Stockdata into a single dataset
In order to be able to normalize the stock data, we need to have it all in a single dataset. <br>
Since we don't know yet how we will use it, we store the data with Spark as a Parquet dataset and as a plain CSV dataset.

In [None]:
# init Spark
spark = get_spark_session() # create the session
spark # display the most important information of the session

In [None]:
# Read every file below <training_set_folder>/stocks/<sub-folder>/ into one Spark DataFrame (first line = header).
# NOTE(review): `start` is not read again in the visible cells - presumably a leftover timing probe.
start = time.time()
df_all = spark.read.csv(training_set_folder + "stocks/*/*", header=True, dateFormat="yyyy-MM-dd")

In [None]:
from pyspark.sql.functions import col

# Repartition into 16 partitions by ticker before writing the Parquet dataset.
df_all_partioned = df_all.repartition(16, col("ticker"))
# training_set_folder already ends with "/", so no extra separator is added
# (the previous "/all_stock_data" produced a path containing "//").
df_all_partioned.write.parquet(training_set_folder + "all_stock_data")

In [None]:
# Convert the combined Spark DataFrame into a pandas DataFrame for the CSV export below.
pd_df = df_all.toPandas()

In [None]:
# Write all stock data, ordered by ticker and date, into a single CSV file.
pd_df_sorted = pd_df.sort_values(['ticker', 'Date'])
pd_df_sorted.to_csv(training_set_folder + "all_stock_data.csv", header=True, index=False)

In [None]:
# Shut down the Spark session; it is not used after this point in the visible cells.
spark.stop()

## XX_Trials

In [None]:
# NOTE(review): scratch cell. It reads the 'sector' and 'industry' columns, which the
# "Drop unnecessary columns and save" cell removes from add_info, so it fails when the
# notebook is run top to bottom.
print(len(add_info.sector.unique()))
print(len(add_info.industry.unique()))

12
142


In [None]:
# Number of companies per sector.
# NOTE(review): scratch cell. 'sector' is dropped from add_info in the
# "Drop unnecessary columns and save" cell, so this fails in a top-to-bottom run.
add_info.sector.value_counts()

FinancialServices        375
Industrials              324
Technology               293
Healthcare               264
ConsumerCyclical         235
RealEstate               131
Energy                   115
ConsumerDefensive        102
BasicMaterials            85
CommunicationServices     81
Utilities                 54
IndustrialGoods            1
Name: sector, dtype: int64

In [None]:
# Industries that appear more than 20 times.
# NOTE(review): scratch cell. 'industry' is dropped from add_info in the
# "Drop unnecessary columns and save" cell, so this fails in a top-to-bottom run.
industries = add_info.industry.value_counts()
industries[industries > 20].index.tolist()

['Other',
 'BanksRegional',
 'Biotechnology',
 'SoftwareApplication',
 'SpecialtyIndustrialMachinery',
 'OilGasEP',
 'Semiconductors',
 'MedicalDevices',
 'InformationTechnologyServices',
 'PackagedFoods',
 'InsurancePropertyCasualty',
 'SpecialtyChemicals',
 'SoftwareInfrastructure',
 'MedicalInstrumentsSupplies',
 'AutoParts',
 'CommunicationEquipment',
 'SpecialtyRetail',
 'AssetManagement',
 'DiagnosticsResearch',
 'ElectronicComponents',
 'AerospaceDefense',
 'Restaurants',
 'OilGasEquipmentServices',
 'ScientificTechnicalInstruments',
 'CreditServices',
 'OilGasMidstream',
 'SpecialtyBusinessServices',
 'ApparelRetail',
 'SemiconductorEquipmentMaterials']

In [None]:
# Companies with a market-cap rank below 15, best rank first.
# NOTE(review): scratch cell. 'marketCap_rank' is dropped from add_info in the
# "Drop unnecessary columns and save" cell, so this fails in a top-to-bottom run.
add_info[add_info.marketCap_rank < 15].sort_values('marketCap_rank')

Unnamed: 0,ticker,sector,industry,marketCap,sharesOutstanding,message,sec__BasicMaterials,sec__CommunicationServices,sec__ConsumerCyclical,sec__ConsumerDefensive,...,ind__SpecialtyRetail,marketCap_rank,mc_top10,mc_top20,mc_top30,mc_top50,mc_top100,mc_top200,mc_top500,mc_top1000
2,AAPL,Technology,Other,2081190000000.0,17102500000.0,,0,0,0,0,...,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1951,MSFT,Technology,SoftwareInfrastructure,1612352000000.0,7560500000.0,,0,0,0,0,...,0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
914,AMZN,ConsumerCyclical,Other,1563667000000.0,500890000.0,,0,0,1,0,...,0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
203,FB,CommunicationServices,Other,779149700000.0,2403970000.0,,0,1,0,0,...,0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
834,TSLA,ConsumerCyclical,Other,578210100000.0,947901000.0,,0,0,1,0,...,0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
61,V,FinancialServices,CreditServices,454637600000.0,1695680000.0,,0,0,0,0,...,0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1699,WMT,ConsumerDefensive,Other,415905600000.0,2829290000.0,,0,0,0,1,...,0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1480,JNJ,Healthcare,Other,402647000000.0,2632540000.0,,0,0,0,0,...,0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
740,JPM,FinancialServices,Other,364442800000.0,3048200000.0,,0,0,0,0,...,0,9.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1132,PG,ConsumerDefensive,Other,338491600000.0,2479610000.0,,0,0,0,1,...,0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Distinct industry names; rebinds `industries`, which an earlier trial cell used for the value counts.
# NOTE(review): scratch cell. 'industry' is dropped from add_info in the
# "Drop unnecessary columns and save" cell, so this fails in a top-to-bottom run.
industries = add_info.industry.unique()

In [None]:
# Group the industry names by keyword (case-sensitive substring match).
reit_list = [name for name in industries if 'REIT' in name]
real_list = [name for name in industries if 'RealEstate' in name]
gasoil_list = [name for name in industries if 'Gas' in name]
health_list = [name for name in industries if 'Health' in name]
insurance_list = [name for name in industries if 'Insurance' in name]

# Everything that matched none of the keywords above.
rest = set(industries).difference(reit_list, gasoil_list, real_list, health_list, insurance_list)

In [None]:
# Size of every keyword group, followed by the names that matched no keyword.
for industry_group in (reit_list, gasoil_list, real_list, health_list, insurance_list, rest):
    print(len(industry_group))
print(rest)

9
7
3
3
6
115
{'CapitalMarkets', 'ApparelManufacturing', 'FoodDistribution', 'Airlines', 'AutoTruckDealerships', 'DiversifiedMachinery', 'BuildingProductsEquipment', 'Semiconductors', 'LumberWoodProduction', 'Entertainment', 'ResidentialConstruction', 'Lodging', 'Tobacco', 'DrugManufacturersSpecialtyGeneric', 'RecreationalVehicles', 'MortgageFinance', 'InformationTechnologyServices', 'AgriculturalInputs', 'Steel', 'MedicalCareFacilities', 'ElectronicComponents', 'Gold', 'SoftwareApplication', 'UtilitiesRegulatedWater', 'Leisure', 'MarineShipping', 'TravelServices', 'SpecialtyRetail', 'FarmHeavyConstructionMachinery', 'AutoParts', 'ApparelRetail', 'FinancialConglomerates', 'SpecialtyIndustrialMachinery', 'Railroads', 'BeveragesBrewers', 'AirportsAirServices', 'Confectioners', 'TelecomServices', 'Chemicals', 'AssetManagement', 'InternetContentInformation', 'Broadcasting', 'HomeImprovementRetail', 'WasteManagement', 'PaperPaperProducts', 'ShellCompanies', 'LuxuryGoods', 'UtilitiesIndepend

In [None]:
# Reload the untouched additional info for inspection. read_additional_info()
# already keeps only rows without a message, so the filter below selects every row.
adf = read_additional_info()
adf.loc[adf['message'].isna()]

Unnamed: 0,ticker,sector,industry,marketCap,sharesOutstanding,message
0,AAL,Industrials,Airlines,1.041370e+10,6.107740e+08,
1,AAOI,Technology,Semiconductors,1.782953e+08,2.297620e+07,
2,AAPL,Technology,Consumer Electronics,2.081190e+12,1.710250e+10,
3,AAME,Financial Services,Insurance—Life,4.634046e+07,2.041430e+07,
4,AAN,Consumer Cyclical,Specialty Retail,5.985550e+08,3.377850e+07,
...,...,...,...,...,...,...
3037,FDP,Consumer Defensive,Farm Products,1.228697e+09,4.736690e+07,
3039,SCOR,Communication Services,Advertising Agencies,1.805105e+08,7.278650e+07,
3042,MDRX,Healthcare,Health Information Services,2.265462e+09,1.579820e+08,
3044,GILD,Healthcare,Drug Manufacturers—General,7.616448e+10,1.253530e+09,
