# Distributions

Collect and extract data about distributions from different sources


## Data Sources
### iShares
Data is provided as an easy download. The file itself is an XML document disguised as an xls file, which makes extraction a bit tricky. See the extractFromXML* functions for details.

### XTrackers
Data not easy to download. Probably needs to be scraped from website. Postponed.

### Lyxor
Data is easy to download and to extract.

In [27]:
# Standard library
import json
import os
import random
import time
from datetime import datetime

# Third-party: data handling, file download, progress bars, AWS S3 storage
import boto3
import pandas as pd
import urllib3
from tqdm.notebook import tqdm, trange

# Custom modules
import Tools
from Tools import S3, extractFromXML_read_file, extractFromXML_trans_df

# Hidden configurations
from mySecrets import config_file, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_BUCKET


In [3]:
# Load the (ISIN / provider) mapping from the configuration workbook
config = pd.read_excel(config_file, sheet_name="ETF_Overview", header=1)

# iShares ships its dividend data inside the overview file, so the
# overview URL and file type double as the dividend URL and file type
is_ishares = config['Fund_Company'] == 'iShares'
config.loc[is_ishares, 'URL_Dividend'] = config.loc[is_ishares, 'URL_Overview']
config.loc[is_ishares, 'URL_Dividend_FileType'] = config.loc[is_ishares, 'URL_Overview_FileType']

# Keep only the relevant columns and drop every row whose scraping
# information (URL or file type) is incomplete
relevant_columns = [
    'Security_ISIN',
    'Fund_Company',
    'Type',
    'URL_Dividend',
    'URL_Dividend_FileType',
]
config = config[relevant_columns].dropna(
    subset=['URL_Dividend', 'URL_Dividend_FileType']
)

config.head()

Unnamed: 0,Security_ISIN,Fund_Company,Type,URL_Dividend,URL_Dividend_FileType
12,IE00B8KGV557,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,xls
13,IE00B86MWN23,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,xls
14,IE00B8FHGS14,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,xls
15,IE00B6SPMN59,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,xls
16,IE00B27YCP72,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,xls


In [1]:
# Connect to AWS S3 storage
# S3 is imported from the project's Tools module; the credentials are imported from mySecrets.
# The resulting `s3` object is reused by the later cells to list, download and upload files.
s3 = S3()
s3.connect(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

In [18]:
# Download data from brokers and store on S3

# This code block takes a long time to execute. It is commented out to prevent unintended execution.

# date = datetime.now().strftime('%Y_%m_%d')

# for i, record in tqdm(config.iterrows(), total = config.shape[0]):

#     isin, url, file_suffix = record[['Security_ISIN', 'URL_Dividend', 'URL_Dividend_FileType']]

#     # File name -> date defines folder structure
#     filename = 'data/distribution_raw/' + date + "/" + isin + "." + file_suffix

#     # Download file to s3 storage
#     http = urllib3.PoolManager()
#     s3.client.upload_fileobj(
#         http.request('GET', url, preload_content = False), 
#         "pomato", 
#         filename
#     )

#     # Slowdown process to not get blocked
#     time_to_sleep = random.choices(range(1,6), k = 1, weights = [i for i in reversed(range(1, 6))])
#     time.sleep(time_to_sleep[0])

  0%|          | 0/347 [00:00<?, ?it/s]

In [23]:
# Download the raw distribution files of one snapshot date from S3 into ./temp

# Snapshot to process; also read by the transform cell to build the target prefix
date = "2022_03_24"

# Get list of available files -> it is not known yet whether all uploads were successful
file_list = s3.listFiles(AWS_BUCKET, "data/distribution_raw/" + date)

# Subset config to the securities for which a raw file is available.
# The ISIN is taken from the object key as the first run of 12 upper-case
# letters/digits; keys without such a run get the placeholder 'no match',
# which the inner join against config discards unless config holds that same value.
df_file = pd.DataFrame(file_list, columns=['key_raw'])
df_file['Security_ISIN'] = df_file['key_raw'].str.extract(r"([A-Z0-9]{12})", expand=False).fillna('no match')
config_sub = config.join(df_file.set_index(['Security_ISIN']), on='Security_ISIN', how='inner')

# Preview of the matched subset (Jupyter only renders it when it is the
# last expression of a cell)
config_sub.head()

# Make sure the local scratch directory exists before anything is written to it
os.makedirs('./temp', exist_ok=True)

for _, record in tqdm(config_sub.iterrows(), total=config_sub.shape[0]):

    key, isin, file_suffix = record[['key_raw', 'Security_ISIN', 'URL_Dividend_FileType']]

    # Local file name mirrors the naming used on S3: <ISIN>.<file type>
    filename = isin + '.' + file_suffix

    s3.downloadFile(AWS_BUCKET, key, './temp/' + filename)

Unnamed: 0,Security_ISIN,Fund_Company,Type,URL_Dividend,URL_Dividend_FileType,key_raw
12,IE00B8KGV557,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,xls,data/distribution_raw/2022_03_24/IE00B8KGV557.xls
13,IE00B86MWN23,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,xls,data/distribution_raw/2022_03_24/IE00B86MWN23.xls
14,IE00B8FHGS14,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,xls,data/distribution_raw/2022_03_24/IE00B8FHGS14.xls
15,IE00B6SPMN59,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,xls,data/distribution_raw/2022_03_24/IE00B6SPMN59.xls
16,IE00B27YCP72,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,xls,data/distribution_raw/2022_03_24/IE00B27YCP72.xls


In [24]:
# Transform the downloaded raw files into distribution tables and upload them

# Parameters for the xml reader (identical for every iShares file, so built once)
ns = {"ss": "urn:schemas-microsoft-com:office:spreadsheet"}
ws_config = {
    'Ausschüttungen': {'row_start': 0}
}

# S3 prefix that receives the transformed tables of this snapshot date
target_prefix = "data/distribution_trans/" + date + "/"


def _sanitize_ishares_xml(path):
    """Rewrite the file at *path* in place so that it can be parsed as XML.

    Some files have cryptic characters before ``<?xml...``, so the first
    line is replaced by a clean XML declaration. All ``&`` characters are
    removed as well.

    Raises:
        ValueError: if the file is empty (for example after a failed download).
    """
    # The declaration written below names no encoding, so an XML parser
    # will assume UTF-8; read and write with that encoding explicitly
    # instead of relying on the platform default.
    with open(path, encoding="utf-8") as f:
        lines = f.readlines()

    if not lines:
        raise ValueError("file is empty: " + path)

    # Remove cryptic characters
    lines[0] = '<?xml version="1.0"?>\n'

    # Remove '&' and similar
    lines = [line.replace("&", "") for line in lines]

    with open(path, "w", encoding="utf-8") as f:
        f.writelines(lines)


def _ishares_to_frame(path, isin):
    """Return the distribution table of one iShares file, or None if there is none.

    NOTE(review): presumably extractFromXML_read_file returns a mapping from
    worksheet name to a list of rows with the header row first -- verify
    against the Tools module.
    """
    _sanitize_ishares_xml(path)

    dict_tables = extractFromXML_read_file(path, ws_config, ns)

    # Not all funds distribute -> the result can be empty
    if not dict_tables:
        return None

    rows = dict_tables['Ausschüttungen']

    # Tables can have a different number of columns -> take the header from the first row
    df = pd.DataFrame(rows[1:], columns=rows[0])

    # Tables can have different column names -> unify the ones that are known
    df = df.rename(columns={'Ex-Tag': 'Dist_Date_Ex', 'Gesamtausschüttung': 'Dist_Amount'})
    df['Security_ISIN'] = isin
    return df


def _lyxor_to_frame(path):
    """Return the distribution table of one Lyxor xls file."""
    df = pd.read_excel(path, skiprows=8)

    # Assign new column names: the original ones contain spaces, line breaks
    # and other clutter
    df.columns = ['Security_Name', 'Security_ISIN', 'Dist_Date_Ex', 'Dist_CCY', 'Dist_Amount']

    # Drop empty rows - actually empty or comments and irrelevant data
    return df.dropna(subset=['Security_Name', 'Security_ISIN'], axis='rows', how='any')


for _, record in tqdm(config_sub.iterrows(), total=config_sub.shape[0]):

    isin, fund_company, file_suffix = record[['Security_ISIN', 'Fund_Company', 'URL_Dividend_FileType']]

    filename = 'temp/' + isin + '.' + file_suffix

    if fund_company not in ("iShares", "Lyxor"):
        print(("Error - not supported: ", fund_company))
        continue

    # Best effort per file: a single broken download is reported together
    # with its cause and must not abort the rest of the batch.
    # `except Exception` keeps Ctrl-C (KeyboardInterrupt) working.
    try:
        if fund_company == "iShares":
            df_temp = _ishares_to_frame(filename, isin)
        else:
            df_temp = _lyxor_to_frame(filename)

        # Nothing to upload for funds without distributions
        if df_temp is not None:
            # Upload to S3
            Tools.uploadFileS3(s3, df_temp, isin, target_prefix, AWS_BUCKET)
    except Exception as err:
        print((isin, "Error", repr(err)))

  0%|          | 0/347 [00:00<?, ?it/s]