# Download and transform all holdings data

Download the holdings data from each ETF provider in its native format (csv, xls, xlsx).

Extract the holdings and map each holding to its unique identifier, the ISIN.

Extract as much information as possible and push the transformed data to S3.

## Problems:
iShares changed the columns it provides for holdings: the ISIN column is no longer available, so holdings have to be mapped via the name or via (Symbol, Exchange).
This is really cumbersome and can become a serious data quality problem.

Work in progress.






In [1]:
# General
import pandas as pd
from datetime import datetime
import random
import time

# File download
import json
import urllib3
from tqdm.notebook import tqdm, trange

# AWS S3 Storage
import boto3

# Custom modules
from FundDataScraper import readData_ETF, cleanData_ETF
import Tools

# Hidden configurations
from mySecrets import config_file, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_BUCKET


In [2]:
# Load the ETF overview sheet: one row per fund with its ISIN, provider and download URL
scraping_columns = ['URL_Positions', 'URL_Positions_FileType']

config = (
    pd.read_excel(config_file, sheet_name = "ETF_Overview", header = 1)
    # Keep only the columns relevant for scraping
    .loc[:, ['Security_ISIN', 'Fund_Company', 'Type'] + scraping_columns]
    # Funds without a URL or a file type cannot be scraped
    .dropna(subset = scraping_columns)
)

config.head()

Unnamed: 0,Security_ISIN,Fund_Company,Type,URL_Positions,URL_Positions_FileType
12,IE00B8KGV557,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,csv
13,IE00B86MWN23,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,csv
14,IE00B8FHGS14,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,csv
15,IE00B6SPMN59,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,csv
16,IE00B27YCP72,iShares,Aktien,https://www.ishares.com/de/privatanleger/de/pr...,csv


In [3]:
# Connect to AWS S3 storage
# Tools.S3 is the project's own S3 helper (custom module); the credentials come from mySecrets.
# The resulting `s3` object is reused by the download and transform cells below.
s3 = Tools.S3()
s3.connect(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

Connected.


In [7]:
# Download data from brokers and store on S3
#
# For every fund in `config` the holdings file is streamed from the provider's URL
# straight into the bucket under data/position_raw/<date>/<ISIN>.<suffix>.
# A failed download is reported and skipped, so one broken URL does not abort the run.

date = datetime.now().strftime('%Y_%m_%d')

# One connection pool for all downloads instead of a new pool per file
http = urllib3.PoolManager()

# Pause lengths in seconds and their weights: short pauses are the most likely
sleep_seconds = range(1, 6)
sleep_weights = [5, 4, 3, 2, 1]

for i, record in tqdm(config.iterrows(), total = config.shape[0]):

    isin, url, file_suffix = record[['Security_ISIN', 'URL_Positions', 'URL_Positions_FileType']]

    # File name -> date defines folder structure
    filename = 'data/position_raw/' + date + "/" + isin + "." + file_suffix

    # Download file to s3 storage
    response = None
    try:
        # preload_content = False keeps the body as a stream that upload_fileobj reads from
        response = http.request('GET', url, preload_content = False)

        if response.status == 200:
            # Same bucket the transform step lists the raw files from
            s3.client.upload_fileobj(response, AWS_BUCKET, filename)
        else:
            # Do not store an error page as if it were a holdings file
            print("Download failed - " + isin + " (HTTP " + str(response.status) + ")")
            response.close()
    except urllib3.exceptions.HTTPError as error:
        print("Download failed - " + isin + " (" + repr(error) + ")")
    finally:
        # A streamed response must be handed back to the pool explicitly
        if response is not None:
            response.release_conn()

    # Slowdown process to not get blocked
    time_to_sleep = random.choices(sleep_seconds, weights = sleep_weights, k = 1)
    time.sleep(time_to_sleep[0])

  0%|          | 0/277 [00:00<?, ?it/s]

In [8]:
# Download data from S3, transform and upload transformed version
#
# For every raw file stored today: download it, read it with the provider-specific
# reader, add the ISIN column where the provider no longer delivers it and upload
# the result to data/position_trans/<date>/. Files that cannot be read are reported
# and skipped.

date = datetime.now().strftime('%Y_%m_%d')

# Get list of available files -> do not know yet, if all uploads were successful
file_list = s3.listFiles(AWS_BUCKET, "data/position_raw/" + date)

# Subset config file to available data
df_file = pd.DataFrame(file_list, columns = ['key_raw'])
df_file['Security_ISIN'] = df_file['key_raw'].str.extract(r"([A-Z0-9]{12})", expand = False).fillna('no match')

# Drop a 'key_raw' column left over from an earlier run of this cell,
# otherwise the join fails on the overlapping column name
config = config.drop(columns = ['key_raw'], errors = 'ignore')
config = config.join(df_file.set_index(['Security_ISIN']), on = 'Security_ISIN', how = 'inner')

# Read mapping table - required for (symbol, exchange) to (isin) mapping. iShares special case
s3.downloadFile(
        AWS_BUCKET, 
        'config/mapping/map_cs_isin_sym.tsv', 
        './temp/' + 'map_cs_isin_sym.tsv'
    )

map_isin_symbol = pd.read_csv('./temp/map_cs_isin_sym.tsv', sep = "\t")
map_isin_symbol = map_isin_symbol[['ISIN', 'Symbol', 'Exchange']].rename(columns = {'Symbol': 'TICKER', 'Exchange': 'EXCHANGE'})

# Index the mapping once; it is the same for every file
map_by_symbol = map_isin_symbol.set_index(['TICKER', 'EXCHANGE'])

# Download raw files, extract information and clean
for i, record in tqdm(config.iterrows(), total = config.shape[0]):

    key, isin, fund_company, file_suffix = record[['key_raw', 'Security_ISIN', 'Fund_Company', 'URL_Positions_FileType']]

    filename = isin + '.' + file_suffix

    s3.downloadFile(AWS_BUCKET, key, './temp/' + filename)

    # Read file and harmonize column names
    try:
        fund_requests, data = readData_ETF(fund_company, './temp/', filename, isin)
    except Exception as error:
        # Skip this file: continuing would upload the previous fund's `data` under this ISIN
        print("Error - " + isin + " (" + repr(error) + ")")
        continue

    # In 2021, iShares dropped the ISIN column. New key: (TICKER, EXCHANGE)
    if 'ISIN' not in data.columns:

        data = data.join(map_by_symbol, on = ['TICKER', 'EXCHANGE'])

    # Clean data (e.g. aggregate over duplicates, unknown securities; harmonize 'TICKER', 'NAME', 'SECTOR', 'EXCHANGE', 'COUNTRY', 'CCY')
    # data = cleanData_ETF(data)

    # Adjust WEIGHTS to 1 - they can be 100 % / 1 / 0
    # data['WEIGHT'] = data['WEIGHT'] / data['WEIGHT'].sum()

    # Upload to S3
    Tools.uploadFileS3(s3, data, isin, "data/position_trans/" + date + "/", AWS_BUCKET)


  0%|          | 0/278 [00:00<?, ?it/s]

Error - LU0455009935
Error - LU0455009851
Error - LU0455008887
Error - LU0455009182
Error - LU0514695856
Error - LU0476289896
Error - LU0659580236
Error - LU0514694537
Error - LU0592215668
Error - LU0455009265
Error - LU0755279428
Error - LU0514694966
Error - LU0975326215
Error - LU0975334821
Error - LU1409136006
Error - LU0962081203
Error - LU0356591882
Error - LU1574142243
Error - IE00BYPLS672
Error - IE00BYPLS672
