# Get mapping tables from different sources

In [3]:
# General
import os
import xml.etree.ElementTree as ET

import pandas as pd

# Custom modules
import Tools

# Hidden configurations
from mySecrets import config_file, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_BUCKET


In [2]:
# Connect to AWS S3 storage
# Tools.S3 is a project-local wrapper (implementation not shown in this notebook).
# The resulting `s3` object is reused by the later cells, both through
# s3.uploadFile(...) and through s3.client.upload_file(...).
s3 = Tools.S3()
# Credentials come from the untracked mySecrets module.
# NOTE(review): presumably this opens an authenticated session and prints "Connected." - confirm in Tools.
s3.connect(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

Connected.


## Config File

Extract mapping and configuration information from the config file and upload it to S3 for easy access.

In [20]:
# ETF overview
# Read the "ETF_Overview" sheet (column names are on the second row, hence header=1)
# and keep only the descriptive columns of each fund.
etf_info = (
    pd.read_excel(config_file, sheet_name="ETF_Overview", header=1)
    .loc[:, ['Security_ISIN', 'Security_Name', 'Security_Class', 'Fund_Company', 'Domicile', 'Type']]
)

# Publish the table next to the other mapping tables.
# NOTE(review): uploadFile is a project helper; presumably it stores the frame under dir + name - confirm in Tools.
s3.uploadFile(df=etf_info, name='etf_info', dir='config/mapping/', bucket=AWS_BUCKET)

In [11]:
# Info on equity securities
# Read the "Equity_Overview" sheet (column names are on the second row, hence header=1)
# and keep the identifying and classification columns.
# NOTE(review): 'Sektor' / 'Standort' look like German for sector / location - confirm.
equity_info = (
    pd.read_excel(config_file, sheet_name="Equity_Overview", header=1)
    .loc[:, ['Security_ISIN', 'Name', 'Sektor', 'Standort', 'Type', 'ISO3']]
)

# Publish the table next to the other mapping tables.
s3.uploadFile(df=equity_info, name='equity_info', dir='config/mapping/', bucket=AWS_BUCKET)

In [16]:
# Mapping from ISO3 country codes to regions
# The "Region" sheet has its column names on the first row (header=0).
map_iso3 = (
    pd.read_excel(config_file, sheet_name="Region", header=0)
    .loc[:, ['Region Name', 'Sub-region Name', 'Country or Area', 'ISO-alpha3 Code']]
)

# Replace the sheet's column labels with the snake_case names used downstream
# (same order as the selection above).
map_iso3.columns = ['region_name', 'region_sub_name', 'country_name', 'ISO3']

# Publish the table next to the other mapping tables.
s3.uploadFile(df=map_iso3, name='map_iso3', dir='config/mapping/', bucket=AWS_BUCKET)

In [25]:
# ETF ISIN -> Yahoo Finance symbol
# Same "ETF_Overview" sheet as the ETF info table, but keeping the
# Yahoo-Finance-related columns (symbol, exchange, currency) per ISIN.
etf_isin_symbol = (
    pd.read_excel(config_file, sheet_name="ETF_Overview", header=1)
    .loc[:, ['Security_ISIN', 'Sym_YahooFin', 'Ex_YahooFin', 'CCY_YahooFin']]
)

# Publish the table next to the other mapping tables.
s3.uploadFile(df=etf_isin_symbol, name='etf_isin_symbol', dir='config/mapping/', bucket=AWS_BUCKET)


## GLEIF

In [3]:
# ISIN to LEI
# Data Source: https://www.gleif.org/en/lei-data/lei-mapping/download-isin-to-lei-relationship-files#
# Data Size: 224 MB
# Data TSV Size: 283 MB

# NOTE(review): machine-specific absolute path - this cell only runs where the
# GLEIF download has been unpacked to exactly this location.
file_isin_to_lei = "C:/Users/phili/OneDrive/GitHub/Portfolio_Management/mapping/isin-lei-20220308T080155/ISIN_LEI_20220308.csv"

isin_to_lei = pd.read_csv(file_isin_to_lei)

# to_csv does not create missing parent directories, so make sure the scratch
# folder exists before writing (no-op when it is already there).
os.makedirs("temp", exist_ok=True)

# Re-export as tab-separated; the DataFrame index is written as the first column.
isin_to_lei.to_csv("temp/isin_to_lei.tsv", sep = "\t")


In [4]:
# Upload the locally written ISIN -> LEI table to the bucket.
filename_temp = "temp/isin_to_lei.tsv"              # local file to send
filename_upload = "config/mapping/isin_to_lei.tsv"  # name under which it is stored

# NOTE(review): argument order (local file, bucket, key) matches boto3's
# client.upload_file; presumably s3.client is a boto3 client - confirm in Tools.
s3.client.upload_file(filename_temp, AWS_BUCKET, filename_upload)

In [5]:
# LEI information - Level 1 data
# Data Source: https://www.gleif.org/en/lei-data/gleif-concatenated-file/download-the-concatenated-file
# Data Source size: 4.3 GB
# Data TSV select columns size: 161 MB

# pd.read_xml  breaks after 10min with "IO Encoder" Error
# ET.iterparse Takes 2min 21sec

# NOTE(review): machine-specific absolute path - this cell only runs where the
# GLEIF download has been stored at exactly this location.
filename = "C:/Users/phili/OneDrive/GitHub/Portfolio_Management/mapping/20220308-gleif-concatenated-file-lei2.xml"

# Stream the XML instead of loading it at once:
# https://stackoverflow.com/questions/14924200/loading-huge-xml-files-and-dealing-with-memoryerror
# With the default settings iterparse yields one "end" event per element,
# i.e. each element is seen once its closing tag has been read.
parser = ET.iterparse(filename)

# One dict per LEI record: {tag name without namespace: element text}
temp = []

for event, element in parser:
    # Drop the "{namespace}" prefix. rsplit(...)[-1] also works for a tag that
    # carries no namespace (split(...)[1] raised IndexError in that case).
    el = element.tag.rsplit('}', 1)[-1]

    # A closed <LEI> element starts a new record
    # (assumes LEI is the first element completed within each record - TODO confirm).
    if el == 'LEI':
        temp.append({})

    # Elements seen before the first LEI belong to no record and are skipped.
    # Tags that repeat within one record overwrite each other, so the last
    # occurrence wins (presumably relevant for City / Country - verify which one is wanted).
    if temp:
        temp[-1][el] = element.text

    # Release the element's content so the parsed tree does not keep growing
    element.clear()

df = pd.DataFrame(temp)

# Define relevant columns
cols = ['LEI', 'LegalName', 'City', 'Country', 'EntityStatus']

# Remove inactive entries? -> 4% are inactive

# to_csv does not create missing parent directories
os.makedirs("temp", exist_ok=True)

# Save as tsv. The file name is lower-case so that it matches the path the
# upload step reads ("temp/lei_info.tsv") on case-sensitive file systems too.
df[cols].to_csv("temp/lei_info.tsv", sep = "\t")



In [6]:
# Upload the locally written LEI information table to the bucket.
filename_temp = "temp/lei_info.tsv"              # local file to send
filename_upload = "config/mapping/lei_info.tsv"  # name under which it is stored

# NOTE(review): argument order (local file, bucket, key) matches boto3's
# client.upload_file; presumably s3.client is a boto3 client - confirm in Tools.
s3.client.upload_file(filename_temp, AWS_BUCKET, filename_upload)
