In [21]:
import os

# 1) Folders you _do_ want scanned:
INCLUDE_DIRS = [
    r"../portfolio-optimization/data/raw",
]

# 2) Any sub-folder names to completely ignore:
EXCLUDE_DIR_NAMES = {
    "benchmarks",
    "cash",
    "risk_free",
}

def find_excel_files(include_dirs, exclude_dir_names):
    """
    Walk each folder in include_dirs, but skip any subdirectory whose
    name appears in exclude_dir_names. Return a list of full paths
    to .xls/.xlsx files.
    """
    excel_paths = []
    for base in include_dirs:
        for root, dirs, files in os.walk(base):
            # In-place filter of `dirs` tells os.walk not to recurse into them:
            dirs[:] = [d for d in dirs if d not in exclude_dir_names]

            for fname in files:
                if fname.lower().endswith((".xls", ".xlsx")):
                    excel_paths.append(os.path.join(root, fname))
    return excel_paths

if __name__ == "__main__":
    all_excels = find_excel_files(INCLUDE_DIRS, EXCLUDE_DIR_NAMES)
    print(f"Found {len(all_excels)} Excel files:")
    for path in all_excels:
        print(" ", path)

Found 25 Excel files:
  ../portfolio-optimization/data/raw/bonds/em_bond_etf.xlsx
  ../portfolio-optimization/data/raw/bonds/jp_ult_sht_cor_bond_etf.xlsx
  ../portfolio-optimization/data/raw/bonds/ish_wld_cor_bond_etf.xlsx
  ../portfolio-optimization/data/raw/bonds/ubs_bbs_tips_bonds_etf.xlsx
  ../portfolio-optimization/data/raw/bonds/ish_glob_bond_etf.xlsx
  ../portfolio-optimization/data/raw/alternatives/gold_etf.xlsx
  ../portfolio-optimization/data/raw/alternatives/vici_reits_aim_etf.xlsx
  ../portfolio-optimization/data/raw/alternatives/aim_data_centr_etf.xlsx
  ../portfolio-optimization/data/raw/equities/min_vol/world_min_vol_etf.xlsx
  ../portfolio-optimization/data/raw/equities/min_vol/euro_min_vol_etf.xlsx
  ../portfolio-optimization/data/raw/equities/min_vol/snp_min_vol_etf.xlsx
  ../portfolio-optimization/data/raw/equities/min_vol/em_min_vol_etf.xlsx
  ../portfolio-optimization/data/raw/equities/us/spdr_world_tech.xlsx
  ../portfolio-optimization/data/raw/equities/us/amu_rai

In [22]:
import pandas as pd

def extract_tickers_from_files(excel_paths):
    """
    Given a list of Excel file paths, read each one and extract the values
    from its 'Ticker' column (if present). Return a dict mapping file→tickers.
    """
    file_to_tickers = {}

    for path in excel_paths:
        try:
            # Only read the Ticker column
            df = pd.read_excel(path, usecols=lambda c: c.strip().lower() == 'ticker', dtype=str)
        except ValueError:
            # raised if 'Ticker' column isn't found
            print(f"⚠️  No 'Ticker' column in {path!r}; skipping.")
            continue
        except Exception as e:
            print(f"⚠️  Could not read {path!r}: {e}")
            continue

        # Normalize, drop NaNs, dedupe
        tickers = df.iloc[:,0].dropna().str.strip().unique().tolist()
        if tickers:
            file_to_tickers[path] = tickers
        else:
            print(f"ℹ️  'Ticker' column in {path!r} is empty; skipping.")

    return file_to_tickers

if __name__ == "__main__":
    file_tickers = extract_tickers_from_files(all_excels)
    for file, ticks in file_tickers.items():
        print(f"{file}:")
        for t in ticks:
            print("   ", t)

../portfolio-optimization/data/raw/bonds/em_bond_etf.xlsx:
    EMHG
../portfolio-optimization/data/raw/bonds/jp_ult_sht_cor_bond_etf.xlsx:
    JGSA
../portfolio-optimization/data/raw/bonds/ish_wld_cor_bond_etf.xlsx:
    CRHG LN
../portfolio-optimization/data/raw/bonds/ubs_bbs_tips_bonds_etf.xlsx:
    UBTP
../portfolio-optimization/data/raw/bonds/ish_glob_bond_etf.xlsx:
    AGGG LN
../portfolio-optimization/data/raw/alternatives/gold_etf.xlsx:
    SGLN
../portfolio-optimization/data/raw/alternatives/vici_reits_aim_etf.xlsx:
    VICI
../portfolio-optimization/data/raw/alternatives/aim_data_centr_etf.xlsx:
    DLR
../portfolio-optimization/data/raw/equities/min_vol/world_min_vol_etf.xlsx:
    WMVG LN
../portfolio-optimization/data/raw/equities/min_vol/euro_min_vol_etf.xlsx:
    MVEU LN Equity
../portfolio-optimization/data/raw/equities/min_vol/snp_min_vol_etf.xlsx:
    SPMV LN
../portfolio-optimization/data/raw/equities/min_vol/em_min_vol_etf.xlsx:
    EMMV LN
../portfolio-optimization/da

In [26]:
import os
import pandas as pd

INCLUDE_DIRS = [
    r"../portfolio-optimization/data/raw",
]

# 2) Any sub-folder names to completely ignore:
EXCLUDE_DIR_NAMES = {
    "benchmarks",
    "cash",
    "risk_free",
}

def find_excel_files(include_dirs, exclude_dir_names):
    excel_paths = []
    for base in include_dirs:
        for root, dirs, files in os.walk(base):
            dirs[:] = [d for d in dirs if d not in exclude_dir_names]
            for fname in files:
                if fname.lower().endswith((".xls", ".xlsx")):
                    excel_paths.append(os.path.join(root, fname))
    return excel_paths

def build_master_df(excel_paths):
    records = []
    for path in excel_paths:
        try:
            df = pd.read_excel(path, dtype=str)
        except Exception as e:
            print(f"⚠️ Could not read {path}: {e}")
            continue

        # map lower‐cased header to actual
        cols = {c.strip().lower(): c for c in df.columns}
        if 'ticker' not in cols or 'description' not in cols:
            # skip files without both
            continue

        sub = df[[cols['ticker'], cols['description']]].copy()
        sub.columns = ['Ticker','Description']
        records.append(sub)

    if not records:
        return pd.DataFrame(columns=['Ticker','Description'])

    master = pd.concat(records, ignore_index=True)
    # keep last occurrence of each ticker
    master = master.drop_duplicates(subset='Ticker', keep='last')
    master['Ticker'] = master['Ticker'].str.strip().str.upper()
    return master.set_index('Ticker')

# Build everything once:
all_excels = find_excel_files(INCLUDE_DIRS, EXCLUDE_DIR_NAMES)
master_df = build_master_df(all_excels)

def get_descriptions(ticker_list):
    """
    ticker_list: list of strings
    returns a DataFrame with columns ['Ticker','Description'], preserving order,
    and using 'Not found' for missing tickers.
    """
    # normalize input
    clean = [t.strip().upper() for t in ticker_list]
    descs = []
    for t in clean:
        if t in master_df.index:
            descs.append(master_df.at[t, 'Description'])
        else:
            descs.append("<<Not found>>")
    return pd.DataFrame({'Ticker': clean, 'Description': descs})

# Example usage:
if __name__ == "__main__":
    tickers_to_lookup = [
    "EMHG",
    "JGSA",
    "UBTP",
    "SGLN",
    "WMVG LN",
    "TECW LN",
    "GOAIGBIV",
    "ARTGEIG LN",
    "PARAQPG LX",
    "CUKX LN",
    "VGSESIE ID",
]
    table = get_descriptions(tickers_to_lookup)
    print(table.to_markdown(index=False))

| Ticker     | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
|:-----------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| EMHG       | ETF that aims to trac

In [27]:
import os
import pandas as pd

INCLUDE_DIRS = [
    r"../portfolio-optimization/data/raw",
]

EXCLUDE_DIR_NAMES = {
    "benchmarks",
    "cash",
    "risk_free",
}

def find_excel_files(include_dirs, exclude_dir_names):
    excel_paths = []
    for base in include_dirs:
        for root, dirs, files in os.walk(base):
            dirs[:] = [d for d in dirs if d not in exclude_dir_names]
            for fname in files:
                if fname.lower().endswith((".xls", ".xlsx")):
                    excel_paths.append(os.path.join(root, fname))
    return excel_paths

def build_master_df(excel_paths):
    """
    Scans each file for 'Ticker', 'Name' and 'Description' columns
    (case-insensitive), and builds one master DataFrame indexed by Ticker.
    """
    records = []
    for path in excel_paths:
        try:
            df = pd.read_excel(path, dtype=str)
        except Exception as e:
            print(f"⚠️ Could not read {path}: {e}")
            continue

        cols = {c.strip().lower(): c for c in df.columns}
        # require all three columns
        if not {'ticker', 'name', 'description'}.issubset(cols):
            continue

        sub = df[[cols['ticker'], cols['name'], cols['description']]].copy()
        sub.columns = ['Ticker', 'Name', 'Description']
        records.append(sub)

    if not records:
        return pd.DataFrame(columns=['Ticker','Name','Description']).set_index('Ticker')

    master = pd.concat(records, ignore_index=True)
    master = (
        master
        .drop_duplicates(subset='Ticker', keep='last')
        .assign(Ticker=lambda d: d['Ticker'].str.strip().str.upper())
        .set_index('Ticker')
    )
    return master

# Build once at import/run
all_excels = find_excel_files(INCLUDE_DIRS, EXCLUDE_DIR_NAMES)
master_df = build_master_df(all_excels)

def get_descriptions_and_names(ticker_list):
    """
    ticker_list: list of strings
    Returns a DataFrame with columns ['Ticker','Name','Description'],
    in the same order as the input list, filling '<<Not found>>'
    for any missing entries.
    """
    clean = [t.strip().upper() for t in ticker_list]
    rows = []
    for t in clean:
        if t in master_df.index:
            name = master_df.at[t, 'Name']
            desc = master_df.at[t, 'Description']
        else:
            name = "<<Not found>>"
            desc = "<<Not found>>"
        rows.append({'Ticker': t, 'Name': name, 'Description': desc})
    return pd.DataFrame(rows)

if __name__ == "__main__":
    tickers_to_lookup = [
        "EMHG",
        "JGSA",
        "UBTP",
        "SGLN",
        "WMVG LN",
        "TECW LN",
        "GOAIGBIV",
        "ARTGEIG LN",
        "PARAQPG LX",
        "CUKX LN",
        "VGSESIE ID",
    ]
    table = get_descriptions_and_names(tickers_to_lookup)
    print(table.to_markdown(index=False))

| Ticker     | Name                                                          | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
|:-----------|:--------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [28]:
import textwrap
from tabulate import tabulate

# Assume `table` is your DataFrame from get_descriptions_and_names(...)
# Wrap long text in each cell so it doesn’t run off the page
def wrap_col(col, width=40):
    return col.apply(lambda cell: "\n".join(textwrap.wrap(str(cell), width)))

wrapped = table.copy()
wrapped['Name']        = wrap_col(wrapped['Name'], width=30)
wrapped['Description'] = wrap_col(wrapped['Description'], width=50)

# Print a nice ASCII‐grid
print(tabulate(
    wrapped,
    headers=["Ticker","Full Fund Name","Rationale"],
    tablefmt="fancy_grid",
    showindex=False
))

╒════════════╤═══════════════════════════════╤════════════════════════════════════════════════════╕
│ Ticker     │ Full Fund Name                │ Rationale                                          │
╞════════════╪═══════════════════════════════╪════════════════════════════════════════════════════╡
│ EMHG       │ iShares J.P. Morgan USD EM    │ ETF that aims to track the performance of the J.P. │
│            │ Bond UCITS ETF (GBP hedged)   │ Morgan Emerging Bonds iNdex Global Core Index.     │
├────────────┼───────────────────────────────┼────────────────────────────────────────────────────┤
│ JGSA       │ JPMorgan GBP Ultra-Short      │ The JPMorgan GBP Ultra-Short Income UCITS ETF GBP  │
│            │ Income UCITS ETF GBP (Acc)    │ (Acc) is an actively managed ETF. The JP Morgan    │
│            │                               │ GBP Ultra Short Income Strategy invests primarily  │
│            │                               │ in investment grade, pound denominated, short term │


In [29]:
table.to_excel("fund_lookup.xlsx", index=False)
print("✅ Written fund_lookup.xlsx — open it in Excel.")

✅ Written fund_lookup.xlsx — open it in Excel.
