In [45]:
import pandas as pd
import sqlite3
from datetime import datetime
import os

In [46]:
# First, let's see where Python is looking
print("Current working directory:")
print(os.getcwd())

os.chdir(os.path.expanduser("~"))
os.chdir("Documents/2_PP/WM/portfolio-optimization/data/raw/")

# Let's see what files are in your current folder
print("\nFiles in current directory:")
print(os.listdir("."))

Current working directory:
/Users/nachogutierrezdelaroza/Documents/2_PP/WM/portfolio-optimization/data/raw

Files in current directory:
['.DS_Store', 'risk_free', 'bonds', 'cash', 'benchmarks', 'alternatives', 'equities']


In [47]:
file_path = "equities/min_vol/euro_min_vol_etf.xlsx"
df = pd.read_excel(file_path)

print("Raw data:")
print(df.head())

print("\nData types:")
print(df.dtypes)

print("\nMissing values:")
print(df.isnull().sum())

print("\nSummary statistics:")

Raw data:
        Date  Last Price                                               Name
0 2025-06-04        6.82  iNAV iShares Edge MSCI Europe Minimum Volatili...
1 2025-06-03        6.79                                                NaN
2 2025-06-02        6.83                                                NaN
3 2025-05-30        6.82                                                NaN
4 2025-05-29        6.80                                                NaN

Data types:
Date          datetime64[ns]
Last Price           float64
Name                  object
dtype: object

Missing values:
Date             0
Last Price       0
Name          1294
dtype: int64

Summary statistics:


In [48]:
# Finding excel files
def find_all_excel_files(root_folder):
    """
    Find all Excel files in all subfolders
    """
    excel_files = [] # Creates an empty list, at the start, which will be filled with the paths of the excel files found in the subfolders
    
    # Walk through all folders and subfolders
    for root, dirs, files in os.walk(root_folder):
        # Skip if path contains risk_free
        if "risk_free" in root:
            
            continue

        for file in files:
            if file.endswith(('.xlsx', '.xls')):
                full_path = os.path.join(root, file)
                excel_files.append(full_path)
    
    return excel_files

# Find all your Excel files
root_folder = "."  # Adjust this path to your data folder
excel_files = find_all_excel_files(root_folder)

print(f"Found {len(excel_files)} Excel files:")
for file in excel_files:
    print(f"  📄 {file}")


Found 32 Excel files:
  📄 ./bonds/em_bond_etf.xlsx
  📄 ./bonds/jp_ult_sht_cor_bond_etf.xlsx
  📄 ./bonds/ish_wld_cor_bond_etf.xlsx
  📄 ./bonds/ubs_bbs_tips_bonds_etf.xlsx
  📄 ./bonds/ish_glob_bond_etf.xlsx
  📄 ./benchmarks/ftse_balance_prices.xlsx
  📄 ./alternatives/gold_etf.xlsx
  📄 ./alternatives/vici_reits_aim_etf.xlsx
  📄 ./alternatives/aim_data_centr_etf.xlsx
  📄 ./equities/etf_comparison_results.xlsx
  📄 ./equities/uk10y_gilt.xlsx
  📄 ./equities/min_vol/world_min_vol_etf.xlsx
  📄 ./equities/min_vol/euro_min_vol_etf.xlsx
  📄 ./equities/min_vol/snp_min_vol_etf.xlsx
  📄 ./equities/min_vol/em_min_vol_etf.xlsx
  📄 ./equities/us/ish_na_eq_etf.xlsx
  📄 ./equities/us/spdr_world_tech.xlsx
  📄 ./equities/us/amu_rai_us_etf.xlsx
  📄 ./equities/em/hsbc_china_em_etf.xlsx
  📄 ./equities/em/artemis_em_etf.xlsx
  📄 ./equities/em/hsbc_frontier_etf.xlsx
  📄 ./equities/other/bnp_aqua_etf.xlsx
  📄 ./equities/other/gam_sus_em_etf.xlsx
  📄 ./equities/other/acs_esg_etf.xlsx
  📄 ./equities/other/bgf_sus_e

In [49]:
def create_master_dataframe(excel_files):
    """
    Create master DataFrame of all securities
    """
    master_df = None
    
    for file_path in excel_files:
        try:
            print(f"Processing: {file_path}")
            
            # Read the Excel file
            df = pd.read_excel(file_path)
            
            # Get the single security name | dropna() makes the name the only value in the column
            security_name = df['Name'].dropna().iloc[0]

            print(f"Found security: {security_name}")

            # Prepare the dataframe for the master dataframe
            clean_df = df[['Date', 'Last Price']].copy()
            clean_df = clean_df.rename(columns={'Last Price': security_name})
            clean_df = clean_df.dropna()

            # Covert date column to datetime
            clean_df['Date'] = pd.to_datetime(clean_df['Date'], dayfirst=True)

            # First file: create master dataframe
            if master_df is None:
                master_df = clean_df
                print(f"Created master with Date column and: {security_name}")
            else:
                # Subsequent files: drop the Date column and add the new security
                price_series = clean_df.set_index('Date')[security_name]
                price_series.name = security_name

                # Add to master dataframe
                master_df = master_df.set_index('Date')
                master_df[security_name] = price_series
                master_df = master_df.reset_index()

                print(f"Added {security_name} to master")
    
        except Exception as e:
            print(f"Error with {file_path}: {e}")

    # Sort by date and reset index 
    master_df = master_df.sort_values(by='Date').reset_index(drop=True)
    
    return master_df

# Create your master dataframe
master_df = create_master_dataframe(excel_files)

print(f"\n🎉 Master DataFrame created!")
print(f"Shape: {master_df.shape[0]} rows, {master_df.shape[1]} columns")
print(f"Columns: {list(master_df.columns)}")
print("\nFirst few rows:")
print(master_df.head())


Processing: ./bonds/em_bond_etf.xlsx
Found security: iShares J.P. Morgan USD EM Bond UCITS ETF (GBP hedged)
Created master with Date column and: iShares J.P. Morgan USD EM Bond UCITS ETF (GBP hedged)
Processing: ./bonds/jp_ult_sht_cor_bond_etf.xlsx
Found security: JPMorgan GBP Ultra-Short Income UCITS ETF GBP (Acc)
Added JPMorgan GBP Ultra-Short Income UCITS ETF GBP (Acc) to master
Processing: ./bonds/ish_wld_cor_bond_etf.xlsx
Found security: iShares Global Corp Bond UCITS ETF GBP Hedged
Added iShares Global Corp Bond UCITS ETF GBP Hedged to master
Processing: ./bonds/ubs_bbs_tips_bonds_etf.xlsx
Found security: BBG TIPS 1-10 UCITS ETF
Added BBG TIPS 1-10 UCITS ETF to master
Processing: ./bonds/ish_glob_bond_etf.xlsx
Found security: iShares Core Global Aggregate Bond UCITS ETF
Added iShares Core Global Aggregate Bond UCITS ETF to master
Processing: ./benchmarks/ftse_balance_prices.xlsx
Error with ./benchmarks/ftse_balance_prices.xlsx: 'Name'
Processing: ./alternatives/gold_etf.xlsx
Foun

In [50]:
# Save to the processed folder
output_path = os.path.join("..", "processed", "master_securities_database.xlsx")
master_df.to_excel(output_path, index=False)

print(f"✅ Master DataFrame saved to: {output_path}")
print(f"Shape: {master_df.shape[0]} rows, {master_df.shape[1]} columns")
print(f"File size: {master_df.memory_usage(deep=True).sum() / 1024:.1f} KB")


✅ Master DataFrame saved to: ../processed/master_securities_database.xlsx
Shape: 1276 rows, 31 columns
File size: 309.2 KB
