# Preprocess

This notebook handles pre-processing all the data to make sure it is consistent and there are no gaps in it.

Note: This notebook assumes that you have loaded data within the `data/portfolios.json` and `data/pricing` by running the `init.py`.


In [17]:
# Lib imports.
import pandas as pd
import os
import numpy as np

# Typing
from typing import Optional, Tuple

In [18]:
# Constants
DATA_PATH = "data/pricing"
FUNDS_PATH = "data/portfolios.json"

# Columns.
COLUMN_DATE = "asOfDate"
COLUMN_PRICE = "price"

## Definition

This section defines all the functions we need for later.


In [19]:
def load_data(path: str) -> Optional[pd.DataFrame]:
    """Loads the data as per the specification of the pricing CSVs.

    Args:
        path (str): to the current file to load.

    Returns:
        Optional[pd.DataFrame]: with the pricing data sorted by date, ascending. None if there is no data.
    """
    # Make sure it is a CSV and not something else.
    if ".csv" not in path:
        return None

    # Loads the data.
    df = pd.read_csv(
        path,
        header=0,
        index_col=COLUMN_DATE,
        parse_dates=[COLUMN_DATE],
        dtype={
            "price": np.float64,
            "currencyCode": np.string_,
            "__typename": np.string_,
        },
    )

    # Sort by the index that we defined as date.
    return df.sort_index(ascending=True, inplace=False)

In [20]:
def fill_gaps(df: pd.DataFrame) -> pd.DataFrame:
    """Fills the gaps in the pricing data by backfilling from the previous existing dates.

    Args:
        df (pd.DataFrame): with potential gaps in the date index.

    Returns:
        pd.DataFrame: with no gaps in the index.
    """
    # Get all the possible dates.
    date_min, date_max = (df.index.min(), df.index.max())

    # We reindex using the min and max dates. This will propagate the values.
    return df.reindex(pd.date_range(start=date_min, end=date_max), method="ffill")

## Testing

This section tests the functions defined above.

In [21]:
def evaluate_index_size(df: pd.DataFrame) -> Tuple[int, int]:
    # We can expect the number of rows to be less than a range of dates.
    full_range_len = len(pd.date_range(start=df.index.min(), end=df.index.max()))
    index_len = df.index.shape[0]

    print(f"Full Size vs Index Size = {full_range_len} vs {index_len}")
    return full_range_len, index_len

In [22]:
EXAMPLE_DATA = "data/pricing/8617.csv"

df = load_data(EXAMPLE_DATA)

# We can verify that there a gap since the gaps occur in weekends where the market is closed.
print(df)

# Make sure we assert it.
full_size, index_size = evaluate_index_size(df)
assert full_size > index_size

               price currencyCode __typename
asOfDate                                    
2016-11-08  100.0000          GBP  FundPrice
2016-11-09   99.7742          GBP  FundPrice
2016-11-10   99.7799          GBP  FundPrice
2016-11-11   98.5293          GBP  FundPrice
2016-11-14   99.3307          GBP  FundPrice
...              ...          ...        ...
2024-10-07  218.3068          GBP  FundPrice
2024-10-08  218.1539          GBP  FundPrice
2024-10-09  219.3834          GBP  FundPrice
2024-10-10  219.6460          GBP  FundPrice
2024-10-11  220.5142          GBP  FundPrice

[2002 rows x 3 columns]
Full Size vs Index Size = 2895 vs 2002


In [23]:
df = fill_gaps(df)

# We can verify that now there shouldn't be any gaps.
print(df)

# Make sure we assert it.
full_size, index_size = evaluate_index_size(df)
assert full_size == index_size

               price currencyCode __typename
2016-11-08  100.0000          GBP  FundPrice
2016-11-09   99.7742          GBP  FundPrice
2016-11-10   99.7799          GBP  FundPrice
2016-11-11   98.5293          GBP  FundPrice
2016-11-12   98.5293          GBP  FundPrice
...              ...          ...        ...
2024-10-07  218.3068          GBP  FundPrice
2024-10-08  218.1539          GBP  FundPrice
2024-10-09  219.3834          GBP  FundPrice
2024-10-10  219.6460          GBP  FundPrice
2024-10-11  220.5142          GBP  FundPrice

[2895 rows x 3 columns]
Full Size vs Index Size = 2895 vs 2895


## Propagation

Now we perform the propagation against ALL the data in our pricing.

In [27]:
files = [
    os.path.join(DATA_PATH, file)
    for file in os.listdir(DATA_PATH)
    if os.path.isfile(os.path.join(DATA_PATH, file)) and ".csv" in file
]

print(files)

['data/pricing/E059.csv', 'data/pricing/9894.csv', 'data/pricing/9670.csv', 'data/pricing/9664.csv', 'data/pricing/8618.csv', 'data/pricing/9506.csv', 'data/pricing/9507.csv', 'data/pricing/8619.csv', 'data/pricing/9665.csv', 'data/pricing/9671.csv', 'data/pricing/9659.csv', 'data/pricing/9317.csv', 'data/pricing/9471.csv', 'data/pricing/9129.csv', 'data/pricing/9667.csv', 'data/pricing/9673.csv', 'data/pricing/9505.csv', 'data/pricing/9504.csv', 'data/pricing/9672.csv', 'data/pricing/9666.csv', 'data/pricing/9470.csv', 'data/pricing/9662.csv', 'data/pricing/9676.csv', 'data/pricing/9501.csv', 'data/pricing/9677.csv', 'data/pricing/9688.csv', 'data/pricing/9461.csv', 'data/pricing/9477.csv', 'data/pricing/E048.csv', 'data/pricing/9675.csv', 'data/pricing/9107.csv', 'data/pricing/9661.csv', 'data/pricing/9503.csv', 'data/pricing/8620.csv', 'data/pricing/9502.csv', 'data/pricing/9106.csv', 'data/pricing/9660.csv', 'data/pricing/9674.csv', 'data/pricing/9476.csv', 'data/pricing/E006.csv',

In [None]:
# Do the processing against all the files.
for file in files:
    # Make sure to load the data.
    df = load_data(file)
    if df is None:
        continue

    # Fill the gaps.
    df = fill_gaps(df)

    # Write it back.
    df.to_csv(file, index_label=COLUMN_DATE)