# Outside the SDMX garden, looking at LEI and GLEIF

<img src="images/diagram_data.png" width="800" height="500"/>
<img src="images/use_case_diagram_start.png" width="800" height="500"/>



## Data cleaning and set up using pandas and pysdmx

<img src="images/diagram_first_step.png" width="800" height="500"/>


(Check the full notebook for the code to download the data)

In [None]:
# 2. Load the downloaded GLEIF golden copy with pandas
import pandas as pd

file_name = '20250125-1600-gleif-goldencopy-lei2-golden-copy.csv'

# Only the first 10000 rows are loaded as a sample, and every column is read
# as text (dtype=str).
data = pd.read_csv(
    f'data_files/{file_name}',
    dtype=str,
    nrows=10000,
)
display(data)

In [None]:
# 3. Keep only the columns used in the DSD and give them their new names.
# Keys are the column names found in the CSV, values are the names used from
# this cell onwards.
RENAME_DICT = {
    "LEI": "LEI",
    "Entity.LegalName": "LEGAL_NAME",
    "Entity.LegalAddress.Country": "COUNTRY_INCORPORATION",
    "Entity.HeadquartersAddress.Country": "COUNTRY_HEADQUARTERS",
    "Entity.EntityCategory": "CATEGORY",
    "Entity.EntitySubCategory": "SUBCATEGORY",
    "Entity.LegalForm.EntityLegalFormCode": "LEGAL_FORM",
    "Entity.EntityStatus": "STATUS",
    "Entity.LegalAddress.PostalCode": "POSTAL_CODE",
}

kept_columns = list(RENAME_DICT.values())
data = data.rename(columns=RENAME_DICT)[kept_columns]

# 4. Keep only the ACTIVE entities; STATUS is dropped once it has been used
# for the filter.
is_active = data['STATUS'] == 'ACTIVE'
data = data[is_active].drop(columns='STATUS').reset_index(drop=True)
display(data)

# Retrieving the Schema from FMR and generating the Dataset
<img src="images/diagram_second_step.png" width="800" height="500"/>


In [None]:
from pysdmx.api.fmr import RegistryClient
from pysdmx.io.format import StructureFormat
from pysdmx.io.pd import PandasDataset

# Registry client pointed at the FMR endpoint, requesting Fusion JSON
client = RegistryClient(
    "https://fmr.meaningfuldata.eu/sdmx/v2",
    format=StructureFormat.FUSION_JSON,
)

# Fetch the schema for data structure MD:LEI_DATA(1.0).
# Tip: step through this call with a debugger to inspect the response.
schema = client.get_schema(
    "datastructure",
    agency="MD",
    id="LEI_DATA",
    version="1.0",
)

# Pair the cleaned DataFrame with the retrieved schema
dataset = PandasDataset(data=data, structure=schema)

# Last expression of the cell: shown as the cell output
schema.short_urn

# Structural validation using FMR

In [None]:
# Structural validation of the dataset on FMR
from utils import validate_data_fmr
from pysdmx.io.csv.sdmx20.writer import write

# Step 1: serialise the dataset as SDMX-CSV 2.0 text
csv_text = write([dataset])

# Step 2: send the serialised text to the FMR instance over HTTPS
fmr_target = dict(host="fmr.meaningfuldata.eu", port=443, use_https=True)
result = validate_data_fmr(csv_text, **fmr_target)

# Last expression of the cell: shown as the cell output
result

# Using VTL to validate the data with GLEIF data quality checks

<img src="images/diagram_third_step.png" width="800" height="500"/>


## Validate the data using VTL

(See the validations VTL script, `vtl/validations.vtl`)

Running the VTL script

In [None]:
from utils import _load_script, run_vtl

# Load the validations script, then run it against the dataset built above
script = _load_script("vtl/validations.vtl")
validations_result = run_vtl(
    dataset=dataset,
    script=script,
)

Getting the total number of errors

In [None]:
# Pick the 'errors_count' output of the validation run and show its data
errors_count = validations_result['errors_count']
errors_count.data

Analysing data on Postal Code errors

In [None]:
# Show the rows reported by the postal-code validation, restricted to the
# columns selected for the analysis
postal_codes_errors = validations_result['validation.postal_codes_errors'].data
cols_to_analyse = [
    'LEI',
    'COUNTRY_INCORPORATION',
    'POSTAL_CODE',
    'errorcode',
    'errorlevel',
]
display(postal_codes_errors[cols_to_analyse])

Filtering the data to remove the errors

In [None]:
# LEIs reported by the postal-code validation
leis_to_delete = validations_result['validation.postal_codes_errors'].data['LEI']

# Keep every row of the dataset whose LEI was not reported
has_error = dataset.data['LEI'].isin(leis_to_delete)
valid_data = dataset.data[~has_error].reset_index(drop=True)

# Report the rows actually dropped rather than the number of error rows: the
# two differ if an LEI is reported more than once or is not in the dataset.
removed_count = len(dataset.data) - len(valid_data)
print(f"Number of LEIs removed: {removed_count}")
print(f"Number of LEIs remaining: {len(valid_data)}")

## Using VTL to perform calculations

<img src="images/diagram_fourth_step.png" width="800" height="500"/>


(See calculations.vtl)

Running the VTL script on all data

In [None]:
script = _load_script("vtl/calculations.vtl")

# Read the whole file (no row limit), every column as text, and pair it with
# the schema retrieved earlier
all_data = pd.read_csv('data_files/golden_copy_changed.csv', dtype=str)
all_dataset = PandasDataset(data=all_data, structure=schema)

# Run the calculations and show the 'lei_statistics' output
calculations_result = run_vtl(dataset=all_dataset, script=script)
display(calculations_result['lei_statistics'].data)

# Diagram on aggregated statistics

In [None]:
import matplotlib.pyplot as plt

# Read the statistics straight from the VTL calculation output. At this point
# `data` still holds the cleaned LEI sample (no TYPE_MEASURE column); it is
# only rebound to the statistics in a later cell, so using it here fails when
# the notebook is run top to bottom on a fresh kernel.
lei_statistics = calculations_result['lei_statistics'].data

# Keep one measure only and make the observation values numeric for sorting
df = lei_statistics[lei_statistics['TYPE_MEASURE'] == 'NUMBER_ENTITIES_DIFF_HQ']
df = df.astype({'OBS_VALUE': int})

# Draw on an explicit Axes: pandas opens its own figure when no `ax` is given,
# which would leave a bare plt.figure() behind as an empty extra figure.
fig, ax = plt.subplots()
df.nlargest(columns='OBS_VALUE', n=20).plot(
    x='COUNTRY', y='OBS_VALUE', kind='bar', ax=ax
)
ax.set(
    title='LEIs with the HQ in a different country than the incorporation (top 20)',
    ylabel='Number of entities',
)
plt.show()

The countries with the most LEIs whose HQ is in a different country
than the country of incorporation are the Cayman Islands, the Virgin Islands and Luxembourg.

# Generate PandasDataset with the aggregated data

In [None]:
from pysdmx.io import read_sdmx
from pysdmx.io.pd import PandasDataset

# Read the SDMX-ML 2.1 structure message for the aggregated statistics
structure_url = "https://fmr.meaningfuldata.eu/sdmx/v2/structure/datastructure/MD/LEI_AGGREGATE_STATISTICS/+/?format=sdmx-2.1&references=descendants&prettyPrint=true"
msg = read_sdmx(structure_url)

# Pull DataStructureDefinition MD:LEI_AGGREGATE_STATISTICS(1.0) out of the
# message and turn it into a schema
dsd = msg.get_data_structure_definition(
    "DataStructure=MD:LEI_AGGREGATE_STATISTICS(1.0)"
)
schema_aggregated = dsd.to_schema()

# Build the dataset from the 'lei_statistics' output of the calculations
data = calculations_result['lei_statistics'].data
pd_dataset = PandasDataset(data=data, structure=schema_aggregated)
display(pd_dataset.data)

# Write the SDMX-ML 2.1 file

<img src="images/diagram_fifth_step.png" width="800" height="500"/>


In [None]:
from pysdmx.io.xml.sdmx21.writer.structure_specific import write

# Serialise the aggregated dataset with the structure-specific SDMX-ML 2.1
# writer, pretty-printed, and show the resulting text
datasets_to_write = [pd_dataset]
xml_str = write(datasets_to_write, prettyprint=True)
print(xml_str)