In [19]:
from src.base_data import ClientData, SPLIT_DATA_SPLIT1_DIR, load_split_data

In [20]:
# Load the pre-split train / validation / test client records from the split-1 directory.
train_data, val_data, test_data = load_split_data(SPLIT_DATA_SPLIT1_DIR)

Loaded 8000 training paths, 1000 validation paths, and 1000 test paths.


In [21]:
def _string_fields(record):
    """Yield (key, text) for every string attribute of `record`, and for every
    string value found one level down inside its dict attributes."""
    # the record is a dataclass: vars() exposes its attributes as a dict
    for attr_name, attr_value in vars(record).items():
        if isinstance(attr_value, dict):
            for inner_key, inner_value in attr_value.items():
                if isinstance(inner_value, str):
                    yield inner_key, inner_value
        elif isinstance(attr_value, str):
            yield attr_name, attr_value


# Track the longest string field seen across the whole training split.
longest_field_key, longest_field = '', ''
for cd in train_data:
    for field_key, field_text in _string_fields(cd):
        # strict comparison: the first field that reaches the maximum length is kept
        if len(field_text) > len(longest_field):
            longest_field_key, longest_field = field_key, field_text
longest_field_len = len(longest_field)

print(f'Longest field: {longest_field}\nWith length {longest_field_len}\nKey: {longest_field_key}')

Longest field: Having worked for over 22 years, Krol Van den Berg Jansen brings a wealth of experience and expertise to his current role. He is known for his strategic thinking and ability to drive results.
Krol Van den Berg Jansen's first role was as a Property Manager at CBRE Global Investors Nederland B.V., where he worked diligently from 2003 to 2008 to establish a strong reputation.
After gaining experience, he took on the role of Developer at Bouwinvest REIM B.V. in 2008, further expanding his skill set.
In 2014, he joined CBRE Global Investors Nederland B.V. as a Construction Company Owner, taking on new challenges and responsibilities that helped him grow both personally and professionally.
He appointment as a Real Estate Investment Trust (REIT) CEO at Unibail-Rodamco-Westfield Nederland B.V. in 2020 marked a significant milestone in his career, as he took on increasingly complex challenges and delivered exceptional results.
He has been a Landlord at Unibail-Rodamco-Westfield N

In [22]:
# Whitespace-delimited word count of the longest field (a rough size reference next to its token count).
len(longest_field.split())

172

In [45]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Only the tokenizer is loaded here; the model load below is left disabled.
tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
# model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")

# Encode the longest raw field as PyTorch tensors and inspect what the tokenizer returns.
ret = tokenizer(longest_field, return_tensors='pt', padding=True)
type(ret)

transformers.tokenization_utils_base.BatchEncoding

In [46]:
# Shape of the encoded ids: (1, number of tokens) for the single string encoded into `ret`.
ret['input_ids'].shape

torch.Size([1, 252])

In [25]:
# Take the first training record and display its free-text description sections (a dict keyed by section name).
ex = train_data[0]
ex.client_description

{'Summary Note': "Margaux Juliette Dubois and the RM met at a networking event in Paris. They quickly realized they had complementary skills and decided to collaborate on several projects.\nWith a career spanning nearly 37 years, Margaux Juliette Dubois has established herself as a key player in the industry.\nAttracted by Julius Baer's reputation for excellence, she chose to join for the bespoke financial solutions offered.\n",
 'Family Background': 'Margaux Juliette Dubois and Renault tied the knot in 1996. Together they have 3 kids called Capucine, Laurène and Lambert.',
 'Education Background': 'In 1982, Margaux graduated from Lycée Victor Hugo with a secondary school diploma.\nShe then attended University of Montpellier, which she graduated in 1986.\nMargaux earned her additionally degree from IUT de Strasbourg in 1992.\n',
 'Occupation History': "Margaux Juliette Dubois is a 61 year old and comes from France.\nMargaux Juliette Dubois's first role was as a Aerospace Engineer at Sa

In [26]:
def get_education(client_data: ClientData) -> str:
    """Flatten a client's education data into one sentence-style string.

    The structured profile entries (secondary school, then every
    higher-education entry in order) are rendered first, followed by the
    free-text 'Education Background' description, so the two sources sit side
    by side in the same text.

    Args:
        client_data: record exposing ``client_profile`` and
            ``client_description`` mappings.

    Returns:
        ``"Secondary education at ... Higher education at ... Description says: ..."``.
        Missing names/years are rendered as ``'N/A'``. Parts are separated by
        exactly one space, also when there is no higher-education entry.
    """
    profile = client_data.client_profile
    description = client_data.client_description

    # `or` fallbacks: an explicit None is treated like a missing key.
    secondary_school = profile.get('secondary_school') or {}
    school_name = secondary_school.get('name', 'N/A')
    grad_year = secondary_school.get('graduation_year', 'N/A')
    parts = [f'Secondary education at {school_name} graduated in {grad_year}.']

    for he in profile.get('higher_education') or []:
        uni_name = he.get('university', 'N/A')
        uni_grad_year = he.get('graduation_year', 'N/A')
        parts.append(f'Higher education at {uni_name} graduated in {uni_grad_year}.')

    description_edu = (description.get('Education Background') or '').strip()
    parts.append(f'Description says: {description_edu}')

    # Joining the parts avoids the stray double space a fixed template
    # produces when the higher-education list is empty.
    return ' '.join(parts)

In [27]:
def get_employment(client_data: ClientData) -> str:
    """Flatten a client's employment history into one sentence-style string.

    Every entry of the profile's ``employment_history`` is rendered as a
    "Worked at ..." sentence (in list order), followed by the free-text
    'Occupation History' description.

    Args:
        client_data: record exposing ``client_profile`` and
            ``client_description`` mappings.

    Returns:
        ``"<job sentences> Description says: <description>"``, or
        ``"No employment history recorded. Description says: ..."`` when the
        history is missing or empty. Missing job attributes are rendered as
        ``'N/A'``; a missing or ``None`` end year is rendered as ``'present'``.
    """
    profile = client_data.client_profile
    description = client_data.client_description

    # full employment history (not only the current job)
    employment = profile.get('employment_history') or []
    if employment:
        jobs = []
        for job in employment:
            start = job.get('start_year', 'N/A')
            # An end year that is present but None is treated like a missing
            # one, so it reads "to present" rather than "to None".
            end = job.get('end_year')
            if end is None:
                end = 'present'
            company = job.get('company', 'N/A')
            position = job.get('position', 'N/A')
            salary = job.get('salary', 'N/A')
            jobs.append(f"Worked at {company} as {position} from {start} to {end}, earning {salary}.")
        employment_flat = ' '.join(jobs)
    else:
        employment_flat = "No employment history recorded."

    # free-text employment description; None is treated like a missing section
    description_emp = (description.get("Occupation History") or '').strip()

    return f"{employment_flat} Description says: {description_emp}"

In [28]:
def get_inheritance_profile(client_data: ClientData) -> str:
    """Render the client's inheritance facts plus the 'Wealth Summary' text.

    Reads the inherited amount from ``client_profile['aum']['inheritance']``
    and the year / relationship / profession from
    ``client_profile['inheritance_details']``; every missing value becomes
    ``'N/A'``. The stripped 'Wealth Summary' description is appended after
    "Description says:".
    """
    structured = client_data.client_profile
    free_text = client_data.client_description

    amount = structured.get("aum", {}).get("inheritance", "N/A")

    details = structured.get("inheritance_details", {})
    year = details.get("inheritance year", "N/A")
    relation = details.get("relationship", "N/A")
    profession = details.get("profession", "N/A")

    wealth_text = free_text.get("Wealth Summary", "").strip()

    # NOTE(review): the currency is hard-coded as DKK — confirm this holds for every client.
    facts = f"Client inherited {amount} DKK in {year} from their {relation}, "
    tail = f"who was a {profession}. Description says: {wealth_text}"
    return facts + tail

In [29]:
def get_wealth_profile(client_data: ClientData) -> str:
    """Render the client's savings and real-estate facts plus the 'Wealth Summary' text.

    Savings and the total real-estate value come from ``client_profile['aum']``;
    each entry of ``client_profile['real_estate_details']`` is listed as
    "<type> in <location> valued at <value>". Missing values become ``'N/A'``,
    and the "Properties include" sentence is omitted when no property is listed.
    """
    structured = client_data.client_profile
    free_text = client_data.client_description

    assets = structured.get("aum", {})
    savings = assets.get("savings", "N/A")
    real_estate_total = assets.get("real_estate_value", "N/A")

    properties = structured.get("real_estate_details", [])
    if properties:
        listing = "; ".join(
            f"{prop.get('property type', 'N/A')} in {prop.get('property location', 'N/A')} "
            f"valued at {prop.get('property value', 'N/A')}"
            for prop in properties
        )
        properties_sentence = " Properties include: " + listing + "."
    else:
        properties_sentence = ""

    wealth_text = free_text.get("Wealth Summary", "").strip()

    # NOTE(review): the currency is hard-coded as DKK — confirm this holds for every client.
    totals = f"Client has savings of {savings} DKK and total real estate value of {real_estate_total} DKK."
    return totals + f"{properties_sentence} Description says: {wealth_text}"

In [30]:
def get_investment_profile(client_data: ClientData) -> str:
    """Render the client's investment preferences plus the 'Client Summary' text.

    Risk profile, horizon, experience, mandate type and preferred markets are
    read from ``client_profile`` (``'N/A'`` when missing or, for markets, when
    the list is empty); the stripped 'Client Summary' description follows after
    "Description says:".
    """
    structured = client_data.client_profile
    free_text = client_data.client_description

    # structured investment attributes
    risk = structured.get('investment_risk_profile', 'N/A')
    horizon = structured.get('investment_horizon', 'N/A')
    experience = structured.get('investment_experience', 'N/A')
    mandate = structured.get('type_of_mandate', 'N/A')

    # preferred markets as a comma-separated list
    markets = structured.get('preferred_markets', [])
    if markets:
        markets_flat = ', '.join(markets)
    else:
        markets_flat = 'N/A'

    # free-text investment description
    summary_text = free_text.get("Client Summary", "").strip()

    structured_part = (
        f"Risk profile: {risk}. Horizon: {horizon}. Experience: {experience}. "
        f"Mandate: {mandate}. Preferred markets: {markets_flat}. "
    )
    return structured_part + f"Description says: {summary_text}"

In [31]:
def get_summary_note_profile(client_data: ClientData) -> str:
    """Pair the free-text 'Summary Note' with key identity facts from the structured data.

    The stripped summary note comes first, followed by name, birth date,
    domicile and nationality from ``client_profile`` and the issuing country
    from ``passport``; missing values become ``'N/A'``.
    """
    structured = client_data.client_profile
    passport_data = client_data.passport
    free_text = client_data.client_description

    note = free_text.get("Summary Note", "").strip()

    # structured facts placed next to the note
    name = structured.get("name", "N/A")
    birth_date = structured.get("birth_date", "N/A")
    domicile = structured.get("country_of_domicile", "N/A")
    nationality = structured.get("nationality", "N/A")
    passport_country = passport_data.get("country", "N/A")

    sentences = [
        f"Summary note says: {note} ",
        f"Profile says: Name: {name}. Birth date: {birth_date}. ",
        f"Domicile: {domicile}. Nationality: {nationality}. Passport country: {passport_country}.",
    ]
    return "".join(sentences)

In [32]:
def get_family_background_profile(client_data: ClientData) -> str:
    """Pair the free-text 'Family Background' with the profile's marital status.

    The stripped description comes first, then ``client_profile['marital_status']``
    (``'N/A'`` when missing).
    """
    structured = client_data.client_profile
    free_text = client_data.client_description

    background = free_text.get("Family Background", "").strip()
    status = structured.get("marital_status", "N/A")

    described = f"Family background says: {background} "
    recorded = f"Profile marital status: {status}."
    return described + recorded

In [34]:
def flatten_fields(client_data: ClientData) -> list[str]:
    """Flatten one client record into seven text fields.

    Returns the outputs of the per-topic extractors in a fixed order:
    education, employment, inheritance, wealth, investment profile,
    summary note, family background.
    """
    # order matters: downstream code sees the fields by position
    extractors = (
        get_education,                  # 1. Education
        get_employment,                 # 2. Employment
        get_inheritance_profile,        # 3. Inheritance
        get_wealth_profile,             # 4. Wealth
        get_investment_profile,         # 5. Investment profile
        get_summary_note_profile,       # 6. Summary note analysis
        get_family_background_profile,  # 7. Family background analysis
    )
    return [extract(client_data) for extract in extractors]

In [54]:
# Flatten the first training record into its list of text fields and display them.
fields = flatten_fields(train_data[0])
fields

['Secondary education at Lycée Victor Hugo graduated in 1982. Higher education at University of Montpellier graduated in 1986. Higher education at IUT de Strasbourg graduated in 1992. Description says: In 1982, Margaux graduated from Lycée Victor Hugo with a secondary school diploma.\nShe then attended University of Montpellier, which she graduated in 1986.\nMargaux earned her additionally degree from IUT de Strasbourg in 1992.',
 "Worked at Safran S.A. as Aerospace Engineer from 1988 to 1991, earning 33000. Worked at Airbus S.A.S. as Program Manager from 1993 to 1995, earning 47000. Worked at Safran S.A. as Director of Operations from 1996 to 1999, earning 67000. Worked at Safran S.A. as VP of Business Development from 2000 to 2024, earning 711000. Description says: Margaux Juliette Dubois is a 61 year old and comes from France.\nMargaux Juliette Dubois's first role was as a Aerospace Engineer at Safran S.A., where she worked diligently from 1988 to 1991 to establish a strong reputati

In [57]:
# Run the tokenizer on all the fields to see how long they are.
# First as one padded batch (one row per field) ...
ret = tokenizer(fields, return_tensors='pt', padding=True)
print(f'{ret["input_ids"].shape = }\n\n')

# ... then one field at a time, which shows each field's own token count.
# Note: `ret` is overwritten on every iteration; the `=` specifier prints the expression text as well.
for f in fields:
    ret = tokenizer(f, return_tensors='pt', padding=True)
    print(f'{ret["input_ids"].shape = }')

ret["input_ids"].shape = torch.Size([7, 239])


ret["input_ids"].shape = torch.Size([1, 92])
ret["input_ids"].shape = torch.Size([1, 239])
ret["input_ids"].shape = torch.Size([1, 128])
ret["input_ids"].shape = torch.Size([1, 158])
ret["input_ids"].shape = torch.Size([1, 47])
ret["input_ids"].shape = torch.Size([1, 126])
ret["input_ids"].shape = torch.Size([1, 42])


In [51]:
# Longest flattened field (in tokens) per training example, and over the whole training split.
all_maxlen = 0
for example_no, cd in enumerate(train_data, start=1):
    fields = flatten_fields(cd)
    # Batch-encode all fields at once: the padded batch width is the longest field's token count.
    tokens = tokenizer(fields, return_tensors='pt', padding=True)
    maxlen = tokens['input_ids'].shape[1]
    print(f'Maxlen for example {example_no:5}: {maxlen}')
    if maxlen > all_maxlen:
        all_maxlen = maxlen

print(f'All maxlen: {all_maxlen}')

Maxlen for example     1: 239
Maxlen for example     2: 155
Maxlen for example     3: 153
Maxlen for example     4: 138
Maxlen for example     5: 296
Maxlen for example     6: 138
Maxlen for example     7: 345
Maxlen for example     8: 126
Maxlen for example     9: 313
Maxlen for example    10: 143
Maxlen for example    11: 138
Maxlen for example    12: 187
Maxlen for example    13: 121
Maxlen for example    14: 144
Maxlen for example    15: 206
Maxlen for example    16: 138
Maxlen for example    17: 130
Maxlen for example    18: 199
Maxlen for example    19: 136
Maxlen for example    20: 205
Maxlen for example    21: 136
Maxlen for example    22: 169
Maxlen for example    23: 131
Maxlen for example    24: 233
Maxlen for example    25: 133
Maxlen for example    26: 129
Maxlen for example    27: 254
Maxlen for example    28: 150
Maxlen for example    29: 176
Maxlen for example    30: 221
Maxlen for example    31: 176
Maxlen for example    32: 209
Maxlen for example    33: 169
Maxlen for

In [52]:
# Longest flattened field (in tokens) per validation example, and over the whole validation split.
all_maxlen = 0
for example_no, cd in enumerate(val_data, start=1):
    fields = flatten_fields(cd)
    # Encode each field on its own and keep the largest token count (0 if there are no fields).
    maxlen = max(
        (tokenizer(f, return_tensors='pt', padding=True)['input_ids'].shape[1] for f in fields),
        default=0,
    )
    print(f'Maxlen for example {example_no:5}: {maxlen}')
    if maxlen > all_maxlen:
        all_maxlen = maxlen

print(f'All maxlen: {all_maxlen}')

Maxlen for example     1: 196
Maxlen for example     2: 124
Maxlen for example     3: 191
Maxlen for example     4: 129
Maxlen for example     5: 168
Maxlen for example     6: 291
Maxlen for example     7: 190
Maxlen for example     8: 250
Maxlen for example     9: 180
Maxlen for example    10: 237
Maxlen for example    11: 131
Maxlen for example    12: 135
Maxlen for example    13: 133
Maxlen for example    14: 209
Maxlen for example    15: 145
Maxlen for example    16: 177
Maxlen for example    17: 269
Maxlen for example    18: 180
Maxlen for example    19: 208
Maxlen for example    20: 286
Maxlen for example    21: 158
Maxlen for example    22: 326
Maxlen for example    23: 197
Maxlen for example    24: 298
Maxlen for example    25: 197
Maxlen for example    26: 199
Maxlen for example    27: 195
Maxlen for example    28: 353
Maxlen for example    29: 208
Maxlen for example    30: 288
Maxlen for example    31: 162
Maxlen for example    32: 144
Maxlen for example    33: 182
Maxlen for

In [53]:
# Longest flattened field (in tokens) per test example, and over the whole test split.
all_maxlen = 0
for example_no, cd in enumerate(test_data, start=1):
    fields = flatten_fields(cd)
    # Encode each field on its own and keep the largest token count (0 if there are no fields).
    maxlen = max(
        (tokenizer(f, return_tensors='pt', padding=True)['input_ids'].shape[1] for f in fields),
        default=0,
    )
    print(f'Maxlen for example {example_no:5}: {maxlen}')
    if maxlen > all_maxlen:
        all_maxlen = maxlen

print(f'All maxlen: {all_maxlen}')

Maxlen for example     1: 118
Maxlen for example     2: 125
Maxlen for example     3: 168
Maxlen for example     4: 254
Maxlen for example     5: 120
Maxlen for example     6: 186
Maxlen for example     7: 198
Maxlen for example     8: 126
Maxlen for example     9: 175
Maxlen for example    10: 119
Maxlen for example    11: 186
Maxlen for example    12: 239
Maxlen for example    13: 297
Maxlen for example    14: 171
Maxlen for example    15: 129
Maxlen for example    16: 124
Maxlen for example    17: 192
Maxlen for example    18: 141
Maxlen for example    19: 185
Maxlen for example    20: 186
Maxlen for example    21: 188
Maxlen for example    22: 135
Maxlen for example    23: 282
Maxlen for example    24: 297
Maxlen for example    25: 184
Maxlen for example    26: 302
Maxlen for example    27: 236
Maxlen for example    28: 174
Maxlen for example    29: 308
Maxlen for example    30: 218
Maxlen for example    31: 158
Maxlen for example    32: 359
Maxlen for example    33: 223
Maxlen for

In [None]:
def encode_all_data(client_data_list: list[ClientData], tokenizer, max_length=512):
    """Tokenize the flattened fields of every client record.

    Args:
        client_data_list: records to encode; each is flattened with
            ``flatten_fields`` before tokenization.
        tokenizer: callable applied to each record's list of field strings.
        max_length: truncation limit forwarded to the tokenizer.

    Returns:
        A list with one tokenizer result per record, in input order. Each call
        requests PyTorch tensors with padding and truncation enabled.
    """
    # the same tokenizer options are used for every record
    encode_options = dict(return_tensors='pt', padding=True, truncation=True, max_length=max_length)
    return [tokenizer(flatten_fields(cd), **encode_options) for cd in client_data_list]