In [1]:
import re
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import seaborn as sns
from openai import OpenAI
import openai
import csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pickle

In [2]:
def load_json_file(filepath):
    """Load and return a JSON file"""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading {filepath}: {str(e)}")
        return None

In [3]:
Reject_id = []
Accept_id = []

1. Get all the clients with consistent Information

In [4]:
def get_valid_clients(path):
    """Get list of valid clients from validation results file"""
    valid_clients_file = os.path.join(os.getcwd(), path)
    if not os.path.exists(valid_clients_file):
        print(f"Valid clients file not found: {valid_clients_file}")
        print("Please run account_valid.py first to generate this file.")
        return []
    
    with open(valid_clients_file, 'r') as f:
        return [line.strip() for line in f.readlines()]

root_dir = os.getcwd()
# Get list of valid clients
print("Looking for list of valid clients...")
valid_clients = get_valid_clients("validation_results_valid_clients.txt")
print(f"Found {len(valid_clients)} valid clients")
invalid_clients = get_valid_clients("validation_results_invalid_clients.txt")
print(f"Found {len(invalid_clients)} valid clients")

for invalid_id in invalid_clients:
    Reject_id.append(f'{invalid_id}:Reject')

client_paths = {}
for folder in os.listdir(root_dir):
    full_path = os.path.join(root_dir, folder)
    if os.path.isdir(full_path) and folder.startswith("datathon_part"):
        for client_id in valid_clients:
            client_path = os.path.join(full_path, client_id)
            if os.path.exists(client_path):
                client_paths[client_id] = client_path

print(f"Found {len(client_paths)} valid client directories")

Looking for list of valid clients...
Found 772 valid clients
Found 222 valid clients
Found 772 valid client directories


2. Filter again according to the valid passport number and birthdate

In [5]:
def extract_passport_data(passport_mrz: dict) -> dict:
    line1, line2 = passport_mrz[0], passport_mrz[1]
    line1_data = line1.split("<")[:5]
    middle_name = line1_data[-1].title()
    country_code = line1_data[1][:3]
    last_name = line1_data[1][3:].title()
    first_name = line1_data[3].title()
    line1_data = {"first_name": first_name, "middle_name": middle_name, "last_name": last_name, "country_code": country_code}

    line2_data = line2.replace("<", "")
    passport_number = line2_data[:9]
    country_code = line2_data[9:12]
    birth_date = line2_data[12:][:2] + "-" + line2_data[12:][2:4] + "-" + line2_data[12:][4:]
    if int(line2_data[12:][:2]) <= 10:
        birth_date = "20" + birth_date
    else:
        birth_date = "19" + birth_date
    line2_data = {"passport_number": passport_number, "country_code": country_code, "birth_date": birth_date}
    return line1_data, line2_data

In [6]:
valid_pp_paths = {}
for client_id, client_path in client_paths.items():
    profile = load_json_file(os.path.join(client_path, "client_profile.json"))
    passport = load_json_file(os.path.join(client_path, "passport.json"))
    line1_data, line2_data = extract_passport_data(passport['passport_mrz'])
    passport_number = line2_data['passport_number']
    birth_date = line2_data['birth_date']
    if passport_number != profile['passport_number'] or birth_date != profile['birth_date']:
        print(f'{client_id} does not have a valid passport')
        Reject_id.append(f'{client_id}:Reject')
    else:
        valid_pp_paths[client_id] = client_path

client_971 does not have a valid passport
client_371 does not have a valid passport
client_47 does not have a valid passport
client_117 does not have a valid passport
client_914 does not have a valid passport
client_180 does not have a valid passport
client_544 does not have a valid passport
client_776 does not have a valid passport
client_915 does not have a valid passport
client_747 does not have a valid passport
client_400 does not have a valid passport
client_294 does not have a valid passport
client_453 does not have a valid passport
client_250 does not have a valid passport
client_611 does not have a valid passport
client_618 does not have a valid passport
client_875 does not have a valid passport
client_617 does not have a valid passport
client_550 does not have a valid passport
client_738 does not have a valid passport
client_362 does not have a valid passport
client_570 does not have a valid passport
client_742 does not have a valid passport
client_986 does not have a valid pa

In [38]:
len(valid_pp_paths)

738

3. Use LLM to check the description and profile

In [39]:
# The Description to be ignored
os.makedirs("tmp", exist_ok=True)
IGNORE_FIELDS = {"Summary Note", "Client Summary"}

Des = []
Pro = []
labels = []
id = 1
chunk_num = 0
for client_id, client_path in valid_pp_paths.items():
    profile_path = os.path.join(client_path, "client_profile.json")
    label_path = os.path.join(client_path, "label.json")
    description_path = os.path.join(client_path, "client_description.json")

    descriptions = load_json_file(description_path)
    profile = load_json_file(profile_path)
    label = load_json_file(label_path)
    label = label['label']
    labels.append(label)
    for field in IGNORE_FIELDS:
            descriptions.pop(field, None)

    Des.append(descriptions)
    Pro.append(profile)
    if id % 30 == 0 or id == len(valid_pp_paths):#1000:
        chunk_num += 1
        with open(os.path.join("/Users/jerrychen/Desktop/datathon2025/tmp", f"embed_profile_{chunk_num}.json"), "w", encoding="utf-8") as f_out:
            json.dump(Pro, f_out, indent=2, ensure_ascii=False)
        with open(os.path.join("/Users/jerrychen/Desktop/datathon2025/tmp", f"embed_description_{chunk_num}.json"), "w", encoding="utf-8") as f_out:
            json.dump(Des, f_out, indent=2, ensure_ascii=False)

        Des = []
        Pro = []

    id += 1

In [40]:
chunk_num

25

In [8]:
client = OpenAI(
  base_url="https://api.deepseek.com",
  api_key="sk-63f10246d168487cab095f760d437431",
)

In [None]:
decisions = []
text = []
for idx in range(chunk_num):
    print(f"processing {idx+1} chunk")
    with open(f"/Users/jerrychen/Desktop/datathon2025/tmp/embed_profile_{idx+1}.json", "r", encoding="utf-8") as f:
        profile_template = f.read()

    with open(f"/Users/jerrychen/Desktop/datathon2025/tmp/embed_description_{idx+1}.json", "r", encoding="utf-8") as f:
        client_description = f.read()

    prompt = f"""
    There are serveral clients information in two json files. For each client, keep strictly the form and structure of client_profile.json in mind as a template. Do as if you were filling in information based on this template. If there's information missing, pass to the next attribute in the client_profile.json, but if there's any information you extract from client_description - Copy.json to fill in the template different from the info in the original client_profile.json file (in the sense that the text info mismatches or any numerical value format mismatch or any numerical value mismatch in client_description - Copy.json for example it's extremely important that 1.00 and 1 are NOT considered as a match!) then print only Reject. Otherwise print OK.
    For every client only given result once.
    client_profile.json:
    {profile_template}
    client_description.json:
    {client_description}
    """
    print("Generating ...")
    response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
            {
                "role": "user",
                "content": prompt
            }
    ],
    stream=False
    )
    print("Generate Complete")
    # text.append(response.choices[0].message.content)
    lines = response.choices[0].message.content.strip().split('\n')
    if idx != 24:
        max_hits = 30
    else:
        max_hits = 18
    hit = 0    
    i = 0
    while i < len(lines) and hit < max_hits:
        line = lines[i]

        if re.search(r'\d', line):
            j = i
            while j < len(lines):
                line_j = lines[j]
                if "OK" in line_j:
                    decisions.append("OK")
                    hit += 1
                    break
                elif "Reject" in line_j:
                    decisions.append("Reject")
                    hit += 1
                    break
                j += 1
        i += 1
    print(len(decisions))



processing 1 chunk
Generating ...
Generate Complete
30
processing 2 chunk
Generating ...
Generate Complete
60
processing 3 chunk
Generating ...
Generate Complete
90
processing 4 chunk
Generating ...
Generate Complete
120
processing 5 chunk
Generating ...
Generate Complete
150
processing 6 chunk
Generating ...
Generate Complete
180
processing 7 chunk
Generating ...
Generate Complete
210
processing 8 chunk
Generating ...
Generate Complete
240
processing 9 chunk
Generating ...
Generate Complete
270
processing 10 chunk
Generating ...
Generate Complete
300
processing 11 chunk
Generating ...
Generate Complete
330
processing 12 chunk
Generating ...
Generate Complete
360
processing 13 chunk
Generating ...
Generate Complete
390
processing 14 chunk
Generating ...
Generate Complete
420
processing 15 chunk
Generating ...
Generate Complete
450
processing 16 chunk
Generating ...
Generate Complete
480
processing 17 chunk
Generating ...
Generate Complete
510
processing 18 chunk
Generating ...
Generate

In [14]:
ok_idx = []
# reject+idx = []
for i in range(len(decisions)):
    decision = decisions[i]
    if decision == 'OK':
        ok_idx.append(i)
    if decision == 'Reject':
        Reject_id.append(i)

In [24]:
len(valid_pp_paths)

738

In [28]:
i = 0
ok_client_id = []
for client_id, client_path in valid_pp_paths.items():
    if i >= len(decisions):
        break
    decision = decisions[i]
    if decision == 'OK':
        ok_client_id.append(client_id)
    if decision == 'Reject':
        Reject_id.append(f'{client_id}:Reject')
    i += 1

4. For rest clients, use random forest to predict

In [30]:
ok_client_paths = {}
for folder in os.listdir(root_dir):
    full_path = os.path.join(root_dir, folder)
    if os.path.isdir(full_path) and folder.startswith("datathon_evaluation"):
        for client_id in ok_client_id:
            client_path = os.path.join(full_path, client_id)
            if os.path.exists(client_path):
                ok_client_paths[client_id] = client_path

# print(f"Found {len(ok_client_paths)} valid client directories")

In [31]:
def flatten_profile(profile):
    flat = {
        'country_of_domicile': profile.get('country_of_domicile'),
        'birth_date': profile.get('birth_date'),
        'nationality': profile.get('nationality'),
        'gender': profile.get('gender'),
        'marital_status': profile.get('marital_status'),
        'investment_risk_profile': profile.get('investment_risk_profile'),
        'investment_horizon': profile.get('investment_horizon'),
        'investment_experience': profile.get('investment_experience'),
        'type_of_mandate': profile.get('type_of_mandate'),
        'currency': profile.get('currency'),
        'savings': profile.get('aum', {}).get('savings', 0),
        'inheritance': profile.get('aum', {}).get('inheritance', 0),
        'real_estate_value': profile.get('aum', {}).get('real_estate_value', 0),
        'num_preferred_markets': len(profile.get('preferred_markets', [])),
        'has_higher_education': int(bool(profile.get('higher_education'))),
        'employment_history': profile.get("employment_history", [])
    }
    # Calculate Age
    from datetime import datetime
    try:
        birth = datetime.strptime(flat['birth_date'], '%Y-%m-%d')
        flat['age'] = (datetime.today() - birth).days // 365
    except:
        flat['age'] = None

    if flat["employment_history"]:
        last_job = flat["employment_history"][-1]
        retired = last_job.get("end_year") is None
    else:
        retired = False
    
    flat['is_retired'] = retired

    del flat['birth_date']
    return flat

In [32]:
ok_profiles = []
ok_labels = []
ok_idx = []

for client_id, client_path in ok_client_paths.items():
    profile_path = os.path.join(client_path, "client_profile.json")
    label_path = os.path.join(client_path, "label.json")

    if not os.path.exists(profile_path):
        print(f"Warning: No client profile found for {client_id}")
        continue
    if not os.path.exists(label_path):
        print(f"Warning: No label file found for {client_id}")
        continue
    profile = load_json_file(profile_path)
    label = load_json_file(label_path)

    ok_profiles.append(flatten_profile(profile))
    ok_labels.append(label.get('label'))
    ok_idx.append(client_id)



In [33]:
ok_profile_pd = pd.DataFrame(ok_profiles)
ok_label_pd = pd.Series(ok_labels, name='label')

In [34]:
X, y = ok_profile_pd, ok_label_pd
# numeric_features = ['age', 'savings', 'inheritance', 'real_estate_value', 'num_preferred_markets']
# categorical_features = [col for col in X.columns if col not in numeric_features]

# preprocessor = ColumnTransformer([
#     ('num', SimpleImputer(strategy='median'), numeric_features),
#     ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
# ])
ordinal_features = ['investment_risk_profile', 'investment_horizon', 'investment_experience']
onehot_features = ['marital_status', 'type_of_mandate', 'has_higher_education', 'is_retired']
numeric_features = ['savings', 'inheritance', 'real_estate_value', 'age']

ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

onehot_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('ord', ordinal_pipeline, ordinal_features),
    ('ohe', onehot_pipeline, onehot_features),
    ('num', numeric_pipeline, numeric_features)
])

In [None]:
# classifier = RandomForestClassifier(
#     n_estimators=1000,          
#     max_features='sqrt',      
#     class_weight='balanced_subsample',   
#     random_state=42)
# model = make_pipeline(preprocessor, classifier)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
# model.fit(X_train, y_train)

In [None]:
# with open("rf_model.pkl", "wb") as f:
#     pickle.dump(model, f)

In [35]:
with open("rf_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

In [36]:
y_pred = loaded_model.predict(X)
print(classification_report(y, y_pred))

TypeError: 'NoneType' object is not iterable

In [None]:
for i in range(len(y_pred)):
    if y_pred[i] == 'Accept':
        Accept_id.append(f'{ok_idx[i]}:Accept')
    elif y_pred[i] == 'Reject':
        Reject_id.append(f'{ok_idx[i]}:Reject')

In [37]:
Final_Decision = Accept_id + Reject_id
sorted_Final = sorted(Final_Decision, key=lambda x: int(x.split(";")[0].split("_")[1]))
with open("solution_test.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f, delimiter=";")
    for line in data:
        writer.writerow(line.split(";"))  

ValueError: invalid literal for int() with base 10: '724:Reject'