In [50]:
import pandas as pd
import re

In [51]:
# Provided data: list of raw MT103 messages, one triple-quoted string per message.
# The lines inside each literal are indented, so that leading whitespace (and the
# newline right after the opening quotes) is part of the string itself.
mt103_messages = [
    """
    {1:F01MYMBGB2L0XXX0000000000}{2:I103HBUKGB4BXXXN}{3:{108:MT103
    0001}}{4:
    :20:MT103 0001
    :23B:CRED
    :32A:210322USD5000,
    :50K:/DE98765432101234567890
    COMMERZBANK AG
    HAMBURG, GERMANY
    /COBADEHHXXX
    :52A:/COBADEHHXXX
    COMMERZBANK AG
    HAMBURG, GERMANY
    :53A:/MYMBGB2LXXX
    METRO BANK PLC
    LONDON, UNITED KINGDOM
    :57A:/HBUKGB4BXXX
    HSBC BANK PLC
    LONDON, UNITED KINGDOM
    :59:/GB57METR12345678901234
    NORDFISCH GMBH
    BODENSEE STR. 226
    22761 HAMBURG
    GERMANY
    :71A:OUR
    :71F:/BIC/HBUKGB4BXXX
    :71G:/INS/THIS IS A PAYMENT FOR TUNA SUPPLY
    -}
    """,
    # Add more MT103 messages here, each as its own string entry in this list.
]

In [52]:
# Define a function to extract values from the MT103 message
def extract_value(tag: str, message: str) -> str:
    """Return the first line of the ``:<tag>:`` field of an MT103 message.

    Only the text that follows ``:<tag>:`` on the same line is returned;
    continuation lines of a multi-line field are not included.

    Args:
        tag: Field tag without the surrounding colons, e.g. ``'20'`` or ``'50K'``.
            It is matched literally (regex metacharacters are escaped).
        message: Raw MT103 message text.

    Returns:
        The stripped first line of the first matching field, or ``''`` when the
        tag does not occur at the start of any line.
    """
    # Anchor the tag to the start of a line (leading blanks allowed because the
    # sample messages are indented) so that text such as "REF:20:X" inside
    # another field's content is not mistaken for the start of field 20.
    field_start = re.compile(
        r'^[ \t]*:' + re.escape(tag) + r':(.*)$', re.MULTILINE
    )
    match = field_start.search(message)
    return match.group(1).strip() if match else ''

In [53]:
# Target schema for the parsed MT103 records, used to create an empty DataFrame.
# Every column name appears exactly once: a repeated label would give the frame
# two columns with the same name, and label-based selection of that name would
# then return a DataFrame instead of a Series.
columns = [
    "transaction_date",
    "transaction_id",
    "transaction_message",
    "transaction_currency",
    "transaction_amount",
    "transaction_type",
    "transaction_direction",
    "transaction_status",
    "instrument_type",
    "originator_full_name",
    "originator_first_name",
    "originator_middle_names_patronymic",
    "originator_last_name",
    "originator_address",
    "originator_country",
    "originator_account_number",
    "originator_branch_id",
    "originator_bic",
    "originator_fi_name",
    "originator_fi_country",
    "incoming_intermediary_fi_bic",
    "outgoing_intermediary_fi_bic",
    "beneficiary_full_name",
    "beneficiary_first_name",
    "beneficiary_middle_name_patronymic",
    "beneficiary_last_name",
    "beneficiary_address",
    "beneficiary_country",
    "beneficiary_account_number",
    "beneficiary_branch_id",
    "beneficiary_bic",
    "beneficiary_fi_name",
    "beneficiary_fi_country",
]

empty_df = pd.DataFrame(columns=columns)

In [54]:
import pandas as pd

# Parse every MT103 message into one flat record and build the result frame once.
# Uses `re`, `pd`, `mt103_messages` and `extract_value` from the earlier cells.

# Output columns in their final order. Columns that are not derived from the
# message yet are left as None in every record.
RESULT_COLUMNS = [
    "transaction_date",
    "transaction_id",
    "transaction_message",
    "transaction_currency",
    "transaction_amount",
    "transaction_type",
    "transaction_direction",
    "transaction_status",
    "instrument_type",
    "originator_full_name",
    "originator_first_name",
    "originator_middle_names_patronymic",
    "originator_last_name",
    "originator_address",
    "originator_country",
    "originator_account_number",
    "originator_branch_id",
    "originator_bic",
    "originator_fi_name",
    "originator_fi_country",
    "incoming_intermediary_fi_bic",
    "outgoing_intermediary_fi_bic",
    "beneficiary_full_name",
    "beneficiary_first_name",
    "beneficiary_middle_name_patronymic",
    "beneficiary_last_name",
    "beneficiary_address",
    "beneficiary_country",
    "beneficiary_account_number",
    "beneficiary_branch_id",
    "beneficiary_bic",
    "beneficiary_fi_name",
    "beneficiary_fi_country",
]

# First line of any field, e.g. ":59:" or ":50K:"; ends a multi-line field.
_FIELD_START = re.compile(r':\d{2}[A-Z]?:')

# :32A: value = YYMMDD + 3-letter currency + amount (comma as decimal separator).
_VALUE_DATE_CCY_AMOUNT = re.compile(r'(\d{2})(\d{2})(\d{2})([A-Z]{3})([\d,]+)')


def _field_lines(tag, message):
    """Return all lines of the ``:<tag>:`` field as a list of stripped strings.

    The first element is the text after ``:<tag>:`` on the opening line (it may
    be ``''``); the following elements are the non-empty continuation lines up
    to the next field start or the ``-}`` trailer. Returns ``[]`` when the tag
    does not open any line of ``message``.
    """
    opener = ':' + tag + ':'
    collected = None
    for raw_line in message.splitlines():
        line = raw_line.strip()
        if collected is None:
            if line.startswith(opener):
                collected = [line[len(opener):].strip()]
        elif line.startswith('-}') or _FIELD_START.match(line):
            break
        elif line:
            collected.append(line)
    return collected or []


def _parse_mt103(message):
    """Turn one raw MT103 message into a dict keyed by ``RESULT_COLUMNS``.

    Raises:
        ValueError: if the ``:32A:`` field does not start with
            ``YYMMDD`` + currency code + amount.
    """
    record = dict.fromkeys(RESULT_COLUMNS)

    transaction_id = extract_value('20', message)

    value_field = extract_value('32A', message)
    parsed = _VALUE_DATE_CCY_AMOUNT.match(value_field)
    if parsed is None:
        raise ValueError(
            f"MT103 {transaction_id!r}: cannot parse :32A: value {value_field!r}"
        )
    year, month, day, currency, amount = parsed.groups()

    # :50K: first line is the account; the remaining lines are mapped by
    # position: last -> country, second to last -> address, the rest -> name.
    # NOTE(review): in the sample message the last :50K: line is "/COBADEHHXXX",
    # so that value lands in originator_country - confirm the intended mapping.
    originator_account_number, *originator_details = (
        _field_lines('50K', message) or ['']
    )
    originator_fi_lines = _field_lines('52A', message)

    record.update({
        # DD-MM-YYYY; the two-digit year is assumed to be in the 2000s.
        "transaction_date": f"{day}-{month}-20{year}",
        "transaction_id": transaction_id,
        "transaction_message": extract_value('71G', message),
        "transaction_currency": currency,
        # The comma is the decimal separator: "5000," -> "5000", "5000,50" -> "5000.50".
        "transaction_amount": amount.replace(',', '.').rstrip('.'),
        "transaction_type": extract_value('23B', message),
        "originator_full_name": (
            '\n'.join(originator_details[:-2]) if len(originator_details) > 2 else ""
        ),
        "originator_address": (
            originator_details[-2] if len(originator_details) > 1 else ""
        ),
        "originator_country": originator_details[-1] if originator_details else "",
        "originator_account_number": originator_account_number,
        "originator_bic": originator_fi_lines[0] if originator_fi_lines else "",
        "originator_fi_name": (
            originator_fi_lines[1] if len(originator_fi_lines) > 1 else ""
        ),
        "beneficiary_account_number": extract_value('59', message),
        "beneficiary_bic": extract_value('57A', message),
    })
    return record


# Build the frame in one go; passing the column list keeps the schema (and an
# empty frame instead of an error) when mt103_messages is empty.
records = [_parse_mt103(mt103_message) for mt103_message in mt103_messages]
result_df = pd.DataFrame(records, columns=RESULT_COLUMNS)


In [55]:
# Display the column labels of result_df.
result_df.columns

Index(['transaction_date', 'transaction_id', 'transaction_message',
       'transaction_currency', 'transaction_amount', 'transaction_type',
       'transaction_direction', 'transaction_status', 'instrument_type',
       'originator_full_name', 'originator_first_name',
       'originator_middle_names_patronymic', 'originator_last_name',
       'originator_address', 'originator_country', 'originator_account_number',
       'originator_branch_id', 'originator_bic', 'originator_fi_name',
       'originator_fi_country', 'incoming_intermediary_fi_bic',
       'outgoing_intermediary_fi_bic', 'beneficiary_full_name',
       'beneficiary_first_name', 'beneficiary_middle_name_patronymic',
       'beneficiary_last_name', 'beneficiary_address', 'beneficiary_country',
       'beneficiary_account_number', 'beneficiary_branch_id',
       'beneficiary_bic', 'beneficiary_fi_name', 'beneficiary_fi_country'],
      dtype='object')