In [1]:
# Colab-only setup: mount the user's Google Drive at /content/drive.
# This import is only available inside the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install spacy nltk
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import re
import json
import spacy
from datetime import datetime
import nltk
from nltk.tokenize import sent_tokenize
from nltk.chunk import RegexpParser
from nltk import pos_tag, word_tokenize
import os

In [4]:
# Fetch the NLTK data packages this notebook relies on. Each call is a no-op
# when the package is already present under the NLTK data directory.
# punkt / punkt_tab back nltk's sentence tokenizer (sent_tokenize is imported above).
nltk.download('punkt')
# NOTE(review): the tagger, NE chunker and word list are not referenced by the
# parser code visible in this notebook section -- confirm they are still needed.
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
class ClientDescriptionParser:
    """
    A class to parse client description text files into structured JSON format
    using natural language processing techniques with enhanced extraction capabilities.
    """

    def __init__(self):
        """Initialize the parser with NLP models."""
        # Load spaCy model
        self.nlp = spacy.load("en_core_web_sm")

        # Define common section headers that might appear in client descriptions
        self.section_patterns = {
            "summary": [r"summary note", r"summary", r"client overview", r"overview", r"general information", r"client information"],
            "family": [r"family background", r"family information", r"family", r"personal relationships", r"household"],
            "education": [r"education background", r"education", r"academic background", r"academic history", r"qualifications", r"studies"],
            "occupation": [r"occupation history", r"employment history", r"career", r"professional background", r"work history", r"job"],
            "wealth": [r"wealth summary", r"financial summary", r"assets", r"financial status", r"finances", r"economic situation", r"net worth"],
            "client_preferences": [r"preferences", r"interests", r"hobbies", r"likes", r"client interests", r"lifestyle"],
            "risk_profile": [r"risk profile", r"risk tolerance", r"investment profile", r"risk assessment", r"risk attitude"],
            "relationship_manager": [r"relationship manager", r"rm", r"advisor", r"client advisor", r"bank contact"],
            "client_summary": [r"client summary", r"conclusion", r"final notes", r"assessment", r"evaluation", r"summary"]
        }

    def _extract_sections(self, text):
        """
        Extract sections from the text based on common headers.
        Returns a dictionary with section names and their content.
        """
        sections = {}
        lines = text.strip().split('\n')

        # Preprocess to identify potential headers
        potential_headers = []
        for i, line in enumerate(lines):
            line = line.strip()
            if not line:
                continue

            # Look for typical header patterns
            if re.match(r'^[A-Za-z\s]+:$', line) or \
               (re.match(r'^[A-Za-z\s]+$', line) and (i == 0 or not lines[i-1].strip()) and \
               (i == len(lines)-1 or not lines[i+1].strip() or len(line) < 30)):
                potential_headers.append((i, line))

        # Process text based on identified headers
        if potential_headers:
            section_boundaries = []

            for i, (line_idx, header) in enumerate(potential_headers):
                # Determine section type
                section_match = False
                current_section = "unknown"

                for section_name, patterns in self.section_patterns.items():
                    for pattern in patterns:
                        if re.search(pattern, header.lower()):
                            current_section = section_name
                            section_match = True
                            break
                    if section_match:
                        break

                # If no match with known patterns, use the header as section name
                if not section_match:
                    current_section = header.lower().replace(':', '').strip()

                # Record section boundary
                if i < len(potential_headers) - 1:
                    next_header_idx = potential_headers[i+1][0]
                    section_boundaries.append((current_section, line_idx+1, next_header_idx))
                else:
                    section_boundaries.append((current_section, line_idx+1, len(lines)))

            # Extract content for each section
            for section_name, start, end in section_boundaries:
                section_content = '\n'.join(lines[start:end]).strip()
                sections[section_name] = section_content
        else:
            # Handle case with no headers - try to identify sections based on content
            current_section = "unknown"
            current_content = []

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                # Check if this line might indicate a section based on content
                section_match = False
                for section_name, patterns in self.section_patterns.items():
                    for pattern in patterns:
                        if re.search(pattern, line.lower()):
                            # If we've been building content for a previous section, save it
                            if current_content:
                                sections[current_section] = '\n'.join(current_content)
                                current_content = []
                            current_section = section_name
                            section_match = True
                            break
                    if section_match:
                        break

                # If it's not a section header, add to current content
                if not section_match:
                    current_content.append(line)

            # Add the last section
            if current_content:
                sections[current_section] = '\n'.join(current_content)

        # If still no sections identified, treat entire text as unknown section
        if not sections:
            sections["unknown"] = text

        return sections

    def _extract_personal_info(self, text):
        """Extract comprehensive personal information using NLP."""
        personal_info = {}
        doc = self.nlp(text)

        # Extract name and age using multiple patterns
        name_age_patterns = [
            r'([A-Za-z\s]+) is (\d+) years? old',
            r'([A-Za-z\s]+), (?:aged|age) (\d+)',
            r'(?:name is|client is) ([A-Za-z\s]+), (\d+)',
            r'([A-Za-z\s]+) \((\d+)\)'
        ]

        for pattern in name_age_patterns:
            name_match = re.search(pattern, text, re.IGNORECASE)
            if name_match:
                personal_info["name"] = name_match.group(1).strip()
                personal_info["age"] = int(name_match.group(2))
                break

        # If name wasn't found, try to extract using NER
        if "name" not in personal_info:
            for entity in doc.ents:
                if entity.label_ == "PERSON" and len(entity.text.split()) >= 2:
                    personal_info["name"] = entity.text
                    break

        # Extract nationality/country
        nationality_patterns = [
            r'from ([A-Za-z]+)',
            r'nationality (?:is|:) ([A-Za-z]+)',
            r'citizen of ([A-Za-z]+)',
            r'(?:born|raised) in ([A-Za-z]+)',
            r'([A-Za-z]+) national',
            r'(?:is|a) ([A-Za-z]+) citizen'
        ]

        for pattern in nationality_patterns:
            nationality_match = re.search(pattern, text, re.IGNORECASE)
            if nationality_match:
                personal_info["nationality"] = nationality_match.group(1).strip()
                break

        # If nationality wasn't found, try NER
        if "nationality" not in personal_info:
            for entity in doc.ents:
                if entity.label_ == "GPE" and len(entity.text) > 3:  # GPE = Geo-Political Entity
                    # Use the first country mentioned
                    personal_info["nationality"] = entity.text
                    break

        # Extract date of birth
        dob_patterns = [
            r'born on (\d{1,2})[\/\.\-](\d{1,2})[\/\.\-](\d{4})',
            r'date of birth:? (\d{1,2})[\/\.\-](\d{1,2})[\/\.\-](\d{4})',
            r'DOB:? (\d{1,2})[\/\.\-](\d{1,2})[\/\.\-](\d{4})',
            r'born in (\d{4})'
        ]

        for pattern in dob_patterns:
            dob_match = re.search(pattern, text, re.IGNORECASE)
            if dob_match:
                if len(dob_match.groups()) == 3:
                    personal_info["date_of_birth"] = f"{dob_match.group(1)}/{dob_match.group(2)}/{dob_match.group(3)}"
                else:
                    personal_info["year_of_birth"] = dob_match.group(1)
                break

        # Extract contact information
        contact_patterns = {
            "email": r'(?:email|e-mail):? ([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
            "phone": r'(?:phone|telephone|mobile|contact number):? (\+?[0-9\s\-().]{7,})',
            "address": r'(?:address|residence|lives in|living at):? ([^\.]+)'
        }

        for info_type, pattern in contact_patterns.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                personal_info[info_type] = match.group(1).strip()

        # Extract languages spoken
        language_patterns = [
            r'speaks? (?:fluent )?([A-Za-z,\s]+) (?:and|&) ([A-Za-z]+)',
            r'language(?:s)? spoken:? ([^\.]+)',
            r'fluent in ([^\.]+)'
        ]

        for pattern in language_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                if len(match.groups()) == 2:
                    languages = f"{match.group(1)}, {match.group(2)}"
                else:
                    languages = match.group(1)
                personal_info["languages"] = [lang.strip() for lang in re.split(r',|and|&', languages)]
                break

        # Extract gender using NLP
        female_indicators = ['she', 'her', 'herself', 'woman', 'female', 'lady', 'Ms', 'Mrs', 'Miss']
        male_indicators = ['he', 'him', 'himself', 'man', 'male', 'gentleman', 'Mr']

        female_count = 0
        male_count = 0

        for token in doc:
            if token.text.lower() in [s.lower() for s in female_indicators]:
                female_count += 1
            elif token.text.lower() in [s.lower() for s in male_indicators]:
                male_count += 1

        # Also check for gender specific titles
        for female_title in ['Ms', 'Mrs', 'Miss']:
            if re.search(r'\b' + female_title + r'\b', text):
                female_count += 3  # Give more weight to titles

        if re.search(r'\bMr\b', text):
            male_count += 3  # Give more weight to titles

        if female_count > male_count:
            personal_info["gender"] = "Female"
        elif male_count > female_count:
            personal_info["gender"] = "Male"

        # Extract profession using various patterns
        profession_patterns = [
            r'is an? ([A-Za-z\s]+) (?:from|at|with)',
            r'works as an? ([A-Za-z\s]+)',
            r'is employed as an? ([A-Za-z\s]+)',
            r'profession (?:is|:) ([A-Za-z\s]+)',
            r'(?:career|job|occupation) as an? ([A-Za-z\s]+)'
        ]

        for pattern in profession_patterns:
            profession_match = re.search(pattern, text, re.IGNORECASE)
            if profession_match:
                # Clean up common issues with profession extraction
                profession = profession_match.group(1).strip()
                # Remove articles and common issues
                profession = re.sub(r'\b(a|an|the)\b', '', profession).strip()
                personal_info["profession"] = profession
                break

        # Extract relationship manager info using a separate function
        rm_info = self._extract_rm_info(text)
        if rm_info:
            personal_info["relationship_manager"] = rm_info

        # Extract client ID or account number if present
        id_patterns = [
            r'client (?:ID|id|number):? ([A-Za-z0-9-]+)',
            r'account (?:number|#):? ([A-Za-z0-9-]+)',
            r'reference (?:number|#):? ([A-Za-z0-9-]+)'
        ]

        for pattern in id_patterns:
            id_match = re.search(pattern, text, re.IGNORECASE)
            if id_match:
                personal_info["client_id"] = id_match.group(1).strip()
                break

        return personal_info

    def _extract_rm_info(self, text):
        """Extract detailed relationship manager information."""
        rm_info = {}

        # Extract relationship manager name and relation
        rm_patterns = [
            r'The RM is ([^\']+)\'s ([^,\.]+)',
            r'relationship manager is ([^\']+)\'s ([^,\.]+)',
            r'(?:assigned|appointed) (?:to|with) ([^\']+) as (?:the|their) ([^,\.]+)',
            r'(?:managed|handled) by ([^,\.]+)',
            r'RM (?:is|:) ([^,\.]+)'
        ]

        for pattern in rm_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                if len(match.groups()) == 2:
                    rm_info["name"] = match.group(1).strip()
                    rm_info["relation"] = match.group(2).strip()
                else:
                    rm_info["name"] = match.group(1).strip()
                break

        # Extract relation details
        relation_patterns = [
            r'having married ([^\.]+)',
            r'being ([^\.]+)'
        ]

        for pattern in relation_patterns:
            relation_match = re.search(pattern, text, re.IGNORECASE)
            if relation_match:
                rm_info["details"] = relation_match.group(1).strip()
                break

        # Extract relationship duration
        duration_pattern = r'(?:relationship|client) for (\d+) years'
        match = re.search(duration_pattern, text, re.IGNORECASE)
        if match:
            rm_info["relationship_duration"] = int(match.group(1))

        # Extract office/branch information
        office_pattern = r'(?:based in|office in|branch in|from) ([^,\.]+) (?:office|branch)'
        match = re.search(office_pattern, text, re.IGNORECASE)
        if match:
            rm_info["office_location"] = match.group(1).strip()

        return rm_info

    def _extract_family_info(self, text):
        """Extract detailed family information."""
        family_info = {}

        # Extract marital status using multiple patterns
        marital_patterns = [
            r'married to ([A-Za-z\s]+)',
            r'(?:husband|wife|spouse) is ([A-Za-z\s]+)',
            r'is ([A-Za-z\s]+)\'s (?:husband|wife|spouse)'
        ]

        for pattern in marital_patterns:
            marital_match = re.search(pattern, text, re.IGNORECASE)
            if marital_match:
                family_info["marital_status"] = "Married"
                family_info["spouse_name"] = marital_match.group(1).strip()
                break

        # Single status patterns
        if "marital_status" not in family_info:
            single_patterns = [r'is single', r'not married', r'unmarried']
            for pattern in single_patterns:
                if re.search(pattern, text, re.IGNORECASE):
                    family_info["marital_status"] = "Single"
                    break

            # Divorced status patterns
            divorced_patterns = [r'is divorced', r'divorced from']
            for pattern in divorced_patterns:
                if re.search(pattern, text, re.IGNORECASE):
                    family_info["marital_status"] = "Divorced"
                    break

            # Widowed status patterns
            widowed_patterns = [r'is widowed', r'widow', r'widower']
            for pattern in widowed_patterns:
                if re.search(pattern, text, re.IGNORECASE):
                    family_info["marital_status"] = "Widowed"
                    break

        # Extract children information with more patterns
        children_patterns = [
            r'(?:do not have any children|no children|does not have children|does not have any children)',
            r'has (\d+) (?:child|children|son|daughter|sons|daughters)',
            r'have (\d+) (?:child|children|son|daughter|sons|daughters)'
        ]

        for pattern in children_patterns:
            children_match = re.search(pattern, text, re.IGNORECASE)
            if children_match:
                if "do not" in children_match.group(0).lower() or "no children" in children_match.group(0).lower() or "does not" in children_match.group(0).lower():
                    family_info["has_children"] = False
                else:
                    family_info["has_children"] = True
                    family_info["number_of_children"] = int(children_match.group(1))
                break

        # Extract children's names and ages if available
        if family_info.get("has_children", False):
            children_details_pattern = r'(?:children are|children named|named) ([^\.]+)'
            children_details_match = re.search(children_details_pattern, text, re.IGNORECASE)
            if children_details_match:
                children_details = children_details_match.group(1).strip()
                children_list = []

                # Try to parse children with ages
                child_with_age_pattern = r'([A-Za-z]+) \((\d+)\)'
                child_with_age_matches = re.findall(child_with_age_pattern, children_details)

                if child_with_age_matches:
                    for name, age in child_with_age_matches:
                        children_list.append({"name": name.strip(), "age": int(age)})
                else:
                    # Just get names
                    children_names = [name.strip() for name in re.split(r',|and|&', children_details)]
                    children_list = [{"name": name} for name in children_names if name]

                if children_list:
                    family_info["children"] = children_list

        # Extract parents information
        parents_patterns = [
            r'(?:father|dad) is ([A-Za-z\s]+)',
            r'(?:mother|mom) is ([A-Za-z\s]+)'
        ]

        for pattern in parents_patterns:
            parent_match = re.search(pattern, text, re.IGNORECASE)
            if parent_match:
                parent_type = "father" if "father" in pattern or "dad" in pattern else "mother"
                if "parents" not in family_info:
                    family_info["parents"] = {}
                family_info["parents"][parent_type] = parent_match.group(1).strip()

        # Extract siblings information
        siblings_patterns = [
            r'(?:brother|sister)s? ([A-Za-z\s,]+)',
            r'has (\d+) (?:brother|sister)s?'
        ]

        for pattern in siblings_patterns:
            siblings_match = re.search(pattern, text, re.IGNORECASE)
            if siblings_match:
                if siblings_match.group(1).isdigit():
                    family_info["number_of_siblings"] = int(siblings_match.group(1))
                else:
                    family_info["siblings"] = [sib.strip() for sib in re.split(r',|and|&', siblings_match.group(1))]
                break

        return family_info

    def _extract_education_info(self, text):
        """Extract comprehensive education information using NLP."""
        education_info = []
        sentences = sent_tokenize(text)

        # Patterns for education information with more variations
        education_patterns = [
            r'([^.]+) from (.+?) in (\d{4})',
            r'educated at (.+?) in (\d{4})',
            r'graduated from (.+?)(?: in| with| year)? (\d{4})',
            r'completed (?:her|his|a) ([^.]+) (?:at|from) (.+?)(?: in| year)? (\d{4})',
            r'studied ([^.]+) at (.+?)(?: in| year)? (\d{4})',
            r'holds a ([^.]+) from (.+?)(?: |\()(\d{4})',
            r'(?:bachelor|master|phd|doctorate|degree) (?:in|of) ([^.]+) from (.+?)(?: in| year)? (\d{4})'
        ]

        # First pass: extract structured education info
        for sentence in sentences:
            for pattern in education_patterns:
                match = re.search(pattern, sentence, re.IGNORECASE)
                if match:
                    education_entry = {}

                    # Different patterns have different group arrangements
                    if "bachelor" in pattern or "master" in pattern or "phd" in pattern:
                        # Pattern for degree types
                        field = match.group(1).strip()
                        institution = match.group(2).strip()
                        if len(match.groups()) > 2:
                            year = match.group(3).strip()
                            if year.isdigit():
                                year = int(year)
                        degree_type = pattern.split()[0].capitalize()  # Extract degree type from pattern
                    elif len(match.groups()) == 3:
                        # Standard pattern with degree, institution, year
                        degree_type = match.group(1).strip()
                        institution = match.group(2).strip()
                        year = match.group(3).strip()
                        if year.isdigit():
                            year = int(year)
                        field = ""
                    else:
                        # Pattern with just institution and year
                        institution = match.group(1).strip()
                        year = match.group(2).strip()
                        if year.isdigit():
                            year = int(year)
                        degree_type = ""
                        field = ""

                    education_entry["institution"] = institution
                    if year:
                        education_entry["year"] = year

                    # Add degree type and field if available
                    if degree_type:
                        education_entry["degree"] = degree_type
                    if field:
                        education_entry["field"] = field

                    # Determine education level
                    if any(term in (degree_type.lower() if degree_type else "") or term in sentence.lower() for term in
                           ["secondary", "high school", "diploma"]):
                        education_entry["level"] = "Secondary"
                    elif any(term in (degree_type.lower() if degree_type else "") or term in sentence.lower() for term in
                             ["university", "college", "tertiary", "bachelor", "master", "phd", "doctorate"]):
                        education_entry["level"] = "Tertiary"
                    else:
                        # Use NLP to try to determine level
                        doc = self.nlp(sentence)
                        for entity in doc.ents:
                            if entity.label_ == "ORG" and any(edu_term in entity.text.lower() for edu_term in
                                                            ["university", "college", "school"]):
                                if "university" in entity.text.lower() or "college" in entity.text.lower():
                                    education_entry["level"] = "Tertiary"
                                else:
                                    education_entry["level"] = "Secondary"

                    education_info.append(education_entry)

        # Second pass: look for simple mentions of education that might be missed
        if not education_info:
            simple_edu_patterns = [
                r'studied at ([^\.]+)',
                r'attended ([^\.]+)',
                r'education (?:at|from) ([^\.]+)'
            ]

            for sentence in sentences:
                for pattern in simple_edu_patterns:
                    match = re.search(pattern, sentence, re.IGNORECASE)
                    if match:
                        institution = match.group(1).strip()
                        # Try to determine if it's a university/college or school
                        if any(term in institution.lower() for term in ["university", "college"]):
                            education_info.append({
                                "institution": institution,
                                "level": "Tertiary"
                            })
                        elif any(term in institution.lower() for term in ["school", "academy"]):
                            education_info.append({
                                "institution": institution,
                                "level": "Secondary"
                            })
                        else:
                            education_info.append({
                                "institution": institution
                            })

        # Look for highest qualification mention
        highest_qual_pattern = r'highest (?:qualification|education|degree) (?:is|:) ([^\.]+)'
        for sentence in sentences:
            match = re.search(highest_qual_pattern, sentence, re.IGNORECASE)
            if match:
                # Add a special flag for highest qualification
                qualification = match.group(1).strip()
                # See if this qualification is already in our list
                found = False
                for entry in education_info:
                    if "degree" in entry and qualification.lower() in entry["degree"].lower():
                        entry["highest"] = True
                        found = True
                        break

                # If not found, add as a separate entry
                if not found:
                    level = "Tertiary" if any(term in qualification.lower() for term in
                                             ["bachelor", "master", "phd", "doctorate", "university", "college"]) else "Secondary"
                    education_info.append({
                        "degree": qualification,
                        "highest": True,
                        "level": level
                    })

        return education_info

    def _extract_wealth_info(self, text):
        """Extract wealth information."""
        wealth_info = {}

        # Extract savings
        savings_pattern = r'saving (\d+(?:,\d+)*) ([A-Za-z]{3})'
        savings_match = re.search(savings_pattern, text)
        if savings_match:
            amount_str = savings_match.group(1).replace(',', '')
            wealth_info["savings"] = {
                "amount": int(amount_str),
                "currency": savings_match.group(2)
            }

        # Extract property information
        property_pattern = r'(?:does not have any properties|has (\d+) properties)'
        property_match = re.search(property_pattern, text, re.IGNORECASE)
        if property_match:
            wealth_info["has_properties"] = not ("does not" in property_match.group(0).lower())
            if wealth_info["has_properties"] and property_match.group(1):
                wealth_info["number_of_properties"] = int(property_match.group(1))

        # Extract inheritance
        inheritance_pattern = r'(?:inheritance|inherited) of (\d+(?:,\d+)*) ([A-Za-z]{3}) in (\d{4})'
        inheritance_match = re.search(inheritance_pattern, text)
        if inheritance_match:
            amount_str = inheritance_match.group(1).replace(',', '')
            wealth_info["inheritance"] = {
                "amount": int(amount_str),
                "currency": inheritance_match.group(2),
                "year": int(inheritance_match.group(3))
            }

            # Look for source of inheritance
            context_start = max(0, text.find(inheritance_match.group(0)) - 100)
            context_end = min(len(text), text.find(inheritance_match.group(0)) + 100)
            context = text[context_start:context_end]

            source_patterns = [
                r'(?:grandfather|grandmother|father|mother|uncle|aunt|relative),\s+a\s+([^,\.]+)',
                r'from (?:his|her) ([^,]+), a ([^,\.]+)'
            ]

            for pattern in source_patterns:
                source_match = re.search(pattern, context)
                if source_match:
                    if len(source_match.groups()) == 1:
                        relation = "relative"  # Default if not specified
                        occupation = source_match.group(1).strip()
                    else:
                        relation = source_match.group(1).strip()
                        occupation = source_match.group(2).strip()

                    wealth_info["inheritance"]["source"] = relation
                    wealth_info["inheritance"]["source_occupation"] = occupation
                    break

        # Extract investments and other assets using NLP
        doc = self.nlp(text)
        for sentence in sent_tokenize(text):
            if "investment" in sentence.lower() or "invest" in sentence.lower():
                investment_pattern = r'investments? (?:of|worth|valued at) (\d+(?:,\d+)*) ([A-Za-z]{3})'
                investment_match = re.search(investment_pattern, sentence)
                if investment_match:
                    amount_str = investment_match.group(1).replace(',', '')
                    wealth_info["investments"] = {
                        "amount": int(amount_str),
                        "currency": investment_match.group(2)
                    }

        return wealth_info

    def format_to_client_profile(self, client_info, current_year=None):
        """Flatten the nested ``client_info`` structure into a single-level profile.

        Args:
            client_info (dict): Parsed client data keyed by section name. Only the
                "personal_info", "client_preferences", "risk_profile" and
                "relationship_manager" sections are read; a section that is
                missing, empty or ``None`` is skipped.
            current_year (int, optional): Reference year from which
                ``relationship_duration`` is subtracted to derive
                ``client_since``. Defaults to the current calendar year.

        Returns:
            dict: Flat profile. Personal fields are copied only when they hold a
            truthy value; preference, risk and relationship-manager fields are
            copied whenever their key is present.
        """
        formatted_json = {}

        # --- Personal information -------------------------------------------
        personal = client_info.get("personal_info") or {}
        if "name" in personal:
            # A None name is treated like an empty string instead of crashing.
            full_name = personal["name"] or ""
            name_parts = full_name.split()
            if len(name_parts) >= 2:
                # Last token is taken as the surname, the rest as given names.
                formatted_json["last_name"] = name_parts[-1]
                formatted_json["first_middle_name"] = " ".join(name_parts[:-1])
            else:
                formatted_json["last_name"] = full_name

        # Source key in personal_info -> key in the flat profile.
        field_mappings = {
            "nationality": "nationality",
            "gender": "gender",
            "age": "age",
            "date_of_birth": "date_of_birth",
            "email": "email_address",
            "phone": "phone_number",
            "address": "residential_address",
            "languages": "languages_spoken"
        }
        for source, target in field_mappings.items():
            if personal.get(source):
                formatted_json[target] = personal[source]

        # Family, education and occupation sections are not mapped here.

        # --- Client preferences ---------------------------------------------
        prefs = client_info.get("client_preferences") or {}
        for source, target in (
            ("hobbies", "hobbies_interests"),
            ("investment_preferences", "investment_preferences"),
            ("communication_preference", "communication_preference"),
        ):
            if source in prefs:
                formatted_json[target] = prefs[source]

        # --- Risk profile ---------------------------------------------------
        risk = client_info.get("risk_profile") or {}
        for source, target in (
            ("risk_tolerance", "risk_profile"),
            ("investment_horizon", "investment_horizon"),
            ("financial_goals", "financial_goals"),
        ):
            if source in risk:
                formatted_json[target] = risk[source]

        # --- Relationship manager -------------------------------------------
        rm = client_info.get("relationship_manager") or {}
        if "name" in rm:
            formatted_json["relationship_manager"] = rm["name"]
        if "relation" in rm:
            formatted_json["rm_relation"] = rm["relation"]
        if "relationship_duration" in rm:
            # Derive the starting year from the duration; the reference year is
            # the real current year unless the caller pins one explicitly.
            reference_year = datetime.now().year if current_year is None else current_year
            formatted_json["client_since"] = str(reference_year - rm["relationship_duration"])
        if "office_location" in rm:
            formatted_json["rm_office"] = rm["office_location"]

        return formatted_json

    def _extract_risk_profile(self, text):
        """Extract client risk profile information."""
        risk_profile = {}

        # Extract risk tolerance
        risk_tolerance_patterns = [
            r'(?:risk tolerance|risk profile) (?:is|:) ([^\.]+)',
            r'has (?:a|an) ([a-zA-Z\s]+) risk (?:tolerance|profile)',
            r'comfortable with ([a-zA-Z\s]+) risk',
            r'(?:prefers|seeks) ([a-zA-Z\s]+) risk'
        ]

        for pattern in risk_tolerance_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                risk_tolerance = match.group(1).strip()
                risk_profile["risk_tolerance"] = risk_tolerance
                break

        # Extract investment horizon
        horizon_patterns = [
            r'investment horizon (?:is|of) ([^\.]+)',
            r'invests for the ([^\.]+) term',
            r'(?:looking at|considering|preferring) ([a-zA-Z\s]+) term investments',
            r'time horizon (?:is|of) ([^\.]+)'
        ]

        for pattern in horizon_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                investment_horizon = match.group(1).strip()
                risk_profile["investment_horizon"] = investment_horizon
                break

        # Extract financial goals
        goal_patterns = [
            r'financial goals? (?:include|is|are|:) ([^\.]+)',
            r'investing (?:for|to) ([^\.]+)',
            r'aims? to ([^\.]+) through (?:his|her|their) investments',
            r'goal is to ([^\.]+)'
        ]

        for pattern in goal_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                goals_text = match.group(1).strip()
                # Split by commas, 'and', or '&'
                goals = [goal.strip() for goal in re.split(r',|and|&', goals_text)]
                # Filter out empty strings
                goals = [goal for goal in goals if goal]
                risk_profile["financial_goals"] = goals
                break

        # Extract preferred investment types
        investment_type_patterns = [
            r'prefers (?:to invest in|investments in) ([^\.]+)',
            r'investment preferences (?:include|:) ([^\.]+)',
            r'invests (?:primarily|mainly|mostly) in ([^\.]+)'
        ]

        for pattern in investment_type_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                investment_types_text = match.group(1).strip()
                # Split by commas, 'and', or '&'
                investment_types = [type_.strip() for type_ in re.split(r',|and|&', investment_types_text)]
                # Filter out empty strings
                investment_types = [type_ for type_ in investment_types if type_]
                risk_profile["preferred_investments"] = investment_types
                break

        # Extract risk-related concerns
        concern_patterns = [
            r'concerned about ([^\.]+) (?:in investments|when investing)',
            r'worried about ([^\.]+) (?:risks|aspects)',
            r'cautious about ([^\.]+)'
        ]

        for pattern in concern_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                risk_profile["risk_concerns"] = match.group(1).strip()
                break

        # Extract portfolio diversification preferences
        diversification_patterns = [
            r'(?:prefers|seeks|wants) (?:a|an) ([^\.]+) diversified portfolio',
            r'diversification (?:is|should be) ([^\.]+)',
            r'portfolio (?:should be|is) ([^\.]+) diversified'
        ]

        for pattern in diversification_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                risk_profile["diversification_preference"] = match.group(1).strip()
                break

        return risk_profile

    def parse_text(self, text):
        """
        Parse the client description text into a structured JSON format.

        The text is first split into sections (currently only reported, not
        processed individually), then every field that is still empty is filled
        by running its extractor over the whole text.

        Args:
            text (str): Full client description.

        Returns:
            dict: The raw nested ``client_info`` structure (not the flattened
            profile; the flattened version is only computed to report its size).
            List-valued fields are always lists and dict-valued fields are
            always dicts, even when an extractor returns ``None``.
        """
        # Extract sections from the text
        sections = self._extract_sections(text)
        print(f"Found sections: {list(sections.keys())}")

        # Initialize the result structure
        client_info = {
            "personal_info": {},
            "family_info": {},
            "education_info": [],
            "occupation_info": [],
            "wealth_info": {},
            "client_preferences": {},
            "risk_profile": {},
            "relationship_manager": {},
            "client_summary": ""
        }

        # First pass: Process each identified section
        for section_name, section_content in sections.items():
            print(f"Processing section: {section_name} ({len(section_content)} chars)")
            # Process by section type...

        # Second pass: Process the entire text for any missed information
        print("Processing entire text as fallback...")
        extractors = [
            ("personal_info", self._extract_personal_info),
            ("family_info", self._extract_family_info),
            ("education_info", self._extract_education_info),
            ("occupation_info", self._extract_occupation_info),
            ("wealth_info", self._extract_wealth_info),
            ("client_preferences", self._extract_client_preferences),
            ("risk_profile", self._extract_risk_profile),
            ("relationship_manager", self._extract_rm_info)
        ]
        for field, extraction_method in extractors:
            current = client_info[field]
            current_values = current.values() if isinstance(current, dict) else current
            # Only override if empty
            if current and any(current_values):
                continue

            result = extraction_method(text)
            if result is None:
                # Keep the field's declared container type so the JSON output
                # never carries null where a list or dict is expected.
                result = [] if isinstance(current, list) else {}
            print(f"Fallback extraction for {field}: Found {len(result)} items")
            client_info[field] = result

        # Return raw client_info for debugging instead of formatted version
        raw_data = client_info
        formatted_data = self.format_to_client_profile(client_info)

        raw_item_count = sum(len(value) for value in raw_data.values() if isinstance(value, (list, dict)))
        print(f"Raw data has {raw_item_count} items")
        print(f"Formatted data has {len(formatted_data.keys())} fields")

        return raw_data  # Return raw_data temporarily for debugging

    def _extract_occupation_info(self, text):
        """Extract detailed occupation information from text."""
        occupation_info = []
        sentences = sent_tokenize(text)

        # Patterns for current job
        current_job_patterns = [
            r'(?:currently|presently) (?:works|working) as (?:an?|the) ([^\.]+) at ([^\.]+)',
            r'is (?:currently|presently) (?:an?|the) ([^\.]+) at ([^\.]+)',
            r'(?:works|working|employed) as (?:an?|the) ([^\.]+) (?:at|for|with) ([^\.]+)',
            r'position as (?:an?|the) ([^\.]+) (?:at|in|with) ([^\.]+)',
            r'(?:is|as) (?:an?|the) ([^\.]+) (?:at|in|with|for) ([^\.]+)'
        ]

        # Extract current job information
        for sentence in sentences:
            for pattern in current_job_patterns:
                match = re.search(pattern, sentence, re.IGNORECASE)
                if match:
                    job_title = match.group(1).strip()
                    # Clean up common issues with job title extraction
                    job_title = re.sub(r'\b(a|an|the)\b', '', job_title).strip()

                    company = match.group(2).strip()
                    # Clean up company name (remove trailing punctuation)
                    company = re.sub(r'[,\.\s]+$', '', company).strip()

                    # Create job entry
                    job_entry = {
                        "job_title": job_title,
                        "company": company,
                        "current": True
                    }

                    # Try to extract start date if present
                    date_pattern = r'since (\d{4})'
                    date_match = re.search(date_pattern, sentence)
                    if date_match:
                        job_entry["start_year"] = int(date_match.group(1))

                    # Try to extract industry/sector if present
                    industry_pattern = r'in the ([a-zA-Z\s]+) (?:industry|sector)'
                    industry_match = re.search(industry_pattern, sentence)
                    if industry_match:
                        job_entry["industry"] = industry_match.group(1).strip()

                    occupation_info.append(job_entry)
                    break

        # Patterns for previous jobs
        previous_job_patterns = [
            r'(?:previously|formerly) (?:worked|working) as (?:an?|the) ([^\.]+) at ([^\.]+)',
            r'(?:worked|was) as (?:an?|the) ([^\.]+) (?:at|in|with) ([^\.]+) (?:from|between) (\d{4})(?:[^\d]|$)',
            r'(?:worked|was) (?:at|for|with) ([^\.]+) as (?:an?|the) ([^\.]+)',
            r'experience as (?:an?|the) ([^\.]+) at ([^\.]+)'
        ]

        # Extract previous job information
        for sentence in sentences:
            for pattern in previous_job_patterns:
                match = re.search(pattern, sentence, re.IGNORECASE)
                if match:
                    # Handle different pattern groupings
                    if "from|between" in pattern:
                        job_title = match.group(1).strip()
                        company = match.group(2).strip()
                        start_year = int(match.group(3))

                        # Try to extract end year if present
                        end_year_pattern = r'(?:to|until|and) (\d{4})'
                        end_year_match = re.search(end_year_pattern, sentence)
                        end_year = int(end_year_match.group(1)) if end_year_match else None
                    elif "experience as" in pattern:
                        job_title = match.group(1).strip()
                        company = match.group(2).strip()
                        start_year = None
                        end_year = None
                    elif "worked|was at|for|with" in pattern:
                        company = match.group(1).strip()
                        job_title = match.group(2).strip()
                        start_year = None
                        end_year = None
                    else:
                        job_title = match.group(1).strip()
                        company = match.group(2).strip()
                        start_year = None
                        end_year = None

                    # Clean up job title and company
                    job_title = re.sub(r'\b(a|an|the)\b', '', job_title).strip()
                    company = re.sub(r'[,\.\s]+$', '', company).strip()

                    # Create job entry
                    job_entry = {
                        "job_title": job_title,
                        "company": company,
                        "current": False
                    }

                    # Add years if present
                    if start_year:
                        job_entry["start_year"] = start_year
                    if end_year:
                        job_entry["end_year"] = end_year

                    # Try to extract industry/sector if present
                    industry_pattern = r'in the ([a-zA-Z\s]+) (?:industry|sector)'
                    industry_match = re.search(industry_pattern, sentence)
                    if industry_match:
                        job_entry["industry"] = industry_match.group(1).strip()

                    # Check if it's not a duplicate of current job
                    is_duplicate = False
                    for existing_job in occupation_info:
                        if (existing_job.get("job_title", "").lower() == job_title.lower() and
                            existing_job.get("company", "").lower() == company.lower()):
                            is_duplicate = True
                            break

                    if not is_duplicate:
                        occupation_info.append(job_entry)
                    break

        # Extract career length if available
        career_length_pattern = r'(?:career|experience) (?:of|spanning) (\d+) years'
        career_match = re.search(career_length_pattern, text, re.IGNORECASE)
        if career_match:
            # Add career length as a separate item in the info list
            career_info = {
                "career_length_years": int(career_match.group(1))
            }
            occupation_info.append(career_info)

        # Extract industry expertise
        expertise_patterns = [
            r'expertise in ([^\.]+)',
            r'specialized in ([^\.]+)',
            r'specialization in ([^\.]+)'
        ]

        for pattern in expertise_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                expertise = match.group(1).strip()
                # Add expertise as a separate item
                expertise_info = {
                    "expertise": expertise
                }
                occupation_info.append(expertise_info)
                break

        # If no specific job entries were found, try simpler extraction
        if not any(("job_title" in job) for job in occupation_info):
            # Look for simple mentions of profession
            profession_patterns = [
                r'is an? ([A-Za-z\s]+) (?:by profession|by trade)',
                r'(?:profession|occupation) is ([^\.]+)',
                r'works as an? ([^\.]+)',
                r'career as an? ([^\.]+)'
            ]

            for pattern in profession_patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if match:
                    profession = match.group(1).strip()
                    profession = re.sub(r'\b(a|an|the)\b', '', profession).strip()

                    occupation_info.append({
                        "job_title": profession,
                        "current": True
                    })
                    break

            return occupation_info

    def _extract_client_preferences(self, text):
        """Extract client preferences and interests."""
        preferences = {}

        # Extract hobbies and interests
        hobby_patterns = [
            r'(?:hobbies|interests|enjoys|enjoys doing) (?:include|:)? ([^\.]+)',
            r'fond of ([^\.]+)',
            r'passionate about ([^\.]+)',
            r'leisure time (?:is spent|includes) ([^\.]+)'
        ]

        for pattern in hobby_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                hobbies_text = match.group(1).strip()
                # Split by commas, 'and', or '&'
                hobbies = [hobby.strip() for hobby in re.split(r',|and|&', hobbies_text)]
                # Filter out empty strings
                hobbies = [hobby for hobby in hobbies if hobby]
                preferences["hobbies"] = hobbies
                break

        # Extract investment preferences
        investment_patterns = [
            r'prefers investments? in ([^\.]+)',
            r'investment preferences? (?:include|:) ([^\.]+)',
            r'interested in investing in ([^\.]+)',
            r'prefers (?:to invest|investing) in ([^\.]+)'
        ]

        for pattern in investment_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                investment_text = match.group(1).strip()
                # Split by commas, 'and', or '&'
                investments = [inv.strip() for inv in re.split(r',|and|&', investment_text)]
                # Filter out empty strings
                investments = [inv for inv in investments if inv]
                preferences["investment_preferences"] = investments
                break

        # Extract communication preferences
        communication_patterns = [
            r'prefers (?:to be contacted|communication) (?:via|through|by) ([^\.]+)',
            r'communication preference is ([^\.]+)',
            r'(?:prefers|favors) ([^\.]+) (?:as a means of|for) communication'
        ]

        for pattern in communication_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                preferences["communication_preference"] = match.group(1).strip()
                break

        # Extract meeting frequency preferences
        meeting_patterns = [
            r'prefers (?:to meet|meetings) ([^\.]+)',
            r'likes to be updated ([^\.]+)',
            r'prefers (?:to be contacted|contact) ([^\.]+)'
        ]

        for pattern in meeting_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                meeting_pref = match.group(1).strip()
                # Check if it contains frequency information
                if re.search(r'(?:weekly|monthly|quarterly|annually|biweekly|daily)', meeting_pref, re.IGNORECASE):
                    preferences["meeting_frequency"] = meeting_pref
                    break

        # Extract financial service preferences
        service_patterns = [
            r'interested in ([^\.]+) services',
            r'looking for ([^\.]+) (?:advice|services|solutions)',
            r'seeking ([^\.]+) (?:services|advice)'
        ]

        for pattern in service_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                service_text = match.group(1).strip()
                # Split by commas, 'and', or '&'
                services = [service.strip() for service in re.split(r',|and|&', service_text)]
                # Filter out empty strings
                services = [service for service in services if service]
                preferences["service_preferences"] = services
                break

        # Extract travel preferences
        travel_patterns = [
            r'(?:enjoys|likes) (?:traveling|travelling) to ([^\.]+)',
            r'(?:travel|travelling|traveling) (?:preferences|interests) (?:include|:) ([^\.]+)',
            r'traveled to ([^\.]+)'
        ]

        for pattern in travel_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                travel_text = match.group(1).strip()
                # Split by commas, 'and', or '&'
                travel = [destination.strip() for destination in re.split(r',|and|&', travel_text)]
                # Filter out empty strings
                travel = [destination for destination in travel if destination]
                preferences["travel_interests"] = travel
                break

        # Extract lifestyle preferences
        lifestyle_patterns = [
            r'lifestyle (?:includes|revolves around) ([^\.]+)',
            r'values ([^\.]+) in (?:life|day-to-day)',
            r'prioritizes ([^\.]+) in (?:life|personal matters)'
        ]

        for pattern in lifestyle_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                lifestyle_text = match.group(1).strip()
                # Split by commas, 'and', or '&'
                lifestyle = [value.strip() for value in re.split(r',|and|&', lifestyle_text)]
                # Filter out empty strings
                lifestyle = [value for value in lifestyle if value]
                preferences["lifestyle_values"] = lifestyle
                break

        return preferences

def process_client_description_file(file_path, output_json_path=None):
    """
    Convert a client description text file into a JSON file.

    The file is read as UTF-8, parsed with ``ClientDescriptionParser.parse_text``
    and the resulting structure is written out as indented JSON.

    Args:
        file_path (str): Location of the client description text file.
        output_json_path (str, optional): Destination of the JSON output. When
            omitted, the input path with its extension swapped for ``.json``
            is used.

    Returns:
        dict: The structured data produced by the parser.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Fail early when there is nothing to read
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Input file not found: {file_path}")

    # Load the description and run it through the parser
    with open(file_path, 'r', encoding='utf-8') as source:
        description = source.read()
    structured = ClientDescriptionParser().parse_text(description)

    # Fall back to "<input stem>.json" when no destination was given
    if output_json_path is None:
        stem = os.path.splitext(file_path)[0]
        output_json_path = f"{stem}.json"

    # Persist the result
    with open(output_json_path, 'w', encoding='utf-8') as sink:
        json.dump(structured, sink, indent=2, ensure_ascii=False)

    print(f"Successfully processed {file_path}")
    print(f"JSON file created at {output_json_path}")

    return structured

# Execute the processing if this script is run directly
if __name__ == "__main__":
    # Process the client description stored in description_2.txt
    file_path = "description_2.txt"
    result = process_client_description_file(file_path)

    # Print the result to console as well. ensure_ascii=False keeps non-ASCII
    # characters readable, matching what is written to the JSON file.
    print("\nExtracted JSON data:")
    print(json.dumps(result, indent=2, ensure_ascii=False))

Found sections: ['summary', 'family', 'education', 'occupation']
Processing section: summary (145 chars)
Processing section: family (72 chars)
Processing section: education (131 chars)
Processing section: occupation (668 chars)
Processing entire text as fallback...
Fallback extraction for personal_info: Found 5 items
Fallback extraction for family_info: Found 0 items
Fallback extraction for education_info: Found 1 items
Fallback extraction for occupation_info: Found 0 items
Fallback extraction for wealth_info: Found 0 items
Fallback extraction for client_preferences: Found 0 items
Fallback extraction for risk_profile: Found 0 items
Fallback extraction for relationship_manager: Found 0 items
Raw data has 6 items
Formatted data has 5 fields
Successfully processed description_2.txt
JSON file created at description_2.json

Extracted JSON data:
{
  "personal_info": {
    "name": "Lina Carolin Zimmermann",
    "age": 69,
    "nationality": "Germany",
    "gender": "Female",
    "profession":