# this script builds a loop over all documents to extract the necessary information

In [3]:
#load all necessary packages
import pandas
import os
import requests
import re
import pickle
from bs4 import BeautifulSoup

In [4]:
# Folder holding the data files that the batch loops iterate over.
# Point `user` at the folder that contains your GitHub checkout.
user = '/Users/natalies_macbook/Documents/'
directory = ''.join([user, 'GitHub/INSA/EU/relevant'])

# patterns for court decisions

In [6]:
court_decision_pattern = r'<p class="oj-doc-ti".*?>Judgment of the General Court.*?</p>'

def extract_court_decision(html_content):
    """Extract delisting details and metadata from a court-decision document.

    Args:
        html_content: The document either as raw HTML markup (``str``) or as
            an already-parsed ``BeautifulSoup`` object. The batch loops in
            this notebook pass both forms, so both are accepted; a parsed
            document is used as-is instead of being parsed again.

    Returns:
        dict with the keys
            ``entity_name``    text captured by the name pattern, or None
            ``Delisting``      1 if the 'oj-normal' text contains "Annuls",
                               otherwise 0
            ``Delisting_case`` text between "Annuls " and ";2.Orders", or None
            ``url``, ``celex``, ``publication_date``, ``date``, ``title``,
            ``law_number``, ``case_number``
                               lists of every match (possibly empty)
            ``legal_base``     list of ``(CFSP|EU, number)`` tuples
    """
    # Parse only when handed markup: feeding a BeautifulSoup object back into
    # the BeautifulSoup constructor is not a supported input.
    if isinstance(html_content, BeautifulSoup):
        soup = html_content
    else:
        soup = BeautifulSoup(html_content, 'html.parser')
    soup_str = str(soup)  # serialise once; several regexes below reuse it

    # Define the patterns
    delist_pattern = r'Annuls'
    delisting_case_pattern = r'Annuls (.+?);2.Orders'
    name_pattern2 = r'in so far as they concern (.+?);2.Orders'
    name_pattern1 = r'Orders (.+?) to pay'

    # Initialize variables
    Delisting = 0  # Default to 0 (not a delisting)
    delisting_case = None
    entity_name = None

    # Extract the main text content from the relevant <p> tags
    paragraphs = soup.find_all('p', class_='oj-normal')
    text = ' '.join(p.get_text() for p in paragraphs)

    # An annulment names the party after "in so far as they concern";
    # otherwise the party is taken from the costs order ("Orders X to pay").
    if re.search(delist_pattern, text):
        Delisting = 1
        match = re.search(name_pattern2, text)
    else:
        match = re.search(name_pattern1, text)
    if match:
        entity_name = match.group(1)

    # Extract delisting case
    match = re.search(delisting_case_pattern, text)
    if match:
        delisting_case = match.group(1)

    # Remaining metadata: every match is kept, so these values are lists.
    # str(None) == 'None' makes the findall calls safe when a tag is missing.
    return {
        'entity_name': entity_name,
        'Delisting': Delisting,  # Binary indicator (1 for delisting, 0 otherwise)
        'Delisting_case': delisting_case,
        'url': [a['href'] for a in soup.find_all('a', href=True) if '.eu/legal-content/EN/TXT/HTML/' in a['href']],
        'celex': re.findall(r'uri=CELEX%([^"]+)', soup_str),
        'publication_date': re.findall(r'\d{1,2}\.\d{1,2}\.\d{4}', str(soup.find('p', class_='oj-hd-date'))),
        'date': re.findall(r'of (\d+ \w+ \d+)', str(soup.find('p', class_='oj-doc-ti'))),
        'title': [t.get_text() for t in soup.find_all('p', class_='oj-doc-ti')],
        'law_number': re.findall(r'C (\d+/\d+)', str(soup.find('p', class_='oj-hd-oj'))),
        'case_number': re.findall(r'Case\sT-\d+/\d+', soup_str),
        'legal_base': re.findall(r'\((CFSP|EU)\)\s*(\d+/\d+)', soup_str)
    }


# patterns for notices

In [8]:
notice_pattern = r'<p class="oj-doc-ti".*?">Notice.*?</p>'

def notice_pattern_extraction(soup):
    """Extract document-level fields from a parsed notice.

    Args:
        soup: Parsed document. Only ``soup.find('p', class_=...)`` and
            ``str(soup)`` are used.

    Returns:
        dict with the keys ``publication_date``, ``title``, ``url``,
        ``celex``, ``decision_match``, ``legal_action``, ``legal_base``,
        ``case_number``, ``law_number``, ``legal_base_alt``,
        ``action_intended`` and ``notice_target``. Each value is the first
        match found in the document, or None when nothing matched.
    """
    # Document base information
    url_pattern = r'href="(.*?\.eu/legal-content/EN/TXT/HTML/.*?)"'
    celex_pattern = r'uri=CELEX%([^"]+)'

    # Extract publication date and title
    publication_date = soup.find('p', class_='oj-hd-date')
    publication_date = publication_date.get_text(strip=True) if publication_date else None

    title = soup.find('p', class_='oj-doc-ti')
    title = title.get_text(strip=True) if title else None

    # Serialise once; every regex below runs on this string.
    soup_str = str(soup)

    # Extract URL and CELEX
    url_match = re.search(url_pattern, soup_str)
    celex_match = re.search(celex_pattern, soup_str)
    url = url_match.group(1) if url_match else None
    celex = celex_match.group(1) if celex_match else None

    # Document information patterns
    decision_match_pattern = r'(Decision[^,]*|Regulation[^,]*|Annex[^,]*)'
    legal_action_pattern = r'<p[^>]*>(.*?(implementing|amended|implemented).*?)</p>'
    # Either a paragraph containing "regard to" or one that starts with
    # "amending".  The first alternative previously had to be followed
    # directly by </p>, so a "Having regard to ..." recital never matched.
    legal_base_pattern = r'<p[^>]*>(.*?regard to.*?|amending.*?)</p>'
    case_number_pattern = r'Case\sT-\d+/\d+'
    law_number_pattern = r'<p class="oj-hd-oj">C (\d+/\d+)</p>'
    legal_base_alt_pattern = r'\((CFSP|EU)\)\s*(\d+/\d+)'

    decision_match = re.search(decision_match_pattern, soup_str)
    legal_action = re.search(legal_action_pattern, soup_str)
    legal_base = re.search(legal_base_pattern, soup_str)
    case_number = re.search(case_number_pattern, soup_str)
    law_number = re.search(law_number_pattern, soup_str)
    legal_base_alt = re.search(legal_base_alt_pattern, soup_str)

    # List updates, removals, deletions
    action_intended_pattern = r'<p class="(oj-normal|oj-bold|normal|bold)">\s*(.*?\s*(?:removed|updated|deleted|replaced|amended)\s*.*?)\s*</p>'
    action_intended = re.search(action_intended_pattern, soup_str)
    notice_target = re.search(r'The following information is brought to the attention of\s*(.*?),', soup_str)

    return {'publication_date': publication_date,
            'title': title,
            'url': url,
            'celex': celex,
            'decision_match': decision_match.group(1) if decision_match else None,
            'legal_action': legal_action.group(1) if legal_action else None,
            'legal_base': legal_base.group(1) if legal_base else None,
            'case_number': case_number.group(0) if case_number else None,
            'law_number': law_number.group(1) if law_number else None,
            'legal_base_alt': legal_base_alt.group(0) if legal_base_alt else None,
            # group(2) is the paragraph text; group(1) is the CSS class
            'action_intended': action_intended.group(2) if action_intended else None,
            'notice_target': notice_target.group(1) if notice_target else None}


# main pattern all

In [10]:
#patterns for the general listing documents
def extract_document_info(soup):
    """Extract document-level fields from a parsed listing document.

    Args:
        soup: Parsed document. Only ``soup.find('p', class_=...)`` and
            ``str(soup)`` are used.

    Returns:
        dict of the first match per field (None when nothing matched):
        ``publication_date``, ``title``, ``url``, ``celex``,
        ``decision_match``, ``legal_action`` (the matched keyword only),
        ``legal_base``, ``case_number``, ``law_number`` (including the
        leading "C "), ``legal_base_alt``, ``action_intended``,
        ``listing_position``, ``name_2``, ``un_sanction_date``,
        ``un_sanction`` (the string '1' or '0') and
        ``identifying_information``.
    """
    # Document base information
    url_pattern = r'href="(.*?\.eu/legal-content/EN/TXT/HTML/.*?)"'
    celex_pattern = r'uri=CELEX%([^"]+)'

    # Extract publication date and title
    publication_date = soup.find('p', class_='oj-hd-date')
    publication_date = publication_date.get_text(strip=True) if publication_date else None

    title = soup.find('p', class_='oj-doc-ti')
    title = title.get_text(strip=True) if title else None

    # Serialise once; every regex below runs on this string.
    soup_str = str(soup)

    # Extract URL and CELEX
    url_match = re.search(url_pattern, soup_str)
    celex_match = re.search(celex_pattern, soup_str)
    url = url_match.group(1) if url_match else None
    celex = celex_match.group(1) if celex_match else None

    # Document information patterns
    decision_match_pattern = r'(Decision[^,]*|Regulation[^,]*|Annex[^,]*)</p>'
    legal_action_pattern = r'<p[^>]*>.*?(implementing|amended|implemented).*?</p>'
    # Either a paragraph containing "regard to" or one that starts with
    # "amending".  The first alternative previously had to be followed
    # directly by </p>, so a "Having regard to ..." recital never matched.
    legal_base_pattern = r'<p[^>]*>(.*?regard to.*?|amending.*?)</p>'
    case_number_pattern = r'Case\sT-\d+/\d+'
    law_number_pattern = r'<p class="oj-hd-oj">(C (\d+/\d+))</p>'
    legal_base_alt_pattern = r'\((CFSP|EU)\)\s*(\d+/\d+)'

    decision_match = re.search(decision_match_pattern, soup_str)
    legal_action = re.search(legal_action_pattern, soup_str)
    legal_base = re.search(legal_base_pattern, soup_str)
    case_number = re.search(case_number_pattern, soup_str)
    law_number = re.search(law_number_pattern, soup_str)
    legal_base_alt = re.search(legal_base_alt_pattern, soup_str)

    # List updates, removals, deletions
    action_intended_pattern = r'<p class="(oj-normal|oj-bold|normal|bold)">\s*(.*?\s*(?:removed|updated|deleted|replaced|amended)\s*.*?)\s*</p>'
    action_intended = re.search(action_intended_pattern, soup_str)

    # Annex with numbers.
    # The class alternatives are wrapped in (?:...) on purpose: a bare
    # `oj-normal|normal` splits the WHOLE pattern in two, so the bare text
    # 'class="oj-normal' matched on its own, captured nothing, and flagged
    # almost every document as UN-designated.
    listing_position_pattern = r'class="(?:oj-normal|normal)">(\d+.*?)'
    name_2_pattern = r'<p class="(?:oj-normal|normal)">\s*<span class="(?:oj-bold|bold)">(.*?)</span>'
    un_sanction_date_pattern = r'class="(?:oj-normal|normal)">Date of UN designation(.*?)</p>'
    # Text up to </p> that mentions a date of birth in either spelling.
    identifying_information_pattern = r'>(.*?(?:Date of Birth:|DOB).*?)</p>'

    listing_position = re.search(listing_position_pattern, soup_str)
    name_2 = re.search(name_2_pattern, soup_str)
    un_sanction_date = re.search(un_sanction_date_pattern, soup_str)
    un_sanction = '1' if un_sanction_date else '0'
    identifying_information = re.search(identifying_information_pattern, soup_str)

    # Return all extracted information
    return {
        'publication_date': publication_date,
        'title': title,
        'url': url,
        'celex': celex,
        'decision_match': decision_match.group(1) if decision_match else None,
        'legal_action': legal_action.group(1) if legal_action else None,
        'legal_base': legal_base.group(1) if legal_base else None,
        'case_number': case_number.group(0) if case_number else None,
        'law_number': law_number.group(1) if law_number else None,
        'legal_base_alt': legal_base_alt.group(0) if legal_base_alt else None,
        'action_intended': action_intended.group(2) if action_intended else None,
        'listing_position': listing_position.group(1) if listing_position else None,
        'name_2': name_2.group(1) if name_2 else None,
        'un_sanction_date': un_sanction_date.group(1) if un_sanction_date else None,
        'un_sanction': un_sanction,
        'identifying_information': identifying_information.group(1) if identifying_information else None,
    }
print('successfully memorized the listing patterns')

successfully memorized the listing patterns



# Batch loop (first version): reads every .txt file in `directory`, collects
# court decisions and notices, and writes one record per 'oj-table' row.
notice_list = []
sanction_list = []
court_decision_list = []

# Maps a cell's 'data-header' attribute to the key used in `entity_info`.
header_keys = {
    'Name': 'name',
    'Reasons': 'reason',
    'Identifying Information': 'ident_info',
    'Date of Listing': 'listing_date',
}

# Labelled fields searched for in the row's HTML; group 1 is the value.
row_field_patterns = {
    'position_target': r'Position\(s\):(.*?)</p>',
    'date_of_birth': r'(?:DOB|Date of Birth|Born on): (\d{1,2}/\d{1,2}/\d{4})',
    # grouped so that "POB" and "Place of Birth" share the capture group
    'place_of_birth': r'(?:POB|Place of Birth):(.*?)</p>',
    'nationality': r'Nationality:(.*?)</p>',
    'gender': r'Gender:(.*?)</p>',
    'passport_number': r'Passport Number:(.*?)</p>',
    'social_media': r'Social media:(.*?)</p>',
    'email': r'Email:(.*?)</p>',
    'telephone': r'Telephone:(.*?)</p>',
    'address': r'Address:(.*?)</p>',
    'code': r'Code:(.*?)</p>',
}

for filename in os.listdir(directory):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(directory, filename)
    try:
        with open(file_path, encoding='utf-8') as f:
            data = f.read()
    except UnicodeDecodeError:
        # Fall back for files that are not valid UTF-8.
        with open(file_path, encoding='latin-1') as f:
            data = f.read()

    # Parse the file content with BeautifulSoup
    soup = BeautifulSoup(data, 'html.parser')
    soup_str = str(soup)

    # Extract document-level information
    document_info = extract_document_info(soup)

    # Court decisions: hand over the markup string, which is what
    # extract_court_decision parses.
    if re.search(court_decision_pattern, soup_str):
        court_decision_list.append(extract_court_decision(soup_str))

    # Notices
    if re.search(notice_pattern, soup_str):
        notice_list.append(notice_pattern_extraction(soup))

    # One record per table row of the class 'oj-table'
    for row in soup.find_all('tr', class_='oj-table'):
        row_str = str(row)

        # Fresh per row, so nothing leaks in from the previous row.
        entity_info = {}
        for col in row.find_all('td'):
            key = header_keys.get(col.get('data-header'))
            if key:
                entity_info[key] = col.get_text(strip=True)

        row_fields = {}
        for field, pattern in row_field_patterns.items():
            match = re.search(pattern, row_str)
            row_fields[field] = match.group(1) if match else None

        # Prefer the dedicated 'Date of Listing' cell; otherwise fall back to
        # the first d/m/yyyy date anywhere in the row.
        date_match = re.search(r'\d{1,2}/\d{1,2}/\d{4}', row_str)
        date_of_listing = entity_info.get('listing_date') or (
            date_match.group(0) if date_match else None
        )

        # Combine document_info with row-specific information
        sanction_dict = {
            **document_info,
            'name_target': entity_info.get('name'),
            'alias': None,  # no alias column is parsed in this version
            **row_fields,
            'date_of_listing': date_of_listing,
        }

        sanction_list.append(sanction_dict)

In [32]:
#define all the objects
notice_list = []
sanction_list = []
court_decision_list = []
ident_info = None
reason = None
listing_date = None
target_name = None

# Labelled fields searched for in the HTML of the 'Identifying Information'
# cell; group 1 is the value.
ident_field_patterns = {
    'position_target': r'Position\(s\):(.*?)</p>',
    'date_of_birth': r'(?:DOB|Date of Birth|Born on): (\d{1,2}/\d{1,2}/\d{4})',
    # grouped so that "POB" and "Place of Birth" share the capture group
    'place_of_birth': r'(?:POB|Place of Birth):(.*?)</p>',
    'nationality': r'Nationality:(.*?)</p>',
    'gender': r'Gender:(.*?)</p>',
    'passport_number': r'Passport Number:(.*?)</p>',
    'social_media': r'Social media:(.*?)</p>',
    'email': r'Email:(.*?)</p>',
    'telephone': r'Telephone:(.*?)</p>',
    'address': r'Address:(.*?)</p>',
    'code': r'Code:(.*?)</p>',
}

# Patterns for annexes laid out as numbered entries
listing_position_pattern = r'class="oj-normal">(\d+.*?)'
name_2_pattern = r'<p class="oj-normal">\s*<span class="oj-bold">(.*?)</span>'
identifying_info_pattern = r'<p class="oj-normal">(.*?)</p>'
# class alternatives grouped: a bare `oj-normal|normal` would let
# 'class="oj-normal' match on its own and flag every row
un_sanction_date_pattern = r'class="(?:oj-normal|normal)">Date of UN designation(.*?)</p>'

# Loop through all files in the directory
for filename in os.listdir(directory):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(directory, filename)
    try:
        with open(file_path, encoding='utf-8') as f:
            data = f.read()
    except UnicodeDecodeError:
        # Fall back for files that are not valid UTF-8.
        with open(file_path, encoding='latin-1') as f:
            data = f.read()

    # Parse the file content with BeautifulSoup
    soup = BeautifulSoup(data, 'html.parser')
    soup_str = str(soup)

    # Extract document-level information
    document_info = extract_document_info(soup)

    # Check if the document matches a court decision pattern
    if re.search(court_decision_pattern, soup_str):
        court_decision_list.append(extract_court_decision(soup_str))

    # Check if the document matches a notice pattern
    if re.search(notice_pattern, soup_str):
        notice_list.append(notice_pattern_extraction(soup))

    # One record per table row of the class 'oj-table'.  Each row is handled
    # exactly once; a nested second pass over `rows` would make every record
    # carry the values of the last row.
    for row in soup.find_all('tr', class_='oj-table'):
        row_str = str(row)  # whole row as a string for regex matching

        # Reset per row so values never leak from the previous row or file.
        target_name = reason = ident_info = listing_date = None
        ident_html = None

        for col in row.find_all('td'):
            header_text = col.get('data-header')  # this is for listings in the annex in table format
            if header_text == 'Name':
                target_name = col.get_text(strip=True)
            elif header_text == 'Reasons':
                reason = col.get_text(strip=True)
            elif header_text == 'Identifying Information':
                ident_info = col.get_text(strip=True)
                # Keep the markup too: the field patterns end on </p>,
                # which get_text() strips away.
                ident_html = str(col)
            elif header_text == 'Date of Listing':
                listing_date = col.get_text(strip=True)

        # Store plain strings, never re.Match objects: Match objects cannot
        # be pickled, and the results are pickled further down.
        ident_fields = {}
        for field, pattern in ident_field_patterns.items():
            match = re.search(pattern, ident_html) if ident_info else None
            ident_fields[field] = match.group(1).strip() if match else None

        # Annex with numbers
        listing_position = re.search(listing_position_pattern, row_str)
        name_2 = re.search(name_2_pattern, row_str)
        un_sanction_date = re.search(un_sanction_date_pattern, row_str)
        un_sanction = '1' if un_sanction_date else '0'
        identifying_info = re.findall(identifying_info_pattern, row_str)  # all <p> tags of the row

        # Assuming the first <p> contains the name
        name_3 = identifying_info[0] if identifying_info else None
        dob_3 = next((info for info in identifying_info if "Date of Birth:" in info or "DOB" in info), None)
        passport_number_3 = next((info for info in identifying_info if "Passport Number:" in info), None)
        pob_3 = next((info for info in identifying_info if "Place of Birth:" in info or "POB" in info), None)

        # Combine document_info with row-specific information into a dictionary
        sanction_dict = {
            **document_info,  # Include document-level information
            'name_target': target_name if target_name else None,
            # 'alias' is not extracted yet
            'reason_txt': reason if reason else None,
            **ident_fields,
            'date_of_listing': listing_date if listing_date else None,
            'listing_position': listing_position.group(1) if listing_position else None,
            'name_target_2': name_2.group(1) if name_2 else None,
            'un_sanction': un_sanction,
            'name_target_3': name_3,
            'dob_3': dob_3,
            'passport_number_3': passport_number_3,
            'pob_3': pob_3
        }
        sanction_list.append(sanction_dict)

KeyboardInterrupt: 

*To Do:

1. remove HTML/encoding artefacts such as \xa0 (non-breaking space)

2. clean up identifying_info key

3. get rid of listed commodities

In [None]:
# Scratch cell: prints the numbered-annex fields of every 'oj-table' row of
# the document currently held in `soup`.
rows = soup.find_all('tr', class_='oj-table')
for row in rows:
    row_str = str(row)  # Convert the entire row to a string for regex matching

    listing_position_pattern = r'class="oj-normal">(\d+.*?)'
    name_2_pattern = r'<p class="oj-normal">\s*<span class="oj-bold">(.*?)</span>'
    identifying_info_pattern = r'<p class="oj-normal">(.*?)</p>'
    # class alternatives grouped: a bare `oj-normal|normal` would let
    # 'class="oj-normal' match on its own and flag every row
    un_sanction_date_pattern = r'class="(?:oj-normal|normal)">Date of UN designation(.*?)</p>'

    listing_position = re.search(listing_position_pattern, row_str)
    name_2 = re.search(name_2_pattern, row_str)
    un_sanction_date = re.search(un_sanction_date_pattern, row_str)
    un_sanction = '1' if un_sanction_date else '0'
    identifying_info = re.findall(identifying_info_pattern, row_str)  # Use findall to get multiple <p> tags

    if identifying_info:
        name_3 = identifying_info[0]  # Assuming the first <p> contains the name
        # Rows with fewer than three paragraphs have no reason paragraph.
        reason_3 = identifying_info[2] if len(identifying_info) > 2 else None

        # Look for specific information within the identifying info
        dob_3 = next((info for info in identifying_info if "Date of Birth:" in info or "DOB" in info), None)
        passport_number_3 = next((info for info in identifying_info if "Passport Number:" in info), None)
        pob_3 = next((info for info in identifying_info if "Place of Birth:" in info or "POB" in info), None)

        # Clean up the extracted data
        dob_3 = dob_3.strip() if dob_3 else None
        passport_number_3 = passport_number_3.strip() if passport_number_3 else None
        pob_3 = pob_3.strip() if pob_3 else None

        # Example output dictionary, replace with your storage logic.
        # The *_2 keys are kept as they were; their values are the *_3
        # variables computed above.
        extracted_data = {
            'listing_position': listing_position.group(1) if listing_position else None,
            'name_2': name_2.group(1) if name_2 else None,
            'un_sanction': un_sanction,
            'name_3': name_3,
            'dob_2': dob_3,
            'passport_number_2': passport_number_3,
            'pob_2': pob_3
        }

        print(extracted_data)  # or store it in a list or a file


In [None]:
#annex with numbers:
# Draft cell: works on the document currently held in `soup` and leaves the
# values of the last processed row in the variables below.
rows = soup.find_all('tr', class_='oj-table')
for row in rows:
    cols = row.find_all('td')
    if cols:
        row_str = str(row)  # search the row itself, not the whole document

        listing_position_pattern = r'class="oj-normal">(\d+.*?)'
        name_2_pattern = r'<p class="oj-normal">\s*<span class="oj-bold">(.*?)</span>'
        identifying_info_pattern = r'<p class="oj-normal">(.*?)</p>'
        # class alternatives grouped: a bare `oj-normal|normal` would let
        # 'class="oj-normal' match on its own and flag every row
        un_sanction_date_pattern = r'class="(?:oj-normal|normal)">Date of UN designation(.*?)</p>'

        listing_position = re.search(listing_position_pattern, row_str)
        name_2 = re.search(name_2_pattern, row_str)
        un_sanction_date = re.search(un_sanction_date_pattern, row_str)
        un_sanction = '1' if un_sanction_date else '0'
        # findall returns the text of every matching <p>, so paragraphs can
        # be addressed by position
        identifying_info = re.findall(identifying_info_pattern, row_str)

        name_3 = identifying_info[0] if identifying_info else None
        reason = identifying_info[2] if len(identifying_info) > 2 else None
        # Match objects (or None); alternatives grouped so both spellings
        # share the same capture
        dob_2 = re.search(r'>((?:Date of Birth:|DOB).*?)</p>', row_str)
        passport_number_2 = re.search(r'>(Passport Number:.*?)</p>', row_str)
        pob_2 = re.search(r'>(.*?(?:Place of Birth:|POB).*?)</p>', row_str)

In [None]:
48-1">‘ANNEX IV</p>
                     <p class="oj-doc-ti oj-quotation-ti">List of natural or legal persons, entities or bodies, referred to in Article 2(7), 2a(7) and 2b(1)</p>
    
     <table width="100%" border="0" cellspacing="0" cellpadding="0">
                        <col width="100%"/>
                        <tbody>
                           <tr>
                              <td valign="top"  >
                                 <p class="oj-normal">JSC Sirius</p>
                              </td>
                           </tr>
                        </tbody>
                     </table>
                     <table width="100%" border="0" cellspacing="0" cellpadding="0">
                        <col width="100%"/>
           
        
        <p class="oj-doc-ti" id="d1e34-59-1">ANNEX I</p>
            <div>
                <div>
                    <div>
                        <p class="oj-doc-ti oj-quotation-ti" id="d1e41-59-1">
                                             ‘ANNEX I</p>
                        <p id="d1e48-59-1" class="oj-ti-grseq-1">
                            <span class="oj-bold">List of persons referred to in Article 5(1)(a)</span>
                        </p>
                        <table width="100%" border="0" cellspacing="0" cellpadding="0">
                            <col width="4%"/>
                            <col width="96%"/>
                            <tbody>
                                <tr>
                                    <td valign="top">
                                        <p class="oj-normal">1.</p>
                                    </td>
                                    <td valign="top">
                                        <p class="oj-normal">
                                            <span class="oj-bold">AL-BAGHDADI, Dr Abdulqader Mohammed</span>
                                        </p>
                                        <p class="oj-normal">Passport number: B010574. Date of birth: 1.7.1950.</p>
                                        <p class="oj-normal">Head of the Liaison Office of the Revolutionary Committees. Revolutionary Committees involved in violence against demonstrators.</p>
                                        <p class="oj-normal">Date of UN designation: 26.2.2011.</p>
                                    </td>
                                </tr>
                            </tbody>
                        </table>
                        <table width="100%" border="0" cellspacing="0" cellpadding="0">
                            <col width="4%"/>
                            <col width="96%"/>
                            <tbody>
                                <tr>
                                    <td valign="top">
                                        <p class="oj-normal">2.</p>
                                    </td>
                                    <td valign="top">
                                        <p class="oj-normal">
                                            <span class="oj-bold">DIBRI, Abdulqader Yusef</span>
                                        </p>
                                        <p class="oj-normal">Date <tbody>
    
    

# pickling the results

In [None]:

# Write each result list to its own pickle file in the working directory.
for pickle_name, records in (
    ("batched_sanction_EU", sanction_list),
    ("batched_notice_EU", notice_list),
    ("batched_court_decision_EU", court_decision_list),
):
    with open(pickle_name, "wb") as fp:
        pickle.dump(records, fp)

In [None]:

#with open("batched_sanction_EU", "rb") as fp:   #Pickling
#    pickle.load(fp)

In [None]:
import os
import re
from bs4 import BeautifulSoup

# Fresh result lists plus the per-row scratch values, all starting out empty.
notice_list, sanction_list, court_decision_list = [], [], []
ident_info = reason = listing_date = target_name = None

def extract_entity_type(element):
    """Classify an element by the list heading found in its text.

    Returns 'person' when the element's text contains "List of persons
    referred to", 'entity' when it contains "List of entities referred to"
    (both case-insensitive, persons checked first), and None otherwise.
    """
    text = element.get_text(strip=True)
    labelled_headings = (
        ('person', r'List of persons referred to'),
        ('entity', r'List of entities referred to'),
    )
    for label, heading in labelled_headings:
        if re.search(heading, text, re.IGNORECASE):
            return label
    return None

# Labelled fields searched for in a table row's HTML; group 1 is the value.
table_field_patterns = {
    'position_target': r'Position\(s\):(.*?)</p>',
    'date_of_birth': r'(?:DOB|Date of Birth|Born on): (\d{1,2}/\d{1,2}/\d{4})',
    # grouped so that "POB" and "Place of Birth" share the capture group
    'place_of_birth': r'(?:POB|Place of Birth):(.*?)</p>',
    'nationality': r'Nationality:(.*?)</p>',
    'gender': r'Gender:(.*?)</p>',
    'passport_number': r'Passport Number:(.*?)</p>',
    'social_media': r'Social media:(.*?)</p>',
    'email': r'Email:(.*?)</p>',
    'telephone': r'Telephone:(.*?)</p>',
    'address': r'Address:(.*?)</p>',
    'code': r'Code:(.*?)</p>',
}

# Loop through all files in the directory
for filename in os.listdir(directory):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(directory, filename)
    try:
        with open(file_path, encoding='utf-8') as f:
            data = f.read()
    except UnicodeDecodeError:
        # Fall back for files that are not valid UTF-8.
        with open(file_path, encoding='latin-1') as f:
            data = f.read()

    # Parse the file content with BeautifulSoup
    soup = BeautifulSoup(data, 'html.parser')
    soup_str = str(soup)

    # Extract document-level information
    document_info = extract_document_info(soup)

    # Court decisions: hand over the markup string, which is what
    # extract_court_decision parses.
    if re.search(court_decision_pattern, soup_str):
        court_decision_list.append(extract_court_decision(soup_str))

    # Check if the document matches a notice pattern
    if re.search(notice_pattern, soup_str):
        notice_list.append(notice_pattern_extraction(soup))

    # Entity type announced by the most recent list heading, if any
    current_entity_type = None

    # Walk the document in order so that a heading applies to what follows it
    for element in soup.find_all(['p', 'table', 'ul', 'ol']):
        # Update the entity type based on preceding text
        entity_type = extract_entity_type(element)
        if entity_type:
            current_entity_type = entity_type

        # Nothing is recorded before the first list heading
        if not current_entity_type:
            continue

        # Process tables
        if element.name == 'table':
            for row in element.find_all('tr', class_='oj-table'):
                cols = row.find_all('td')
                if not cols:
                    continue

                # Reset per row so values never leak from the previous row.
                target_name = reason = ident_info = listing_date = None
                for col in cols:
                    header_text = col.get('data-header')
                    if header_text == 'Name':
                        target_name = col.get_text(strip=True)
                    elif header_text == 'Reasons':
                        reason = col.get_text(strip=True)
                    elif header_text == 'Identifying Information':
                        ident_info = col.get_text(strip=True)
                    elif header_text == 'Date of Listing':
                        listing_date = col.get_text(strip=True)

                # Search the whole row once, after all cells were read;
                # searching cell by cell kept only the last cell's result.
                row_str = str(row)
                row_fields = {}
                for field, pattern in table_field_patterns.items():
                    match = re.search(pattern, row_str) if ident_info else None
                    row_fields[field] = match.group(1).strip() if match else None

                sanction_list.append({
                    **document_info,  # Include document-level information
                    'entity_type': current_entity_type,
                    'name_target': target_name if target_name else None,
                    'alias': None,  # no alias column is parsed
                    'reason_txt': reason if reason else None,
                    **row_fields,
                    'date_of_listing': listing_date if listing_date else None
                })

        # Process lists (if entities are listed in bullet points)
        elif element.name in ['ul', 'ol']:
            for item in element.find_all('li'):
                item_text = item.get_text(strip=True)
                if not item_text:
                    continue
                # A bullet carries only a name; all other fields stay empty
                sanction_list.append({
                    **document_info,
                    'entity_type': current_entity_type,
                    'name_target': item_text,
                    'alias': None,
                    'reason_txt': None,
                    'position_target': None,
                    'date_of_birth': None,
                    'place_of_birth': None,
                    'nationality': None,
                    'gender': None,
                    'passport_number': None,
                    'social_media': None,
                    'email': None,
                    'telephone': None,
                    'address': None,
                    'code': None,
                    'date_of_listing': None
                })

# After processing all files, sanction_list will contain all the extracted data.

# After processing all files, sanction_list will contain all the extracted data.
