In [None]:
import re
import pandas as pd
from typing import List, Dict, Optional
import warnings
warnings.filterwarnings('ignore')

class MerchantDataExtractor:
    """Rule-based extractor for merchant records in plain text.

    The text is narrowed to its merchants section, split into one chunk per
    company, and every chunk is parsed line by line into a dict. Parsed
    records are kept on ``self.merchants`` and can be exported as DataFrames.
    """

    # Leading line labels for contact details -> key in the record dict.
    _CONTACT_LABELS = {'T ': 'phone', 'F ': 'fax', 'E ': 'email', 'W ': 'website'}

    def __init__(self, text: str):
        self.text = text
        self.merchants = []

    @staticmethod
    def _strip_label(line: str, label: str) -> str:
        """Return *line* without its leading *label*, trimmed.

        Only the prefix is removed; later occurrences of the same characters
        inside the value are left untouched.
        """
        return line[len(label):].strip()

    def extract_merchants(self) -> List[Dict]:
        """Parse ``self.text`` and return the list of merchant dicts.

        The result is also stored on ``self.merchants``. Each call starts from
        an empty list, so calling it repeatedly does not duplicate records.
        Chunks that yield no company name are skipped.
        """
        self.merchants = []

        section = self._extract_merchants_section()
        for company_text in self._split_into_companies(section):
            record = self._parse_company(company_text)
            if record and record.get('company_name'):
                self.merchants.append(record)

        return self.merchants

    def _extract_merchants_section(self) -> str:
        """Return the merchants section of the text, or the whole text.

        The section runs from "Merchants ... BMF Handbook" up to the first
        "Suppliers" / "Service Providers" (or the end of the text). When the
        opening marker is not found the full text is returned unchanged.
        """
        pattern = r'Merchants\s+BMF Handbook.*?(?=Suppliers|Service Providers|$)'
        match = re.search(pattern, self.text, re.DOTALL)
        return match.group(0) if match else self.text

    def _split_into_companies(self, text: str) -> List[str]:
        """Split *text* into one chunk per company.

        A company header is a line that starts with a capital letter and is
        at least 11 characters long, followed by one or more non-empty lines
        and then a line beginning with "Core Activity". Each chunk runs from
        one header to the next; anything before the first header is dropped.
        When no header is found the whole text is returned as a single chunk.
        """
        # Anchored with ^ so a header can only begin at the start of a line
        # (optionally indented), never in the middle of one.
        pattern = r'^[ \t]*([A-Z][^\n]{10,100})\n([^\n]+(?:\n[^\n]+)*?)\nCore Activity'
        starts = [m.start() for m in re.finditer(pattern, text, re.MULTILINE)]

        if not starts:
            return [text]

        # Offsets are used directly (a header at offset 0 is a valid start).
        bounds = starts + [len(text)]
        return [text[begin:end] for begin, end in zip(bounds, bounds[1:])]

    def _parse_company(self, text: str) -> Dict:
        """Parse a single company chunk into a record dict.

        The first non-empty line is the company name. Lines labelled T/F/E/W
        fill phone/fax/email/website (the last one of each kind wins), a
        "Core Activity" line fills ``core_activity``, and everything after a
        "Branches" line is handed to ``_parse_branches``. All remaining lines
        are joined into ``address``, from which country, postcode and city
        are derived when they can be recognised.
        """
        data = {
            'company_name': '',
            'address': '',
            'city': '',
            'postcode': '',
            'country': '',
            'phone': '',
            'fax': '',
            'email': '',
            'website': '',
            'core_activity': '',
            'branches': []
        }

        lines = [ln.strip() for ln in text.strip().split('\n') if ln.strip()]
        if not lines:
            return data

        data['company_name'] = lines[0]

        address_parts = []
        for idx, line in enumerate(lines[1:], start=1):
            if line.startswith('Core Activity'):
                data['core_activity'] = self._strip_label(line, 'Core Activity')
                continue

            if line.startswith('Branches'):
                # Everything after the header belongs to the branch list.
                data['branches'] = self._parse_branches(lines[idx + 1:])
                break

            for label, key in self._CONTACT_LABELS.items():
                if line.startswith(label):
                    data[key] = self._strip_label(line, label)
                    break
            else:
                # Not a labelled line: treat it as part of the address.
                address_parts.append(line)

        if address_parts:
            full_address = ', '.join(address_parts)
            data['address'] = full_address

            country_match = re.search(
                r'\b(England|Scotland|Wales|Northern Ireland|Ireland)\b',
                full_address,
            )
            if country_match:
                data['country'] = country_match.group(1)

            # UK-style postcode, e.g. "DN1 3RA".
            postcode_match = re.search(
                r'\b([A-Z]{1,2}\d{1,2}[A-Z]?\s?\d[A-Z]{2})\b',
                full_address,
            )
            if postcode_match:
                data['postcode'] = postcode_match.group(1)

            # The city is the comma-separated fragment right before the
            # country. Without a country there is no anchor for it, so the
            # field is left empty instead of guessing.
            if data['postcode'] and data['country']:
                city_match = re.search(
                    rf'([^,]+),\s*{re.escape(data["country"])}',
                    full_address,
                )
                if city_match:
                    data['city'] = city_match.group(1).strip()

        return data

    def _parse_branches(self, lines: List[str]) -> List[Dict]:
        """Parse the lines following a "Branches" header.

        Every line without a T/F/E/W label starts a new branch and becomes its
        name; labelled lines fill the contact fields of the branch currently
        being built. Labelled lines that appear before any branch name are
        ignored.
        """
        branches = []
        current = None

        for line in lines:
            for label, key in self._CONTACT_LABELS.items():
                if line.startswith(label):
                    if current is not None:
                        current[key] = self._strip_label(line, label)
                    break
            else:
                # An unlabelled line closes the previous branch and opens a new one.
                if current and current.get('name'):
                    branches.append(current)
                current = {
                    'name': line,
                    'phone': '',
                    'fax': '',
                    'email': '',
                    'website': ''
                }

        if current and current.get('name'):
            branches.append(current)

        return branches

    def to_dataframe(self) -> pd.DataFrame:
        """Return one row per company (runs the extraction if nothing is cached)."""
        if not self.merchants:
            self.extract_merchants()

        rows = [
            {
                'Company Name': merchant['company_name'],
                'Address': merchant['address'],
                'City': merchant['city'],
                'Postcode': merchant['postcode'],
                'Country': merchant['country'],
                'Phone': merchant['phone'],
                'Fax': merchant['fax'],
                'Email': merchant['email'],
                'Website': merchant['website'],
                'Core Activity': merchant['core_activity'],
                'Number of Branches': len(merchant['branches'])
            }
            for merchant in self.merchants
        ]
        return pd.DataFrame(rows)

    def branches_to_dataframe(self) -> pd.DataFrame:
        """Return one row per branch, tagged with its parent company name."""
        if not self.merchants:
            self.extract_merchants()

        rows = [
            {
                'Company Name': merchant['company_name'],
                'Branch Name': branch['name'],
                'Branch Phone': branch['phone'],
                'Branch Fax': branch['fax'],
                'Branch Email': branch['email']
            }
            for merchant in self.merchants
            for branch in merchant['branches']
        ]
        return pd.DataFrame(rows)


def main():
    """Run the extractor on pasted text and export both tables as CSV.

    Builds a ``MerchantDataExtractor`` from the text literal below, prints a
    short summary of the company and branch tables, writes them to
    ``merchants_main.csv`` and ``merchants_branches.csv`` and returns the two
    DataFrames as ``(df_main, df_branches)``.
    """
    # Paste the text extracted from the PDF here, or load it from a file,
    # e.g.: text = open('merchants_text.txt', 'r', encoding='utf-8').read()
    text = """
    [PDF'den çıkardığınız metni buraya yapıştırın]
    """

    print("Veri çıkartma başlıyor...")

    extractor = MerchantDataExtractor(text)
    found = extractor.extract_merchants()
    print(f"Toplam {len(found)} şirket bulundu.")

    # Company-level table.
    df_main = extractor.to_dataframe()
    print(f"\nAna firma bilgileri: {len(df_main)} kayıt")
    print(df_main.head())

    # Branch-level table.
    df_branches = extractor.branches_to_dataframe()
    print(f"\nBranch bilgileri: {len(df_branches)} kayıt")
    print(df_branches.head())

    # Persist both tables; utf-8-sig keeps non-ASCII text readable in Excel.
    exports = (
        (df_main, 'merchants_main.csv'),
        (df_branches, 'merchants_branches.csv'),
    )
    for frame, csv_name in exports:
        frame.to_csv(csv_name, index=False, encoding='utf-8-sig')

    for message in (
        "\n✅ Veriler başarıyla kaydedildi!",
        "📁 merchants_main.csv - Ana firma bilgileri",
        "📁 merchants_branches.csv - Branch bilgileri",
    ):
        print(message)

    return df_main, df_branches


# Run the demo pipeline only when executed as a script / notebook cell;
# the resulting frames are kept at module level for interactive inspection.
if __name__ == "__main__":
    df_main, df_branches = main()

In [4]:
import pdfplumber
import pandas as pd

def extract_from_pdf(pdf_path: str):
    """Extract merchant data straight from a PDF file.

    Reads the text of every page with pdfplumber, feeds the concatenated text
    to ``MerchantDataExtractor`` and writes the company and branch tables to
    ``merchants_main.csv`` and ``merchants_branches.csv``.

    Returns:
        ``(df_main, df_branches)`` - the company and branch DataFrames.
    """
    # Collect page texts while the PDF is open; pages without text are skipped.
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    full_text = "".join(f"{chunk}\n" for chunk in page_texts if chunk)

    # Run the rule-based extractor over the whole document.
    extractor = MerchantDataExtractor(full_text)
    extractor.extract_merchants()

    df_main = extractor.to_dataframe()
    df_branches = extractor.branches_to_dataframe()

    # Save both tables next to the notebook.
    for frame, csv_name in (
        (df_main, 'merchants_main.csv'),
        (df_branches, 'merchants_branches.csv'),
    ):
        frame.to_csv(csv_name, index=False, encoding='utf-8-sig')

    return df_main, df_branches

# Usage
# NOTE(review): absolute, machine-specific path; it also differs from the
# PDF_PATH used in a later cell (".../ISMAIL-PDF-PARSER/..." here) - confirm
# which location is the right one.
df_main, df_branches = extract_from_pdf('/Users/kemalgunay/Desktop/VERI_BILIMI/PDF-SCRAPER/ISMAIL-PDF-PARSER/BMF-23-Just-Comps.pdf')

In [3]:
!pip install pdfplumber

Collecting pdfplumber
  Obtaining dependency information for pdfplumber from https://files.pythonhosted.org/packages/db/e0/52b67d4f00e09e497aec4f71bc44d395605e8ebcea52543242ed34c25ef9/pdfplumber-0.11.7-py3-none-any.whl.metadata
  Using cached pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Obtaining dependency information for pdfminer.six==20250506 from https://files.pythonhosted.org/packages/73/16/7a432c0101fa87457e75cb12c879e1749c5870a786525e2e0f42871d6462/pdfminer_six-20250506-py3-none-any.whl.metadata
  Using cached pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting Pillow>=9.1 (from pdfplumber)
  Obtaining dependency information for Pillow>=9.1 from https://files.pythonhosted.org/packages/fd/e0/ed960067543d080691d47d6938ebccbf3976a931c9567ab2fbfab983a5dd/pillow-12.0.0-cp312-cp312-macosx_11_0_arm64.whl.metadata
  Using cached pillow-12.0.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.8 kB)
Collecti

In [6]:
import pdfplumber
import re
import math
from tqdm import tqdm
import pandas as pd
from collections import defaultdict
from typing import List, Dict, Tuple

PDF_PATH = "/Users/kemalgunay/Desktop/VERI_BILIMI/PDF-SCRAPER/ISMAIL-PDF/BMF-23-Just-Comps.pdf"
OUTPUT_MAIN = "merchants_main.csv"
OUTPUT_BRANCHES = "merchants_branches.csv"

# -----------------------
# Helper regexes
# -----------------------
# Phone number introduced by a label (T, T., Tel, Phone). The label must start
# at a word boundary so the final "T" of another word (e.g. "UNIT 5 ...") is
# not mistaken for it, and the number body only allows spaces/tabs as
# whitespace so a match cannot run on into the next line.
PHONE_RE = re.compile(r'\b(?:Tel|Phone|T)\.?[:\s]*(\+?\d[\d\- \t()/]{5,}\d)')
EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
# Web address: explicit scheme, "www." prefix, or a bare .co.uk/.com/.net
# domain. Bare domains must start a token and must not contain "@", so e-mail
# addresses are not reported as websites.
WWW_RE = re.compile(
    r'(https?://[^\s,;]+|www\.[^\s,;]+'
    r'|(?<![^\s,;])(?:[^\s,;@]+\.co\.uk|[^\s,;@]+\.com|[^\s,;@]+\.net))'
)
CORE_ACTIVITY_RE = re.compile(r'Core Activity[:\s\-]*([A-Za-z0-9 \&\,\-]+)', re.IGNORECASE)
BRANCHES_HEADER_RE = re.compile(r'Branches', re.IGNORECASE)

# -----------------------
# 1) PDF -> sütun bazlı metin blokları
# -----------------------
def extract_column_blocks(pdf_path: str, n_cols: int = 3) -> List[str]:
    """Return the text of every non-empty page column, in page/column order.

    Each page is cut into ``n_cols`` vertical stripes of equal width. Every
    word is assigned to a stripe by its ``x0`` coordinate, the words of a
    stripe are sorted top-to-bottom / left-to-right and merged into lines,
    and the lines of a stripe are joined with newlines into one block.
    """
    column_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in tqdm(pdf.pages, desc="Pages"):
            # Equal-width columns are assumed.
            stripe_width = page.width / n_cols

            # Bucket every word by the stripe its left edge falls into;
            # anything beyond the last stripe is clamped into it.
            buckets = [[] for _ in range(n_cols)]
            for word in page.extract_words(use_text_flow=True, keep_blank_chars=False):
                left = float(word.get("x0", 0))
                stripe = min(int(left // stripe_width), n_cols - 1)
                buckets[stripe].append(word)

            for bucket in buckets:
                if not bucket:
                    continue

                bucket.sort(key=lambda item: (item["top"], item["x0"]))

                # Merge words into lines: a word joins the open line while its
                # top is within 3 units of the line's first word.
                text_lines = []
                line_top = None
                line_words = []
                for word in bucket:
                    top = round(float(word["top"]), 1)
                    if line_top is not None and abs(top - line_top) <= 3:
                        line_words.append(word["text"])
                        continue
                    if line_top is not None:
                        text_lines.append(" ".join(line_words))
                    line_top = top
                    line_words = [word["text"]]
                if line_words:
                    text_lines.append(" ".join(line_words))

                # One block per page column.
                column_text = "\n".join(text_lines).strip()
                if column_text:
                    column_texts.append(column_text)
    return column_texts

# -----------------------
# 2) Basit kural-tabanlı ayrıştırıcı
# -----------------------
def parse_blocks_to_merchants(blocks: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Walk the blocks in order and detect merchant entries.

    Simple heuristic:
      - A block is either the start of a new company (usually the company
        name and address come first),
      - or its lines are appended to the company currently being built.
      - A block containing "Branches" is stored as the branches text of the
        current company.

    Returns ``(df_main, df_br)``: one row per detected merchant (name, address,
    phones, emails, websites, core_activity, raw_text) and one row per branch
    line (merchant_idx, merchant_name, branch_raw, branch_phones,
    branch_emails, branch_websites).
    """
    merchants = []
    branches_rows = []  # never used below

    current = None  # dict for merchant under construction

    def start_new_company(first_lines: List[str]):
        # Fresh entry seeded with the lines of the block that opened it.
        # Only "raw_lines" and "branches_text" are filled during the block
        # loop; the other keys stay at their initial values.
        return {
            "raw_lines": list(first_lines),  # to be extended
            "name": None,
            "address": None,
            "phones": [],
            "emails": [],
            "websites": [],
            "core_activity": None,
            "branches_text": ""
        }

    for blk in tqdm(blocks, desc="Parsing blocks"):
        # Basic cleanup
        text = blk.strip()
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        if not lines:
            continue

        # Heuristic: if any line of the block contains "Branches", the whole
        # block is appended to the current entry's branches text.
        # NOTE(review): the match is case-insensitive and not anchored, so a
        # block that merely mentions "branches" anywhere is routed here in full.
        if any(BRANCHES_HEADER_RE.search(ln) for ln in lines):
            # If there is no current entry yet (rare), start a new one
            if current is None:
                current = start_new_company(lines)
            # append the branches part
            current["branches_text"] += ("\n" + "\n".join(lines))
            # branches are parsed later
            continue

        # If the first line is short (at most 7 words) and contains a letter
        # or digit, the block is considered a likely new company.
        # This heuristic can produce false positives.
        likely_new_company = False
        first_line = lines[0]
        if len(first_line.split()) <= 7 and re.search(r'[A-Za-z0-9]', first_line):
            likely_new_company = True

        if current is None:
            # start a new company
            current = start_new_company(lines)
            continue

        # If there is a current entry but this block looks like a new company,
        # finalize the current entry and start a new one.
        # NOTE(review): this only fires while the current entry has NO
        # "Core Activity" line yet - confirm the condition is not inverted.
        if likely_new_company and len(current["raw_lines"]) > 0 and not any(CORE_ACTIVITY_RE.search(l) for l in current["raw_lines"]):
            # finalize current
            merchants.append(current)
            current = start_new_company(lines)
            continue

        # Otherwise append to the current entry
        current["raw_lines"].extend(lines)

    # add whatever is left once the loop ends
    if current is not None:
        merchants.append(current)

    # Now extract the fields from each merchant's raw lines
    parsed = []
    for m in merchants:
        text_all = "\n".join(m["raw_lines"])
        phones = PHONE_RE.findall(text_all)
        emails = EMAIL_RE.findall(text_all)
        wws = WWW_RE.findall(text_all)
        # the www regex may return messy duplicates; de-duplicate
        websites = sorted(set([w.strip() for w in wws]))
        core = None
        ca_match = CORE_ACTIVITY_RE.search(text_all)
        if ca_match:
            core = ca_match.group(1).strip()
        # name: tends to be the first of the raw lines
        name_guess = m["raw_lines"][0].strip() if m["raw_lines"] else None
        # address guess: of the up to five lines after the name, keep those
        # that are not phone/email/web/core-activity/branches lines
        addr_lines = []
        for ln in m["raw_lines"][1:6]:
            if PHONE_RE.search(ln) or EMAIL_RE.search(ln) or WWW_RE.search(ln) or CORE_ACTIVITY_RE.search(ln) or BRANCHES_HEADER_RE.search(ln):
                continue
            addr_lines.append(ln)
        address_guess = ", ".join(addr_lines).strip() if addr_lines else None

        # branches parsing: if there is branches text, go through it line by
        # line; each line may carry a phone, an email or a web address
        branches = []
        if m["branches_text"]:
            b_lines = [ln for ln in m["branches_text"].splitlines() if ln.strip() and not BRANCHES_HEADER_RE.search(ln)]
            # every remaining line becomes one branch row with its contacts
            for bl in b_lines:
                bphones = PHONE_RE.findall(bl)
                bemails = EMAIL_RE.findall(bl)
                bwws = WWW_RE.findall(bl)
                branches.append({
                    "raw": bl,
                    "phones": bphones,
                    "emails": bemails,
                    "websites": sorted(set(bwws))
                })

        parsed.append({
            "name": name_guess,
            "address": address_guess,
            "phones": ";".join(sorted(set(phones))),
            "emails": ";".join(sorted(set(emails))),
            "websites": ";".join(websites),
            "core_activity": core,
            "raw_text": text_all,
            "branches_struct": branches
        })

    # Build the main DataFrame
    df_main = pd.DataFrame([{
        "name": p["name"],
        "address": p["address"],
        "phones": p["phones"],
        "emails": p["emails"],
        "websites": p["websites"],
        "core_activity": p["core_activity"],
        "raw_text": p["raw_text"]
    } for p in parsed])

    # branches DataFrame: flatten
    branches_flat = []
    for idx, p in enumerate(parsed):
        for b in p["branches_struct"]:
            branches_flat.append({
                "merchant_idx": idx,
                "merchant_name": p["name"],
                "branch_raw": b["raw"],
                "branch_phones": ";".join(b["phones"]),
                "branch_emails": ";".join(b["emails"]),
                "branch_websites": ";".join(b["websites"])
            })
    df_br = pd.DataFrame(branches_flat)

    return df_main, df_br

# -----------------------
# 3) Run the pipeline
# -----------------------
# Extract per-column text blocks from PDF_PATH, parse them into the merchant
# and branch tables, and write both to the configured CSV files.
if __name__ == "__main__":
    blocks = extract_column_blocks(PDF_PATH, n_cols=3)
    print(f"Extracted {len(blocks)} column-blocks.")
    df_main, df_branches = parse_blocks_to_merchants(blocks)
    print("Parsed merchants:", len(df_main))
    print("Parsed branches rows:", len(df_branches))
    df_main.to_csv(OUTPUT_MAIN, index=False, encoding="utf-8-sig")
    df_branches.to_csv(OUTPUT_BRANCHES, index=False, encoding="utf-8-sig")
    print("Saved CSV files.")


Pages: 100%|██████████| 119/119 [00:12<00:00,  9.30it/s]


Extracted 332 column-blocks.


Parsing blocks: 100%|██████████| 332/332 [00:00<00:00, 46900.03it/s]


Parsed merchants: 232
Parsed branches rows: 5122
Saved CSV files.


In [None]:
import re
import pandas as pd
from typing import List, Dict, Tuple

def parse_bmf_data(text: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Parse BMF Handbook company data into two DataFrames:
    1. Companies DataFrame with main company info
    2. Branches DataFrame with branch details

    Expected record layout (one item per line): company name, address
    line(s), contact lines labelled T/F/E/W, a "Core Activity ..." line and an
    optional "Branches" header followed by branch names, each optionally
    followed by (or ending in) a "T <phone>" entry.

    Company rows carry "Company Name", "Address", "Core Activity" plus
    "Phone n" / "Email n" / "Website n" / "Fax" for the contacts found.
    Branch rows carry "Parent Company", "Branch Name" and "Branch Phone".

    Inside a branch list, a line is taken as the start of the next company
    when it is followed by optional address lines, optional contact lines and
    then a "Core Activity" line. Branches that have no phone line and directly
    precede a company header (with no blank line in between) cannot be told
    apart from that header by this rule.
    """
    contact_re = re.compile(r'^[TFEW]\s')

    def is_plain(entry: str) -> bool:
        # Neither a contact line nor one of the section markers.
        return not (
            contact_re.match(entry)
            or entry == 'Branches'
            or entry.startswith('Core Activity')
        )

    # Drop page headers/footers up front so neither address collection nor
    # look-ahead ever sees them. Blank lines are kept (as '') because they
    # terminate an address block.
    lines = []
    for raw in text.split('\n'):
        stripped = raw.strip()
        if 'BMF Handbook' in stripped or stripped.isdigit():
            continue
        lines.append(stripped)
    total = len(lines)

    def starts_company(idx: int) -> bool:
        # Look ahead: name, address lines, then blanks/contacts, then
        # "Core Activity" means lines[idx] opens a company record.
        k = idx + 1
        while k < total and lines[k] and is_plain(lines[k]):
            k += 1
        while k < total and (not lines[k] or contact_re.match(lines[k])):
            k += 1
        return k < total and lines[k].startswith('Core Activity')

    def add_numbered(record: Dict, label: str, value: str) -> None:
        # Store under the first free "<label> n" key (Phone 1, Phone 2, ...).
        n = 1
        while f'{label} {n}' in record:
            n += 1
        record[f'{label} {n}'] = value

    companies = []
    branches = []

    def store(company: Dict, company_branches: List[Dict]) -> None:
        # Emit a finished company together with its branch rows.
        companies.append(company)
        branches.extend(
            {
                'Parent Company': company['Company Name'],
                'Branch Name': branch['name'],
                'Branch Phone': branch['phone']
            }
            for branch in company_branches
        )

    current_company = None
    in_branches = False
    branch_lines = []

    i = 0
    while i < total:
        line = lines[i]

        if not line:
            i += 1
            continue

        if line == 'Branches':
            in_branches = True
            i += 1
            continue

        if line.startswith('Core Activity'):
            if current_company is not None:
                current_company['Core Activity'] = line.replace('Core Activity', '').strip()
            i += 1
            continue

        if contact_re.match(line):
            if current_company is not None:
                contact_type = line[0]
                contact_value = line[1:].strip()

                if in_branches:
                    # A phone line belongs to the most recent branch; only the
                    # first number per branch is kept.
                    if contact_type == 'T' and branch_lines and not branch_lines[-1]['phone']:
                        branch_lines[-1]['phone'] = contact_value
                elif contact_type == 'T':
                    add_numbered(current_company, 'Phone', contact_value)
                elif contact_type == 'F':
                    current_company['Fax'] = contact_value
                elif contact_type == 'E':
                    add_numbered(current_company, 'Email', contact_value)
                elif contact_type == 'W':
                    add_numbered(current_company, 'Website', contact_value)
            i += 1
            continue

        if in_branches and not starts_company(i):
            # Branch entry: "Branch Name" or "Branch Name T 01234 567890".
            match = re.match(r'^(.+?)(?:\s+T\s+(.+))?$', line)
            if match:
                branch_lines.append({
                    'name': match.group(1).strip(),
                    'phone': match.group(2).strip() if match.group(2) else ''
                })
            i += 1
            continue

        # Anything else opens a new company record.
        if current_company is not None:
            store(current_company, branch_lines)

        current_company = {
            'Company Name': line,
            'Address': '',
            'Core Activity': ''
        }
        branch_lines = []
        in_branches = False

        # The lines up to the first blank/contact/marker line are the address.
        j = i + 1
        address_lines = []
        while j < total and lines[j] and is_plain(lines[j]):
            address_lines.append(lines[j])
            j += 1
        if address_lines:
            current_company['Address'] = ', '.join(address_lines)

        # Continue after the address so its lines are not parsed again.
        i = j

    # Don't forget the last company
    if current_company is not None:
        store(current_company, branch_lines)

    companies_df = pd.DataFrame(companies)
    branches_df = pd.DataFrame(branches)

    return companies_df, branches_df


def parse_bmf_improved(text: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Improved parser that better handles the multi-column layout

    Blank lines and page markers ("BMF Handbook" lines, numbers of up to three
    digits) are discarded first. The remaining lines are read as records of
    the form: company name, up to three address lines, contact lines labelled
    T/F/E/W, "Core Activity ...", and an optional "Branches" header followed
    by branch names with their "T <phone>" entry on the same or the next line.

    Returns ``(companies_df, branches_df)``. Company rows carry
    "Company Name", "Address", "Core Activity" plus "Phone n" / "Email n" /
    "Website n" / "Fax"; branch rows carry "Parent Company", "Branch Name"
    and "Branch Phone".

    Inside a branch list, a line is taken as the start of the next company
    when it is followed by up to three non-contact lines, optional contact
    lines and then a "Core Activity" line. Phone-less branches that directly
    precede a company header cannot be told apart from it by this rule.
    """
    contact_re = re.compile(r'^[TFEW]\s+')
    max_address_lines = 3  # Assume max 3 address lines
    numbered_labels = {'T': 'Phone', 'E': 'Email', 'W': 'Website'}

    # Pre-filter so that look-ahead only ever sees meaningful lines.
    lines = [
        entry
        for entry in (raw.strip() for raw in text.split('\n'))
        if entry
        and 'BMF Handbook' not in entry
        and not (entry.isdigit() and len(entry) <= 3)
    ]
    total = len(lines)

    def is_plain(entry: str) -> bool:
        # Neither a contact line nor one of the section markers.
        return not (
            contact_re.match(entry)
            or entry == 'Branches'
            or entry.startswith('Core Activity')
        )

    def starts_company(idx: int) -> bool:
        # Look ahead: name, <= 3 address lines, contact lines, "Core Activity".
        k = idx + 1
        limit = k + max_address_lines
        while k < total and k < limit and is_plain(lines[k]):
            k += 1
        while k < total and contact_re.match(lines[k]):
            k += 1
        return k < total and lines[k].startswith('Core Activity')

    def record_contact(company: Dict, kind: str, value: str) -> None:
        # Fax is single-valued; the other kinds get the first free "<label> n".
        if kind == 'F':
            company['Fax'] = value
            return
        label = numbered_labels[kind]
        n = 1
        while f'{label} {n}' in company:
            n += 1
        company[f'{label} {n}'] = value

    companies = []
    branches = []

    def store(company: Dict, company_branches: List[Dict]) -> None:
        # Emit a finished company together with its branch rows.
        companies.append(company)
        for br in company_branches:
            branches.append({
                'Parent Company': company['Company Name'],
                'Branch Name': br['name'],
                'Branch Phone': br['phone']
            })

    current_company = None
    in_branches = False
    temp_branches = []

    i = 0
    while i < total:
        line = lines[i]

        # Detect "Branches" section
        if line == 'Branches':
            in_branches = True
            i += 1
            continue

        # Detect "Core Activity"
        if line.startswith('Core Activity'):
            if current_company is not None:
                current_company['Core Activity'] = line.replace('Core Activity', '').strip()
            in_branches = False  # After Core Activity, branches section ends
            i += 1
            continue

        # Detect contact lines (T, F, E, W)
        if contact_re.match(line):
            contact_type = line[0]
            value = line[2:].strip()

            if current_company is not None:
                if in_branches and contact_type == 'T' and temp_branches:
                    # Phone for the previous branch; the first number wins.
                    if not temp_branches[-1]['phone']:
                        temp_branches[-1]['phone'] = value
                else:
                    record_contact(current_company, contact_type, value)
            i += 1
            continue

        # Branch entry: "Name" or "Name T phone"
        if in_branches and current_company is not None and not starts_company(i):
            parts = line.split(' T ')
            if len(parts) == 2:
                temp_branches.append({'name': parts[0].strip(), 'phone': parts[1].strip()})
            else:
                temp_branches.append({'name': line, 'phone': ''})
            i += 1
            continue

        # Anything else opens a new company record.
        if current_company is not None:
            store(current_company, temp_branches)
        temp_branches = []

        current_company = {
            'Company Name': line,
            'Address': '',
            'Core Activity': ''
        }
        in_branches = False

        # Collect address lines: at most three lines, stopping early at a
        # contact line or section marker.
        addr_lines = []
        j = i + 1
        while j < total and j <= i + max_address_lines and is_plain(lines[j]):
            if not lines[j].startswith('BMF'):
                addr_lines.append(lines[j])
            j += 1
        current_company['Address'] = ', '.join(addr_lines[:max_address_lines])

        # Continue after the address so its lines are not parsed again.
        i = j

    # Save last company
    if current_company is not None:
        store(current_company, temp_branches)

    companies_df = pd.DataFrame(companies)
    branches_df = pd.DataFrame(branches)

    return companies_df, branches_df


# Example usage
# Demo: parse a small hand-written sample with parse_bmf_improved, print both
# resulting tables and export them to an Excel workbook with two sheets.
if __name__ == "__main__":
    # Test data: three company records, each with a "Branches" list
    test_data = """
AE Spink Ltd
Kelham Street, Doncaster, England DN1 3RA
T 01302 321514
F 01302 327543
E info@aespink.com
W www.aespink.com
Core Activity General Merchant
Branches
Doncaster Building Supplies
T 01302 554238
Doncaster Plumbing & Heating
T 01302 554254
Leeds Building Supplies
T 0113 8591122
Leeds Plumbing & Heating
T 0113 8592211
Sheffield Plumbing & Heating
T 0114 3990905
Worksop Plumbing & Heating
T 01909 484884

Allneeds Building & Construction Ltd
T/A ABC Depot 248 Regents Park Road, Finchley, England N3 3HN
T 0203 1515222
E prax.patel@abcdepot.co.uk
W www.abcdepot.co.uk
Core Activity General Merchant
Branches
Finchley
T 020 8349 9987
Hatfield
T 020 3151 5222

Alsford
Ness Road, Erith, England DA8 2LD
T 01322 333 088
E enquiries@alsfordtimber.com
W www.alsfordtimber.com
Core Activity General Merchant
Branches
Beckenham
T 020 8655 3939
Brighton
T 01273 554 888
Cobham
T 01932 863 468
"""
    
    companies_df, branches_df = parse_bmf_improved(test_data)
    
    # Print the full tables (no index column) under a banner each
    print("=" * 80)
    print("COMPANIES")
    print("=" * 80)
    print(companies_df.to_string(index=False))
    
    print("\n" + "=" * 80)
    print("BRANCHES")
    print("=" * 80)
    print(branches_df.to_string(index=False))
    
    # Save to Excel: one sheet per table (uses the openpyxl engine)
    with pd.ExcelWriter('bmf_handbook_data.xlsx', engine='openpyxl') as writer:
        companies_df.to_excel(writer, sheet_name='Companies', index=False)
        branches_df.to_excel(writer, sheet_name='Branches', index=False)
    
    print("\n✅ Data exported to 'bmf_handbook_data.xlsx'")