In [None]:
import re

# Extract gymfinder URLs from the crawled data
def extract_gymfinder_urls(crawl_result):
    """
    Extract gymfinder.ae URLs from FireCrawl result data
    Returns a list of URLs without brackets
    """
    urls = []
    
    # Check if we have data
    if hasattr(crawl_result, 'data', pattern) and crawl_result.data:
        for document in crawl_result.data:
            if hasattr(document, 'markdown') and document.markdown:
                found_urls = re.findall(pattern, document.markdown)
                urls.extend(found_urls)
    
    # Remove duplicates while preserving order
    unique_urls = list(dict.fromkeys(urls))
    return unique_urls

In [1]:
import asyncio
from firecrawl import AsyncFirecrawlApp, ScrapeOptions

async def crawl(url):
    app = AsyncFirecrawlApp(api_key='fc-0f3f45b2c5ed43dcb40b13c439f7eae5')
    
    # For crawling multiple pages with limit
    response = await app.crawl_url(
        url=url,
        limit=10,
        scrape_options=ScrapeOptions(
            formats=['markdown'],
            only_main_content=True
        )
    )
    return response

# For Jupyter notebooks, use await directly instead of asyncio.run()
# Regex pattern to extract URLs from the crawled data
# pattern = r'https://tax\.gov\.ae/en/tax\.support/tax\.agents/[^)]+/'
# url = 'https://tax.gov.ae/en/tax.support/tax.agents/registered.tax.agents.aspx'
# result = await crawl(url)

# # Extract URLs from our crawl result
# extracted_urls = extract_gymfinder_urls(result)

# print(f"Found {len(extracted_urls)} unique gym URLs:")
# for i, url in enumerate(extracted_urls, 1):
#     print(f"{i}. {url}")

In [None]:
result = await crawl('https://tax.gov.ae/en/tax.support/tax.agents/registered.tax.agents.aspx')

In [None]:
for data in result.data:
    print(data.markdown)

In [12]:
import re
import csv
import pandas as pd
from datetime import datetime
import os

def extract_member_records_from_markdown(markdown_text):
    """
    Extract establishment member records from the markdown text
    Returns a list of dictionaries containing member and establishment info
    """
    records = []
    
    # Pattern to match TAAN sections (Tax Agent Registration Number)
    taan_pattern = r'### TAAN: (\d+)'
    
    # Split the markdown by TAAN sections
    taan_matches = list(re.finditer(taan_pattern, markdown_text))
    
    for i, match in enumerate(taan_matches):
        taan_number = match.group(1)
        start_pos = match.start()
        
        # Find the end position (start of next TAAN or end of text)
        if i + 1 < len(taan_matches):
            end_pos = taan_matches[i + 1].start()
        else:
            end_pos = len(markdown_text)
        
        section = markdown_text[start_pos:end_pos]
        
        # Extract establishment name (first line after TAAN that's not empty)
        lines = section.split('\n')
        establishment_name = ""
        for line in lines[3:]:  # Skip TAAN line and empty lines
            line = line.strip()
            if line and not line.startswith('#') and not line.startswith('-') and not line.startswith('['):
                establishment_name = line
                break
        
        # Extract location (usually comes after establishment name)
        location = ""
        found_name = False
        for line in lines:
            line = line.strip()
            if line == establishment_name:
                found_name = True
                continue
            if found_name and line and not line.startswith('#') and not line.startswith('-') and not line.startswith('[') and not line.startswith('Previous Experience'):
                location = line
                break
        
        # Extract website
        website_pattern = r'website\.svg\)([^\\n]+)'
        website_match = re.search(website_pattern, section)
        website = website_match.group(1) if website_match else ""
        
        # Extract emails
        email_pattern = r'\[([^@\]]+@[^@\]]+\.[^@\]]+)\]'
        emails = re.findall(email_pattern, section)
        
        # Extract phone numbers
        phone_pattern = r'\[(\+\d+)\]'
        phones = re.findall(phone_pattern, section)
        
        # Create records for each email/phone combination
        if emails or phones:
            # If we have both emails and phones, create combinations
            if emails and phones:
                for email in emails:
                    for phone in phones:
                        records.append({
                            'taan_number': taan_number,
                            'establishment_name': establishment_name,
                            'location': location,
                            'website': website,
                            'email': email,
                            'phone': phone,
                            'extraction_timestamp': datetime.now().isoformat()
                        })
            # If only emails, create records for each email
            elif emails:
                for email in emails:
                    records.append({
                        'taan_number': taan_number,
                        'establishment_name': establishment_name,
                        'location': location,
                        'website': website,
                        'email': email,
                        'phone': '',
                        'extraction_timestamp': datetime.now().isoformat()
                    })
            # If only phones, create records for each phone
            elif phones:
                for phone in phones:
                    records.append({
                        'taan_number': taan_number,
                        'establishment_name': establishment_name,
                        'location': location,
                        'website': website,
                        'email': '',
                        'phone': phone,
                        'extraction_timestamp': datetime.now().isoformat()
                    })
        else:
            # Create a record even if no contact info
            records.append({
                'taan_number': taan_number,
                'establishment_name': establishment_name,
                'location': location,
                'website': website,
                'email': '',
                'phone': '',
                'extraction_timestamp': datetime.now().isoformat()
            })
    
    return records

def append_to_csv(records, filename='establishment_members.csv'):
    """
    Append records to CSV file, creating it if it doesn't exist
    """
    fieldnames = ['taan_number', 'establishment_name', 'location', 'website', 'email', 'phone', 'extraction_timestamp']
    
    file_exists = os.path.isfile(filename)
    
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        # Write header if file is new
        if not file_exists:
            writer.writeheader()
        
        # Write records
        for record in records:
            writer.writerow(record)
    
    print(f"Appended {len(records)} records to {filename}")
    return filename

def process_crawl_result(crawl_result, csv_filename='establishment_members.csv'):
    """
    Process the crawl result and extract all member records to CSV
    """
    all_records = []
    
    if hasattr(crawl_result, 'data') and crawl_result.data:
        for document in crawl_result.data:
            if hasattr(document, 'markdown') and document.markdown:
                records = extract_member_records_from_markdown(document.markdown)
                all_records.extend(records)
    
    if all_records:
        csv_file = append_to_csv(all_records, csv_filename)
        print(f"Total records extracted: {len(all_records)}")
        return csv_file, all_records
    else:
        print("No records found in the crawl result")
        return None, []

# Test the extraction function with our result
print("Processing crawl result and extracting member records...")


Processing crawl result and extracting member records...


In [13]:
# Process the crawl result and create CSV
csv_file, extracted_records = process_crawl_result(result, 'establishment_members.csv')

if extracted_records:
    print(f"\nSuccessfully extracted {len(extracted_records)} records")
    print(f"CSV file created/updated: {csv_file}")
    
    # Show first few records as preview
    print("\nFirst 3 records preview:")
    for i, record in enumerate(extracted_records[:3]):
        print(f"\nRecord {i+1}:")
        for key, value in record.items():
            print(f"  {key}: {value}")
    
    # Show summary statistics
    total_establishments = len(set(record['establishment_name'] for record in extracted_records))
    total_emails = len([r for r in extracted_records if r['email']])
    total_phones = len([r for r in extracted_records if r['phone']])
    
    print(f"\nSummary:")
    print(f"- Total establishments: {total_establishments}")
    print(f"- Records with emails: {total_emails}")
    print(f"- Records with phone numbers: {total_phones}")
else:
    print("No records were extracted!")


Appended 24 records to establishment_members.csv
Total records extracted: 24

Successfully extracted 24 records
CSV file created/updated: establishment_members.csv

First 3 records preview:

Record 1:
  taan_number: 20035861
  establishment_name: Kmj Tax Consultant
  location: Dubai
  website: http://www.kmjtaxuae.com
- ![](https://tax.gov.ae/e
  email: taxagency.kmj@gmail.comm
  phone: +971504200396
  extraction_timestamp: 2025-08-19T15:42:30.222854

Record 2:
  taan_number: 20035861
  establishment_name: Kmj Tax Consultant
  location: Dubai
  website: http://www.kmjtaxuae.com
- ![](https://tax.gov.ae/e
  email: olgishann@gmail.com
  phone: +971504200396
  extraction_timestamp: 2025-08-19T15:42:30.222863

Record 3:
  taan_number: 20013751
  establishment_name: XB4 - DUBAI BRANCH
  location: All Emirates
  website: http://www.xb4.com
- ![](https://tax.gov.ae/e
  email: contact@xb4.com
  phone: +971508161350
  extraction_timestamp: 2025-08-19T15:42:30.222881

Summary:
- Total establishm

In [None]:
# Utility functions for CSV management and incremental updates

def view_csv_contents(filename='establishment_members.csv', max_rows=10):
    """View contents of the CSV file"""
    try:
        df = pd.read_csv(filename)
        print(f"CSV file: {filename}")
        print(f"Total rows: {len(df)}")
        print(f"Columns: {list(df.columns)}")
        print(f"\nFirst {min(max_rows, len(df))} rows:")
        print(df.head(max_rows).to_string(index=False))
        return df
    except FileNotFoundError:
        print(f"CSV file {filename} not found")
        return None

def add_single_record(taan_number, establishment_name, location, website, email, phone, filename='establishment_members.csv'):
    """Add a single record to the CSV"""
    record = {
        'taan_number': taan_number,
        'establishment_name': establishment_name,
        'location': location,
        'website': website,
        'email': email,
        'phone': phone,
        'extraction_timestamp': datetime.now().isoformat()
    }
    
    append_to_csv([record], filename)
    print(f"Added record for {establishment_name}")
    return record

def remove_duplicates_from_csv(filename='establishment_members.csv'):
    """Remove duplicate records from CSV file"""
    try:
        df = pd.read_csv(filename)
        original_count = len(df)
        
        # Remove duplicates based on taan_number, email, and phone
        df_unique = df.drop_duplicates(subset=['taan_number', 'email', 'phone'], keep='first')
        
        # Save back to CSV
        df_unique.to_csv(filename, index=False)
        
        new_count = len(df_unique)
        removed_count = original_count - new_count
        
        print(f"Removed {removed_count} duplicate records")
        print(f"File now has {new_count} unique records")
        
        return df_unique
    except FileNotFoundError:
        print(f"CSV file {filename} not found")
        return None

# Example usage:
# view_csv_contents()
print("CSV management utilities loaded. Available functions:")
print("- view_csv_contents(): View the CSV file contents")
print("- add_single_record(): Add a manual record")
print("- remove_duplicates_from_csv(): Remove duplicate records")
print("- process_crawl_result(): Process new crawl results")


In [None]:
# Test the extraction and view results
print("Testing extraction and viewing CSV contents...")
print("=" * 50)

# View the CSV contents
df = view_csv_contents('establishment_members.csv', max_rows=5)

if df is not None:
    print("\n" + "=" * 50)
    print("DETAILED ANALYSIS:")
    print("=" * 50)
    
    # Show establishments with multiple contacts
    print("\nEstablishments with multiple contact methods:")
    establishments = df.groupby('establishment_name').agg({
        'email': lambda x: len([e for e in x if e]), 
        'phone': lambda x: len([p for p in x if p]),
        'taan_number': 'first',
        'location': 'first',
        'website': 'first'
    }).reset_index()
    
    multi_contact = establishments[(establishments['email'] > 1) | (establishments['phone'] > 1)]
    if not multi_contact.empty:
        print(multi_contact.to_string(index=False))
    else:
        print("No establishments with multiple contacts found")
    
    # Show unique locations
    print(f"\nUnique locations found:")
    unique_locations = df['location'].unique()
    for loc in unique_locations:
        if loc:  # Only show non-empty locations
            count = len(df[df['location'] == loc])
            print(f"  - {loc}: {count} records")
    
    print(f"\nTotal unique emails: {len(df[df['email'] != '']['email'].unique())}")
    print(f"Total unique phone numbers: {len(df[df['phone'] != '']['phone'].unique())}")
    
else:
    print("\nNo CSV file found. The extraction might not have run successfully.")
    print("Try running the previous cells first.")


# Establishment Members CSV Extraction System

## Overview
This notebook extracts establishment member details from crawled tax agent data and exports them to a CSV file with incremental updates.

## Features
- **Automatic Data Extraction**: Parses markdown content to extract member records
- **Multiple Contact Handling**: Creates separate records for each email/phone combination per establishment
- **Incremental CSV Updates**: Appends new records without overwriting existing data
- **Duplicate Management**: Tools to identify and remove duplicate entries
- **Comprehensive Data Structure**: Includes TAAN number, establishment name, location, website, emails, and phone numbers

## CSV Structure
Each row in the CSV represents a member contact record with the following columns:
- `taan_number`: Tax Agent Registration Number
- `establishment_name`: Name of the establishment
- `location`: Physical location/city
- `website`: Official website URL
- `email`: Contact email address
- `phone`: Contact phone number
- `extraction_timestamp`: When the record was extracted

## Usage Instructions

### 1. Initial Extraction
Run the crawling cells (1-3) to get the data, then run cell 5 to extract all records to CSV.

### 2. Incremental Updates
To process new crawl results and append to existing CSV:
```python
# After crawling new data
new_csv_file, new_records = process_crawl_result(new_result, 'establishment_members.csv')
```

### 3. Manual Record Addition
To add a single record manually:
```python
add_single_record(
    taan_number="12345678",
    establishment_name="Example Company",
    location="Dubai",
    website="http://example.com",
    email="contact@example.com",
    phone="+971501234567"
)
```

### 4. CSV Management
```python
# View CSV contents
view_csv_contents()

# Remove duplicates
remove_duplicates_from_csv()
```

## Data Handling Notes
- **Multiple Contacts**: When an establishment has multiple emails or phone numbers, separate records are created for each combination
- **Missing Data**: Records are created even when some contact information is missing
- **Timestamps**: Each record includes when it was extracted for tracking purposes
- **Encoding**: UTF-8 encoding ensures proper handling of special characters

## File Output
- **Default filename**: `establishment_members.csv`
- **Location**: Same directory as this notebook
- **Format**: Standard CSV with headers
- **Append Mode**: New extractions are added to existing file without overwriting
