

# Email & URL Validation Tool

## Overview

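This tool validates email addresses and website URLs in business contact datasets to improve data quality and deliverability. It writes a cleaned dataset from which every row with a failing email address or an unreachable website has been removed; rows whose email or website field is blank are kept.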

## Features

### Email Validation

* Validates email format and domain structure
* Verifies that the email's domain publishes DNS MX records (a prerequisite for deliverability; individual mailboxes are not checked)

### URL Validation

* Checks URL syntax and structure
* Confirms HTTP status (200–399)
* Verifies website accessibility

### Data Processing

* Uses multi-threading (10 workers) for efficiency
* Handles timeouts, connection failures, and malformed data
* Retains original dataset structure while removing invalid entries

## Input Requirements

* Input file: CSV (`manager_jobs_rows.csv` by default)
* Required columns: `channel_email`, `channel_website`, `organization_name`
* Tolerates missing values: a blank email or website cell is skipped rather than counted as invalid

## Output

* `cleaned_manager_jobs.csv`: Filtered dataset with only valid contacts
* `validation_results.csv`: Detailed results for each email and URL




In [10]:
!pip install dnspython




In [11]:
!pip install validators




In [16]:
import pandas as pd
import httpx
import re
import dns.resolver
from concurrent.futures import ThreadPoolExecutor, as_completed
import validators
import time
from typing import Dict, List, Optional

class EmailURLValidator:
    """Validate contact e-mail addresses and website URLs.

    E-mail addresses are checked against a permissive format pattern and
    then by looking up MX records for their domain.  URLs are checked for
    syntax with ``validators.url`` and then fetched with ``httpx``.

    Attributes:
        max_workers: Suggested thread-pool size for callers that fan rows
            out across threads; the class itself does not create threads.
        timeout: Timeout in seconds passed to the HTTP client.
        headers: HTTP headers sent with every URL check.
    """

    # Deliberately permissive: exactly one "@", no whitespace, and at least
    # one dot in the domain part.  Compiled once instead of on every call.
    _EMAIL_PATTERN = re.compile(r'[^\s@]+@[^\s@]+\.[^\s@]+')

    def __init__(self, max_workers: int = 10, timeout: int = 15):
        self.max_workers = max_workers
        self.timeout = timeout
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

    @staticmethod
    def _clean_cell(value) -> str:
        """Return a cell as stripped text, mapping missing values to ''.

        ``None``, NaN and other pandas missing markers become the empty
        string instead of the text ``'None'``/``'nan'``.  The literal text
        ``'nan'`` is also treated as missing so that input which was
        stringified upstream keeps being skipped.
        """
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        text = str(value).strip()
        return '' if text == 'nan' else text

    def validate_email_format(self, email: str) -> bool:
        """Return True if *email* (ignoring surrounding whitespace) looks like an address."""
        return self._EMAIL_PATTERN.fullmatch(email.strip()) is not None

    def check_mx_record(self, domain: str) -> bool:
        """Return True if an MX lookup for *domain* succeeds.

        Any lookup failure is reported as ``False``.
        """
        try:
            dns.resolver.resolve(domain, 'MX')
        except Exception:
            # `Exception` rather than a bare `except:` so that Ctrl-C and
            # interpreter shutdown are not swallowed.
            return False
        return True

    def validate_email(self, email) -> Dict:
        """Validate one e-mail address.

        Returns:
            A dict with ``status`` (``'empty'``, ``'invalid'``, ``'no_mx'``
            or ``'valid'``) and ``error`` (``''`` when valid).
        """
        if pd.isna(email) or not str(email).strip():
            return {'status': 'empty', 'error': 'Empty email'}

        email = str(email).strip()

        if not self.validate_email_format(email):
            return {'status': 'invalid', 'error': 'Invalid format'}

        try:
            # The format check guarantees exactly one "@".
            domain = email.split('@')[1].lower()
            has_mx = self.check_mx_record(domain)
        except Exception:
            return {'status': 'invalid', 'error': 'Domain error'}

        if has_mx:
            return {'status': 'valid', 'error': ''}
        return {'status': 'no_mx', 'error': 'No MX record'}

    def check_url_status(self, url) -> Dict:
        """Check that *url* is well-formed and answers with HTTP 200-399.

        Redirects are followed, so the status tested is that of the final
        response.

        Returns:
            A dict with ``accessible`` (bool) and ``error`` (``''`` on
            success, otherwise a short reason).
        """
        if pd.isna(url) or not str(url).strip():
            return {'accessible': False, 'error': 'Empty URL'}

        url = str(url).strip()

        if not validators.url(url):
            return {'accessible': False, 'error': 'Invalid URL format'}

        try:
            with httpx.Client(timeout=self.timeout, follow_redirects=True, headers=self.headers) as client:
                response = client.get(url)
        except Exception:
            # Timeouts, DNS failures, TLS errors, unsupported schemes, ...
            return {'accessible': False, 'error': 'Connection failed'}

        if 200 <= response.status_code < 400:
            return {'accessible': True, 'error': ''}
        return {'accessible': False, 'error': f'HTTP {response.status_code}'}

    def process_single_row(self, row_index, row_data):
        """Validate the e-mail and website of one dataset row.

        Args:
            row_index: Identifier of the row, echoed back in the result.
            row_data: Mapping-like row (anything with ``.get``) holding
                ``organization_name``, ``channel_email`` and
                ``channel_website``.

        Returns:
            A dict with the cleaned input values plus ``email_valid``,
            ``url_accessible``, ``email_error`` and ``url_error``.  Missing
            or blank fields are stored as ``''`` and are not validated.
        """
        result = {
            'row_index': row_index,
            'organization_name': self._clean_cell(row_data.get('organization_name', '')),
            'channel_email': self._clean_cell(row_data.get('channel_email', '')),
            'channel_website': self._clean_cell(row_data.get('channel_website', '')),
            'email_valid': False,
            'url_accessible': False,
            'email_error': '',
            'url_error': ''
        }

        # Test email
        if result['channel_email']:
            email_result = self.validate_email(result['channel_email'])
            result['email_valid'] = email_result['status'] == 'valid'
            result['email_error'] = email_result['error']

        # Test URL
        if result['channel_website']:
            url_result = self.check_url_status(result['channel_website'])
            result['url_accessible'] = url_result['accessible']
            result['url_error'] = url_result['error']

        return result


def _should_remove_row(result) -> bool:
    """Return True if a row's validation result disqualifies the row.

    A row is removed when it has an e-mail that did not validate or a
    website that was not accessible.  Rows without a result, and blank
    fields, never cause removal.
    """
    if result is None:
        # No validation was performed for this row: keep it.
        return False

    email = result['channel_email']
    website = result['channel_website']
    email_failed = bool(email) and email != 'nan' and not result['email_valid']
    url_failed = bool(website) and website != 'nan' and not result['url_accessible']
    return email_failed or url_failed


def main():
    """Validate every row of the input CSV and write the two output files.

    Reads ``manager_jobs_rows.csv``, validates each row's e-mail and
    website on a thread pool, writes the per-row outcome to
    ``validation_results.csv`` and the surviving rows to
    ``cleaned_manager_jobs.csv``, then prints a summary.
    """
    INPUT_CSV = "manager_jobs_rows.csv"
    CLEANED_CSV = "cleaned_manager_jobs.csv"
    RESULTS_CSV = "validation_results.csv"

    print("🚀 Starting Email & URL Validation")
    print("=" * 50)

    # Load dataset
    try:
        df = pd.read_csv(INPUT_CSV)
        print(f"✅ Loaded dataset with {len(df)} rows")
    except Exception as e:
        print(f"❌ Error reading CSV: {e}")
        return

    # Initialize validator
    validator = EmailURLValidator(max_workers=10, timeout=10)

    # Process all rows
    print("🔍 Processing rows...")
    results_by_index = {}

    # Size the pool from the validator so the two settings cannot drift apart.
    with ThreadPoolExecutor(max_workers=validator.max_workers) as executor:
        futures = [
            executor.submit(validator.process_single_row, idx, row)
            for idx, row in df.iterrows()
        ]

        # Collect results as they finish; one failing row must not abort the run.
        for done, future in enumerate(as_completed(futures), start=1):
            try:
                result = future.result()
            except Exception as e:
                print(f"Error processing row: {e}")
                continue

            results_by_index[result['row_index']] = result
            if done % 100 == 0:
                print(f"Processed {done}/{len(futures)} rows...")

    # as_completed() yields in completion order; restore input order so the
    # results file is the same from run to run.
    results = [results_by_index[idx] for idx in df.index if idx in results_by_index]
    print(f"✅ Processed {len(results)} rows")

    # Create results DataFrame
    results_df = pd.DataFrame(results)
    results_df.to_csv(RESULTS_CSV, index=False)
    print(f"📊 Results saved to: {RESULTS_CSV}")

    # Create cleaned dataset
    print("🧹 Creating cleaned dataset...")

    # Strict approach: remove rows if email is invalid OR URL is broken.
    # Each result is found by dict lookup (O(1)) instead of scanning the list
    # for every row, and the frame is filtered with a boolean mask so column
    # order, dtypes and the header survive even when every row is removed.
    keep_mask = [
        not _should_remove_row(results_by_index.get(idx))
        for idx in df.index
    ]
    cleaned_df = df.loc[keep_mask]
    removed_count = len(df) - len(cleaned_df)

    # Save cleaned dataset
    cleaned_df.to_csv(CLEANED_CSV, index=False)

    # Print summary
    print("=" * 50)
    print("📈 CLEANING SUMMARY:")
    print(f"Original dataset: {len(df)} rows")
    print(f"Cleaned dataset: {len(cleaned_df)} rows")
    print(f"Removed: {removed_count} rows")

    # Email summary
    valid_emails = sum(1 for r in results if r['email_valid'])
    total_emails = sum(1 for r in results if r['channel_email'] and r['channel_email'] != 'nan')
    print("\n📧 EMAIL SUMMARY:")
    print(f"Total emails tested: {total_emails}")
    print(f"Valid emails: {valid_emails}")
    print(f"Invalid emails: {total_emails - valid_emails}")

    # URL summary
    accessible_urls = sum(1 for r in results if r['url_accessible'])
    total_urls = sum(1 for r in results if r['channel_website'] and r['channel_website'] != 'nan')
    print("\n🌐 URL SUMMARY:")
    print(f"Total URLs tested: {total_urls}")
    print(f"Accessible URLs: {accessible_urls}")
    print(f"Inaccessible URLs: {total_urls - accessible_urls}")

    print(f"\n🎉 Cleaned dataset saved to: {CLEANED_CSV}")

# Run the validation pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

🚀 Starting Email & URL Validation
✅ Loaded dataset with 10170 rows
🔍 Processing rows...
Processed 100/10170 rows...
Processed 200/10170 rows...
Processed 300/10170 rows...
Processed 400/10170 rows...
Processed 500/10170 rows...
Processed 600/10170 rows...
Processed 700/10170 rows...
Processed 800/10170 rows...
Processed 900/10170 rows...
Processed 1000/10170 rows...
Processed 1100/10170 rows...
Processed 1200/10170 rows...
Processed 1300/10170 rows...
Processed 1400/10170 rows...
Processed 1500/10170 rows...
Processed 1600/10170 rows...
Processed 1700/10170 rows...
Processed 1800/10170 rows...
Processed 1900/10170 rows...
Processed 2000/10170 rows...
Processed 2100/10170 rows...
Processed 2200/10170 rows...
Processed 2300/10170 rows...
Processed 2400/10170 rows...
Processed 2500/10170 rows...
Processed 2600/10170 rows...
Processed 2700/10170 rows...
Processed 2800/10170 rows...
Processed 2900/10170 rows...
Processed 3000/10170 rows...
Processed 3100/10170 rows...
Processed 3200/10170 r