In [11]:
import pandas as pd
# Turn the collected lead records into a table and save them as CSV.
# NOTE(review): `results` is assigned by the `results = process_companies(companies)`
# cell (In [10]), which appears further down in this file; on a fresh kernel this
# cell only works when it is run after that one.
df = pd.DataFrame(results)
# index=False leaves the DataFrame's row index out of the written file.
df.to_csv("lead_results.csv", index=False)


In [9]:
# === IMPORTS ===
import os
import re
import time
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth
from transformers import pipeline

# === API KEYS ===
# Credentials are read from environment variables so real secrets never have to
# be saved inside the notebook. The redacted placeholders are kept as fallbacks,
# which means every name is still defined when a variable is not set.
SERPER_API_KEY = os.environ.get("SERPER_API_KEY", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
HUNTER_API_KEY = os.environ.get("HUNTER_API_KEY", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
DATAFORSEO_USERNAME = os.environ.get("DATAFORSEO_USERNAME", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
DATAFORSEO_PASSWORD = os.environ.get("DATAFORSEO_PASSWORD", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

# === Load Hugging Face QA Model ===
# Built once when this cell runs; get_ceo_from_dataforseo() calls it with a
# question and a search-result snippet as context.
qa_model = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
)

# === Helper Functions ===

def get_website_from_serper(company_name, timeout=10):
    """Look up a company's website through the Serper search endpoint.

    Args:
        company_name: Text sent as the search query.
        timeout: Seconds to wait for the HTTP response. ``requests`` applies
            no timeout by default, so without it a stalled connection would
            block the notebook indefinitely.

    Returns:
        The knowledge-graph ``website`` when the response carries a non-empty
        one, otherwise the ``link`` of the first organic result that has one.
        ``None`` when nothing usable is found or the request fails (the error
        is printed, not raised).
    """
    url = "https://google.serper.dev/search"
    headers = {
        "X-API-KEY": SERPER_API_KEY,
        "Content-Type": "application/json"
    }
    try:
        res = requests.post(url, headers=headers, json={"q": company_name}, timeout=timeout)
        # Surface HTTP failures in the printed error instead of silently
        # treating an error body as "no results".
        res.raise_for_status()
        data = res.json()

        # Prefer the knowledge-graph entry, but only when it is non-empty.
        website = (data.get('knowledgeGraph') or {}).get('website')
        if website:
            return website

        for result in data.get('organic', []):
            if result.get('link'):
                return result['link']
    except Exception as e:
        print("Serper error:", e)
    return None

def get_email_from_hunter(domain, timeout=10):
    """Fetch e-mail addresses for a domain from Hunter's domain-search endpoint.

    Args:
        domain: Domain name to search for.
        timeout: Seconds to wait for the HTTP response; ``requests`` does not
            time out on its own.

    Returns:
        A list with the ``value`` of every e-mail entry in the response, or an
        empty list when there are none or the request fails (the error is
        printed, not raised).
    """
    url = "https://api.hunter.io/v2/domain-search"
    # Pass the query as params so requests URL-encodes the values instead of
    # them being pasted verbatim into the URL.
    params = {"domain": domain, "api_key": HUNTER_API_KEY}
    try:
        res = requests.get(url, params=params, timeout=timeout)
        data = res.json()
        entries = (data.get("data") or {}).get("emails") or []
        # Skip entries without an address rather than discarding the whole list.
        return [entry["value"] for entry in entries if entry.get("value")]
    except Exception as e:
        print("Hunter error:", e)
    return []

def extract_emails_from_website(url, timeout=10):
    """Scrape e-mail addresses from the visible text of a web page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the HTTP response; ``requests`` does not
            time out on its own.

    Returns:
        The distinct addresses found in the page text, sorted so repeated runs
        give the same order. An empty list when the download or parsing fails
        (the error is printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        found = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", soup.get_text())
        # A bare set has no stable order across runs; sort for reproducible output.
        return sorted(set(found))
    except Exception as e:
        print("Scraping error:", e)
        return []

def get_ceo_from_dataforseo(company_name, timeout=30):
    """Guess a company's CEO from DataForSEO search results using the QA model.

    Searches for "<company_name> CEO" and feeds every result snippet that
    mentions both "CEO" and the company name to ``qa_model``.

    Args:
        company_name: Company to look up.
        timeout: Seconds to wait for the HTTP response; ``requests`` does not
            time out on its own.

    Returns:
        The first answer scoring above 0.75 that starts with two capitalised
        words; failing that, the highest-scoring answer if it has that shape.
        ``"Not Found"`` otherwise, including when the request fails (the error
        is printed, not raised).
    """
    url = "https://api.dataforseo.com/v3/serp/google/organic/live/advanced"
    payload = [{
        "keyword": f"{company_name} CEO",
        "location_name": "United States",
        "language_code": "en",
        "device": "desktop"
    }]
    # .match() anchors only at the start, so longer answers such as
    # "First Middle Last" are accepted as well.
    full_name_pattern = re.compile(r"\b[A-Z][a-z]+\s[A-Z][a-z]+\b")

    try:
        res = requests.post(
            url,
            auth=HTTPBasicAuth(DATAFORSEO_USERNAME, DATAFORSEO_PASSWORD),
            json=payload,
            timeout=timeout,
        )
        data = res.json()
        items = data['tasks'][0]['result'][0]['items']
        question = f"Who is the CEO of {company_name}?"
        company_lower = company_name.lower()
        best_result = None
        for item in items:
            # A field can be present but null; fall back to "" so one such item
            # does not raise and abort the whole lookup.
            text = item.get("description") or item.get("title") or ""
            if "CEO" in text and company_lower in text.lower():
                result = qa_model(question=question, context=text)
                if result['score'] > 0.75 and full_name_pattern.match(result['answer']):
                    return result['answer']
                if not best_result or result['score'] > best_result['score']:
                    best_result = result
        if best_result and full_name_pattern.match(best_result['answer']):
            return best_result['answer']
    except Exception as e:
        print("DataForSEO CEO error:", e)
    return "Not Found"

def get_linkedin_from_dataforseo(company_name, timeout=30):
    """Find a company's LinkedIn page among DataForSEO search results.

    Searches for "<company_name> LinkedIn" and scans the returned items.

    Args:
        company_name: Company to look up.
        timeout: Seconds to wait for the HTTP response; ``requests`` does not
            time out on its own.

    Returns:
        The ``url`` of the first item containing ``linkedin.com/company``, or
        ``"Not Found"`` when there is none or the request fails (the error is
        printed, not raised).
    """
    url = "https://api.dataforseo.com/v3/serp/google/organic/live/advanced"
    payload = [{
        "keyword": f"{company_name} LinkedIn",
        "location_name": "United States",
        "language_code": "en",
        "device": "desktop"
    }]
    try:
        res = requests.post(
            url,
            auth=HTTPBasicAuth(DATAFORSEO_USERNAME, DATAFORSEO_PASSWORD),
            json=payload,
            timeout=timeout,
        )
        data = res.json()
        items = data['tasks'][0]['result'][0]['items']
        for item in items:
            # "url" can be present but null; treat that as an empty string so
            # the scan continues with the remaining items.
            link = item.get("url") or ""
            if "linkedin.com/company" in link:
                return link
    except Exception as e:
        print("LinkedIn scrape error:", e)
    return "Not Found"

def _parse_employee_count(raw):
    """Convert captured head-count text such as "164,000" or "1.5k" to a float.

    Raises:
        ValueError: If the text is not a number once separators and the
            k/m/b suffix are removed.
    """
    cleaned = raw.replace(",", "").strip().lower()
    for suffix, factor in (("k", 1_000), ("m", 1_000_000), ("b", 1_000_000_000)):
        if suffix in cleaned:
            return float(cleaned.replace(suffix, "")) * factor
    return float(cleaned)


def get_revenue_and_employees(company_name, username, password, timeout=30):
    """Extract a revenue figure and an employee estimate from search snippets.

    Searches DataForSEO for "<company_name> revenue and employees" and scans
    each item's description (or title) with regular expressions.

    Args:
        company_name: Company to look up.
        username: DataForSEO login used for HTTP basic auth.
        password: DataForSEO password used for HTTP basic auth.
        timeout: Seconds to wait for the HTTP response; ``requests`` does not
            time out on its own.

    Returns:
        A ``(revenue, employees)`` tuple of strings. ``revenue`` is the first
        dollar amount found, or ``"Not Found"``. ``employees`` is
        ``"~N employees"`` for the most frequently mentioned count (the larger
        one on a tie), or ``"Approximate size not found"``. Both are
        ``"Not Found"`` when the request fails (the error is printed).
    """
    url = "https://api.dataforseo.com/v3/serp/google/organic/live/advanced"
    payload = [{
        "keyword": f"{company_name} revenue and employees",
        "location_name": "United States",
        "language_code": "en",
        "device": "desktop"
    }]

    # The unit is only accepted as a whole word, so "$100 more" yields "$100"
    # rather than "$100 m", and no trailing whitespace is captured.
    revenue_pattern = re.compile(
        r"\$\d+(?:[.,]\d+)*(?:\s*(?:trillion|billion|million|bn|B|M)\b)?", re.IGNORECASE
    )
    employee_pattern = re.compile(
        r"\b(?:over\s*)?([0-9,.]+(?:\s*[kKmMbB])?)\s*(?:employees|people|staff|personnel)\b", re.IGNORECASE
    )

    try:
        res = requests.post(url, auth=HTTPBasicAuth(username, password), json=payload, timeout=timeout)
        data = res.json()
        items = data['tasks'][0]['result'][0]['items']

        revenue = None
        employee_counts = []

        for item in items:
            # A field can be present but null; fall back to "" so one such item
            # does not raise and discard everything gathered so far.
            text = item.get("description") or item.get("title") or ""

            if not revenue:
                rev_match = revenue_pattern.search(text)
                if rev_match:
                    revenue = rev_match.group()

            for emp_raw in employee_pattern.findall(text):
                try:
                    count = _parse_employee_count(emp_raw)
                except ValueError:
                    # Captures such as "." or "," are not numbers; skip them.
                    continue
                # Only counts between 1,000 and 1,000,000 are kept.
                if 1_000 <= count <= 1_000_000:
                    employee_counts.append(int(count))

        if employee_counts:
            # Most frequent count wins; ties (including "every count appears
            # once") go to the largest value, which keeps the pick deterministic.
            best_emp_estimate = max(
                set(employee_counts),
                key=lambda value: (employee_counts.count(value), value),
            )
            employees = f"~{best_emp_estimate:,} employees"
        else:
            employees = "Approximate size not found"

        return revenue or "Not Found", employees

    except Exception as e:
        print("Revenue/Employees extract error:", e)
        return "Not Found", "Not Found"

# === FINAL: Lead Generator ===
def get_lead_info(company_name):
    """Gather website, e-mails, CEO, LinkedIn URL, revenue and head-count for one company.

    Each lookup is delegated to the helper functions in this file, and its
    result is printed as soon as it is available.

    Args:
        company_name: Company to research.

    Returns:
        A dict with the keys ``company``, ``website``, ``emails``,
        ``ceo_name``, ``linkedin_url``, ``revenue`` and ``employees``.
    """
    print(f"\n🔍 Processing: {company_name}")

    site_url = get_website_from_serper(company_name)
    print("🌐 Website:", site_url)

    contact_emails = []
    if site_url:
        host = urlparse(site_url).netloc
        if host:
            contact_emails = get_email_from_hunter(host)
        # The site itself is scraped only when Hunter produced nothing.
        if not contact_emails:
            contact_emails = extract_emails_from_website(site_url)
    print("📧 Emails:", contact_emails)

    lead = {
        "company": company_name,
        "website": site_url,
        "emails": contact_emails,
    }

    lead["ceo_name"] = get_ceo_from_dataforseo(company_name)
    print("👤 CEO:", lead["ceo_name"])

    lead["linkedin_url"] = get_linkedin_from_dataforseo(company_name)
    print("🔗 LinkedIn:", lead["linkedin_url"])

    lead["revenue"], lead["employees"] = get_revenue_and_employees(
        company_name, DATAFORSEO_USERNAME, DATAFORSEO_PASSWORD
    )
    print("💰 Revenue:", lead["revenue"])
    print("👥 Employees:", lead["employees"])

    return lead

# === Optional: Batch Processing with Retry & Delay ===
def process_companies(companies, delay=2):
    """Run get_lead_info for every company, retrying the head-count lookup once.

    Args:
        companies: Iterable of company names.
        delay: Seconds to pause between two companies to prevent API overload.
            No pause follows the last company.

    Returns:
        A list with one lead dict per company, in input order.
    """
    missing_markers = ("Approximate size not found", "Not Found")
    all_results = []
    for index, company in enumerate(companies):
        if index:
            time.sleep(delay)  # prevent API overload

        result = get_lead_info(company)

        # Retry revenue/employees if not found
        if result["employees"] in missing_markers:
            print("🔁 Retrying employee data...")
            revenue, employees = get_revenue_and_employees(company, DATAFORSEO_USERNAME, DATAFORSEO_PASSWORD)
            # Do not let a failed retry wipe out a revenue figure that the
            # first attempt already found.
            if revenue != "Not Found":
                result["revenue"] = revenue
            result["employees"] = employees

        all_results.append(result)
    return all_results


Device set to use cpu


In [10]:
# Company names to research; process_companies() looks each one up in turn.
companies = ["Apple", "Microsoft","Zomato"]
# `results` holds one lead dict per company and is what the CSV-export cell reads.
results = process_companies(companies)



🔍 Processing: Apple
🌐 Website: https://www.apple.com/
📧 Emails: ['rayna_schwartz@apple.com', 'missy_pool@apple.com', 'ashish.sharma@apple.com', 'hkrishnamurthy@apple.com', 'fkhan@apple.com', 'aminah_charles@apple.com', 'vishesh_yadav@apple.com', 'xiaoge_su@apple.com', 'anna_m@apple.com', 'lpan@apple.com']
👤 CEO: Tim Cook
🔗 LinkedIn: https://www.linkedin.com/company/apple
💰 Revenue: $391.04 billion
👥 Employees: ~164,000 employees

🔍 Processing: Microsoft
🌐 Website: https://www.microsoft.com/
📧 Emails: ['vinod.kumar@microsoft.com', 'carlos.fernandez@microsoft.com', 'pamela.almaguer@microsoft.com', 'rolf.harms@microsoft.com', 'patrick.larkin@microsoft.com', 'denise.begley@microsoft.com', 'mark.jacobson@microsoft.com', 'richard.rundle@microsoft.com', 'tomsmith@microsoft.com', 'lianw@microsoft.com']
👤 CEO: Satya Narayana Nadella
🔗 LinkedIn: https://www.linkedin.com/company/microsoft
💰 Revenue: $40.9 billion
👥 Employees: ~228,000 employees

🔍 Processing: Zomato
🌐 Website: https://www.zomato