In [None]:
#install
!pip install google-generativeai==0.7.2
!pip install langchain-community
!pip install crewai
!pip install requests
!pip install beautifulsoup4
!pip install pandas

Collecting protobuf (from google-generativeai==0.7.2)
  Using cached protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Using cached protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 5.29.5
    Uninstalling protobuf-5.29.5:
      Successfully uninstalled protobuf-5.29.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opentelemetry-proto 1.34.1 requires protobuf<6.0,>=5.0, but you have protobuf 4.25.8 which is incompatible.
ydf 0.12.0 requires protobuf<6.0.0,>=5.29.1, but you have protobuf 4.25.8 which is incompatible.[0m[31m
[0mSuccessfully installed protobuf-4.25.8


Collecting protobuf (from onnxruntime==1.22.0->crewai)
  Using cached protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Using cached protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.25.8
    Uninstalling protobuf-4.25.8:
      Successfully uninstalled protobuf-4.25.8
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-ai-generativelanguage 0.6.6 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 5.29.5 which is incompatible.[0m[31m
[0mSuccessfully installed protobuf-5.29.5




In [1]:
import os
import re
import json
import pandas as pd
import time
from dask import delayed, compute
from langchain_community.utilities import GoogleSerperAPIWrapper
import google.generativeai as genai

# Setup
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY", "AIzaSyB_zRgfdmObMachU5qkrNM-dAoG9Ae4Qgk")
os.environ["SERPER_API_KEY"] = os.getenv("SERPER_API_KEY", "4d17b38723a34498589c3edd102b7b4b175b65f3")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

search = GoogleSerperAPIWrapper()
gemini_model = genai.GenerativeModel('gemini-1.5-flash')

# Utils
def extract_json(text):
    cleaned = re.sub(r"^```json\s*|```$", "", text.strip(), flags=re.MULTILINE)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        return {}

def format_name(name):
    name = re.sub(r'\s+', ' ', str(name).replace('\n', ' ')).strip().strip('"').strip("'")
    corp_terms = ['llc', 'inc', 'corp', 'ltd', 'company']
    if any(term in name.lower() for term in corp_terms):
        return name
    if ',' in name and name.count(',') == 1:
        last, first = name.split(',', 1)
        name = f"{first.strip()} {last.strip()}"
    return name

def clean_dataframe(df):
    df = df.copy()
    df['Franchisee'] = df['Franchisee'].apply(format_name)
    df['State'] = df['City'].str.strip() + ', ' + df['State'].str.strip()
    if 'FDD' in df.columns:
        df['Franchise Name'] = df['FDD']
    return df.dropna(axis=1, how='all')

def classify_franchisees(names):
    prompt = f"""Classify the following names as 'Individual' or 'Corporate':\n{names}\n\nRules:\n- Individual = Person name (John Doe)\n- Corporate = Includes LLC, Inc, Ltd, Corp, Company, etc.\n\nReturn only valid JSON, in this format:\n[{{"name": "Name", "type": "Individual/Corporate"}}]\nNo explanation. No markdown. If unknown, return []."""
    response = gemini_model.generate_content(prompt)
    return pd.DataFrame(extract_json(response.text.strip()))

def search_web(query: str):
    return search.results(query)

def safe_enrich(func, *args, retries=5, delay=5, **kwargs):
    for attempt in range(retries):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            if "429" in str(e) or "TooManyRequests" in str(e):
                print(f"[429] Rate limit hit. Retrying in {delay} seconds...")
                time.sleep(delay)
                delay *= 2
            else:
                print(f"[ERROR] {e}. Retrying in {delay} seconds...")
                time.sleep(delay)
    return {}

def enrich_individual(name, franchise, state):
    query1 = f"What company does {name} own that holds a {franchise} franchise in {state}?"
    snippet = search_web(query1)
    prompt1 = f"From this text, return company name owned by {name} for {franchise}. If not found, return '{name}'.\n{snippet}"
    legal_name = gemini_model.generate_content(prompt1).text.strip()

    query2 = f"Details of {legal_name} in {state}, include address, phone, email."
    snippet2 = search_web(query2)
    extract_prompt = f"""From this data, extract:
{{
  "legal_corporate_name": "{legal_name}",
  "corporate_address": "",
  "corporate_phone": "",
  "corporate_email": "",
  "owner_name": "{name}",
  "linkedin_url": "",
  "Source URLs used for enrichment": ""
}}
Return JSON only. No markdown or text.
{snippet2}"""
    result = gemini_model.generate_content(extract_prompt).text.strip()
    return extract_json(result)

def enrich_corporate(name, state):
    query = f"Who owns or manages {name} in {state}?"
    snippet = search_web(query)
    extract_prompt = f"""From this data, extract:
{{
  "legal_corporate_name": "{name}",
  "corporate_address": "",
  "corporate_phone": "",
  "corporate_email": "",
  "owner_name": "",
  "linkedin_url": "",
  "Source URLs used for enrichment": ""
}}
Return JSON only. No markdown or text.
{snippet}"""
    result = gemini_model.generate_content(extract_prompt).text.strip()
    return extract_json(result)

@delayed
def enrich_row(row_dict):
    name = row_dict["Franchisee"]
    state = row_dict["State"]
    franchise = row_dict.get("Franchise Name", "")

    try:
        entity_type_df = classify_franchisees([name])
        entity_type = entity_type_df['type'].values[0] if not entity_type_df.empty else "Unknown"

        if entity_type == "Individual":
            enriched = safe_enrich(enrich_individual, name, franchise, state)
        elif entity_type == "Corporate":
            enriched = safe_enrich(enrich_corporate, name, state)
        else:
            enriched = {}
    except Exception:
        enriched = {}

    defaults = {
        "legal_corporate_name": name,
        "corporate_address": "N/A",
        "corporate_phone": "N/A",
        "corporate_email": "N/A",
        "owner_name": name,
        "linkedin_url": "N/A",
        "Source URLs used for enrichment": "N/A",
        "Type": entity_type if 'entity_type' in locals() else "Unknown"
    }
    enriched = {**defaults, **enriched}
    return {**row_dict, **enriched}

def run_dask_enrichment(df):
    df = clean_dataframe(df)
    tasks = [enrich_row(row._asdict()) for row in df.itertuples(index=False)]
    results = compute(*tasks, scheduler='threads', num_workers=3)
    return pd.DataFrame(results)

if __name__ == "__main__":
    input_df = pd.read_excel("/content/Gra_1_50.xlsx")
    enriched_df = run_dask_enrichment(input_df)
    enriched_df.to_excel("enriched_franchisees_dask.xlsx", index=False)
    print("Saved to enriched_franchisees_dask.xlsx")

  from .autonotebook import tqdm as notebook_tqdm


FileNotFoundError: [Errno 2] No such file or directory: '/content/Gra_1_50.xlsx'