# Setup and Library Imports

In [94]:
pip install gradio pandas scikit-learn sentence-transformers Faker fuzzywuzzy python-Levenshtein

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [95]:
import pandas as pd
import numpy as np
from faker import Faker
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process
import gradio as gr

# Preparing Data

In [96]:
def load_dataset(filename="/kaggle/input/icp-dataset/static_lead_dataset_10000.csv"):
    """Read the static lead dataset from a CSV file.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the file does not
        exist (an error line is printed in that case instead of raising).
    """
    print(f"--- Part 1: Loading Static Dataset from '{filename}' ---")
    try:
        leads = pd.read_csv(filename)
    except FileNotFoundError:
        # Missing file is reported, not raised; callers must check for None.
        print(f"Error: Dataset file not found at '{filename}'")
        return None
    else:
        print(f"Dataset successfully loaded with {len(leads)} rows.\n")
        return leads

def rank_leads_by_similarity(prospects_df, pipeline):
    """Score each prospect by cosine similarity to an ICP profile and sort by that score.

    The ICP ("ideal customer profile") is approximated from the first two rows of
    ``prospects_df``: every row whose ``company_name`` matches one of those two
    names contributes its feature vector, and the mean of those vectors is the
    target that all rows are compared against.

    Args:
        prospects_df: DataFrame with ``company_name``, ``description``,
            ``employee_count`` and ``annual_revenue_millions_usd`` columns.
        pipeline: Mapping with an ``'sbert'`` entry exposing ``encode(list_of_str)``
            and a ``'scaler'`` entry exposing ``transform(frame)``.

    Returns:
        A new DataFrame (the input is not modified) with an added ``icp_score``
        column, sorted by descending score with a fresh 0..n-1 index. The column
        is always present, including for empty and single-row inputs, so callers
        can select it unconditionally.
    """
    ranked_df = prospects_df.copy()

    # Nothing to encode or compare: still return the expected schema.
    if prospects_df.empty:
        ranked_df['icp_score'] = np.empty(0, dtype=float)
        return ranked_df.reset_index(drop=True)

    # With a single prospect the ICP is that prospect itself.
    num_to_sample = min(2, len(prospects_df))
    user_selected_icp_names = prospects_df.head(num_to_sample)['company_name'].tolist()

    # Feature layout is [scaled numeric columns | text embedding], one row per prospect.
    text_vectors = pipeline['sbert'].encode(prospects_df['description'].tolist())
    num_vectors = pipeline['scaler'].transform(prospects_df[['employee_count', 'annual_revenue_millions_usd']])
    all_vectors = np.hstack((num_vectors, text_vectors))

    # Select ICP rows by position with a boolean mask. Going through index labels
    # would pick the wrong rows whenever the frame's index is not unique.
    is_icp_row = prospects_df['company_name'].isin(user_selected_icp_names).to_numpy()
    icp_target_vector = all_vectors[is_icp_row].mean(axis=0).reshape(1, -1)

    similarity_scores = cosine_similarity(icp_target_vector, all_vectors)
    ranked_df['icp_score'] = similarity_scores[0]
    return ranked_df.sort_values(by='icp_score', ascending=False).reset_index(drop=True)

In [97]:
print("Performing model setup... Please wait.")
full_dataset = load_dataset()
# load_dataset() signals a missing file by returning None. Stop here with a clear
# message rather than letting later cells fail on attribute access against None.
if full_dataset is None:
    raise FileNotFoundError(
        "Lead dataset could not be loaded; check the path passed to load_dataset()."
    )

Performing model setup... Please wait.
--- Part 1: Loading Static Dataset from '/kaggle/input/icp-dataset/static_lead_dataset_10000.csv' ---
Dataset successfully loaded with 10000 rows.



In [98]:
# Preview the first ten rows to sanity-check the loaded columns and values.
full_dataset.head(10)

Unnamed: 0,company_name,industry,employee_count,annual_revenue_millions_usd,description,is_good_lead
0,"White, Roy and Gonzalez Inc.",Public Relations,191,172,A Public Relations company focused on repurpos...,1
1,"Shaw, Davis and Murphy Solutions",Cloud Computing,5,2,A Cloud Computing company focused on engage di...,0
2,Moss LLC LLC,Venture Capital,5,3,A Venture Capital company focused on generate ...,0
3,Griffith LLC LLC,Medical Devices,6,3,A Medical Devices company focused on seize use...,0
4,Johnson-Butler Inc.,Oil & Gas,331,99,A Oil & Gas company focused on benchmark missi...,1
5,Bradley PLC Inc.,Consulting,697,7,A Consulting company focused on streamline e-b...,1
6,Campbell-Ferguson Inc.,Cybersecurity,708,117,A Cybersecurity company focused on engage dist...,1
7,Perry-Schneider LLC,Renewable Energy,5,3,A Renewable Energy company focused on innovate...,0
8,Macias PLC Solutions,Automotive,55,145,A Automotive company focused on seize B2B supp...,1
9,Brown and Sons LLC,Publishing,420,116,A Publishing company focused on re-intermediat...,1


# EDA

# Model Training

In [99]:
# Train First Part Model
# Text features: sentence embeddings of each company description.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
# Pass a plain list of strings, matching the other encode() call sites in this
# notebook, so encoding does not depend on how the DataFrame happens to be indexed.
text_features = sbert_model.encode(full_dataset['description'].tolist())

# Numeric features: scaled to [0, 1]; the fitted scaler is reused at inference time.
numerical_features_df = full_dataset[['employee_count', 'annual_revenue_millions_usd']]
scaler = MinMaxScaler()
numerical_features_scaled = scaler.fit_transform(numerical_features_df)

# Column order is [numeric | text]; inference code must stack features the same way.
X = np.hstack((numerical_features_scaled, text_features))
y = full_dataset['is_good_lead']
# NOTE(review): the classifier is fit on the full dataset with no held-out split,
# so nothing in this cell measures how well it generalizes.
classifier_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
classifier_model.fit(X, y)

# Save Component
# Bundle the fitted pieces so downstream functions receive them as one object.
PIPELINE = {'sbert': sbert_model, 'scaler': scaler, 'model': classifier_model}
print("Setup complete. Gradio app is starting...")

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Setup complete. Gradio app is starting...


# Model Evaluation

# Input/Output Program

In [100]:
def find_leads(icp_industries_str):
    """
    The main function called by Gradio.
    Takes user input, runs the hybrid model, and returns a Markdown table.

    Args:
        icp_industries_str: Comma-separated industry names typed by the user.
            ``None``, an empty string, or input containing only blanks/commas
            is treated as "nothing entered".

    Returns:
        A Markdown table of ranked leads (company, industry, size, revenue,
        description, ``icp_score``), or a short message string when there is
        nothing to show.
    """
    # Process user input into a list, dropping blank entries so inputs such as
    # "SaaS, ,AI" or "  " never reach the fuzzy matcher as empty queries.
    user_target_industries = [
        item.strip() for item in (icp_industries_str or "").split(',') if item.strip()
    ]
    if not user_target_industries:
        return "Please enter at least one industry to begin."

    # Simulate Scraping based on user input with fuzzy matching.
    # The candidate list does not change between terms, so compute it once.
    known_industries = full_dataset['industry'].unique()
    matched_industries = []
    for user_industry in user_target_industries:
        best_match, score = process.extractOne(user_industry, known_industries)
        # 80 is the minimum fuzzy-match score accepted as "the user meant this industry".
        if score >= 80 and best_match not in matched_industries:
            matched_industries.append(best_match)

    if not matched_industries:
        return f"Could not find any matching industries for: `{icp_industries_str}`"

    # Filter first, then copy only the matching rows: the copy gives an independent
    # frame so the prediction column below is not written onto a slice.
    scraped_leads_df = full_dataset[full_dataset['industry'].isin(matched_industries)].copy()

    if scraped_leads_df.empty:
        return f"No companies found for industries: `{', '.join(matched_industries)}`"

    # Stage 1: Filter with the classification model.
    # Feature order must match training: [scaled numeric | text embedding].
    vectors_scraped = np.hstack((
        PIPELINE['scaler'].transform(scraped_leads_df[['employee_count', 'annual_revenue_millions_usd']]),
        PIPELINE['sbert'].encode(scraped_leads_df['description'].tolist())
    ))
    scraped_leads_df['initial_filter_result'] = PIPELINE['model'].predict(vectors_scraped)

    filtered_leads_df = (
        scraped_leads_df[scraped_leads_df['initial_filter_result'] == 1]
        .reset_index(drop=True)
    )

    if filtered_leads_df.empty:
        return "After the initial AI filter, no relevant prospects were found. Try a broader search."

    # Stage 2: Personalized ranking
    final_ranked_prospects = rank_leads_by_similarity(filtered_leads_df, PIPELINE)

    # The ranking step may hand back the frame unscored when there are too few
    # prospects to compare. A lone prospect is its own reference profile, so give
    # it the maximum score instead of failing on the column selection below.
    if 'icp_score' not in final_ranked_prospects.columns:
        final_ranked_prospects = final_ranked_prospects.assign(icp_score=1.0)

    # Return the final result as a Markdown formatted table
    return final_ranked_prospects[['company_name', 'industry', 'employee_count', 'annual_revenue_millions_usd', 'description', 'icp_score']].to_markdown(index=False)

# User Interface

In [101]:
# Dark navy theme for the app. Each light/dark pair below is given the same value,
# so the page looks identical regardless of the viewer's colour-scheme setting.
saas_theme = gr.themes.Base(
    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
    primary_hue=gr.themes.colors.blue,
    secondary_hue=gr.themes.colors.blue,
    neutral_hue=gr.themes.colors.gray,
).set(
    # Page body
    body_background_fill="#0A0F1E",
    body_background_fill_dark="#0A0F1E",
    body_text_color="#FFFFFF",
    body_text_color_dark="#FFFFFF",
    body_text_weight="400",
    # Generic surfaces
    background_fill_primary="#1C2A4A",
    background_fill_primary_dark="#1C2A4A",
    # Blocks (cards/panels)
    block_background_fill="#1C2A4A",
    block_background_fill_dark="#1C2A4A",
    block_border_width="0px",
    block_radius="8px",
    block_label_text_color="#FFFFFF",
    # Borders use the panel colour so they blend into the surface
    border_color_accent="#1C2A4A",
    border_color_accent_dark="#1C2A4A",
    border_color_primary="#1C2A4A",
    border_color_primary_dark="#1C2A4A",
    color_accent_soft_dark="transparent",
    # Inputs and primary button
    input_background_fill="#2A3B5A",
    button_primary_background_fill="#3B82F6",
    button_primary_text_color="#FFFFFF",
)

# Single-page UI: hero banner on top, a search row, and the results area below.
with gr.Blocks(theme=saas_theme) as demo:
    # Static hero banner (title + tagline).
    gr.HTML("""
    <div style="text-align: center; max-width: 800px; margin: 0 auto;">
        <h1 style="font-size: 3em; color: #FFFFFF; font-weight: 700;">
            Transform Your Lead Generation
        </h1>
        <p style="font-size: 1.2em; color: #A0AEC0; margin-bottom: 20px;">
            Enter your target industries to find and rank potential leads using our AI-powered model.
        </p>
    </div>
    """)

    # Search row: the text box takes three quarters of the width, the button one.
    with gr.Row():
        industry_input = gr.Textbox(
            scale=3,
            label="Target Industries",
            placeholder="e.g., SaaS, Fintech, AI (separate with comma)",
        )
        submit_button = gr.Button(value="Find Leads", scale=1, variant="primary")

    # find_leads returns a Markdown string, rendered here.
    output_markdown = gr.Markdown(label="Ranked Leads")

    # Clicking the button sends the text box contents through find_leads.
    submit_button.click(find_leads, industry_input, output_markdown)

# Start the web server when the cell/script is executed directly.
if __name__ == "__main__":
    demo.launch()

* Running on local URL:  http://127.0.0.1:7873
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://0611c63173442f8f21.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]