In [None]:
# # %%
# # Install required packages (if you haven't)
# import sys
# !{sys.executable} -m pip install -q openai tiktoken python-dotenv tqdm pandas pinecone fitz

In [None]:
# import sys
# !{sys.executable} -m pip install pymupdf

In [None]:
# import fitz
# print(fitz.__doc__)

In [24]:
# %%
import os
import re
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone
from tqdm import tqdm
import fitz  # PyMuPDF
from dateutil.parser import parse
from datetime import datetime
from IPython.display import display, Markdown

In [2]:
# %%
# Load environment variables
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")

assert OPENAI_API_KEY, "Missing OPENAI_API_KEY"
assert PINECONE_API_KEY, "Missing PINECONE_API_KEY"
assert PINECONE_ENV, "Missing PINECONE_ENV"

In [3]:
# %%
# Initialize clients
client = OpenAI(api_key=OPENAI_API_KEY)
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
index_name = "funding-search"

# Check/create index if needed
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # OpenAI text-embedding-3-small dimension
        metric="cosine",
        environment=PINECONE_ENV,
    )
index = pc.Index(index_name)

In [4]:
# %%
# Load full funding dataset (for keyword backup and relevance scoring)
full_df = pd.read_csv("/Users/kiranmulawad/AI-Funding/2_preprocessing/data/merged_funding_data.csv")

In [5]:
# %%
# (Optional) Extract PDF profile text function, if using PDF profile to generate queries
def extract_text_from_pdf(pdf_path: str) -> str:
    doc = fitz.open(pdf_path)
    return "\n".join([page.get_text() for page in doc])

# Example usage:
# pdf_path = "/path/to/sample_user_profile.pdf"
# pdf_text = extract_text_from_pdf(pdf_path)
# print("PDF Text Sample:", pdf_text[:500])


In [6]:
# %%
# You can either generate a query automatically from user profile text or set manually
# Example manual query string (replace with dynamic generation if you wish)
query = "We are an AI company focused on AI for robotics. We are focusing on research right now."
user_location = "Rhineland-Palatinate"  # Or adapt as needed

In [7]:
# %%
# Step 1: Embed the query with OpenAI embedding model
query_embedding = client.embeddings.create(
    input=[query],
    model="text-embedding-3-small"
).data[0].embedding

In [8]:
# %%
# Step 2: Semantic search with Pinecone
namespace = "openai-v3"  # use your correct namespace here
top_k = 5

semantic_matches = index.query(
    vector=query_embedding,
    top_k=top_k,
    include_metadata=True,
    namespace=namespace
)

In [9]:
# %%
# Step 3: Check semantic score, fallback to keyword filter if too low
top_score = max(match["score"] for match in semantic_matches["matches"]) if semantic_matches["matches"] else 0

def hybrid_keyword_filter(df, query, top_k=5):
    keywords = set(re.findall(r'\b\w+\b', query.lower()))
    matched_rows = []
    for _, row in df.iterrows():
        combined_text = " ".join([
            str(row.get(col, "")).lower()
            for col in ["name", "description", "domain", "eligibility"]
        ])
        if any(word in combined_text for word in keywords):
            matched_rows.append(row)
    return pd.DataFrame(matched_rows).head(top_k)

if top_score < 0.6:
    print("⚠️ Low semantic match score (<0.6), switching to keyword fallback...")
    results_df = hybrid_keyword_filter(full_df, query, top_k=top_k)
else:
    results_df = pd.DataFrame([m["metadata"] for m in semantic_matches["matches"]])

⚠️ Low semantic match score (<0.6), switching to keyword fallback...


In [18]:
matches = [m["metadata"] for m in semantic_matches["matches"]]

In [None]:
import numpy as np
from dateutil import parser
from datetime import datetime

def safe_parse_deadline(deadline_str):
    try:
        if pd.isna(deadline_str) or deadline_str.strip() == "":
            return None
        # Parse with dayfirst=True and fuzzy parsing to handle many date formats
        return parser.parse(deadline_str, dayfirst=True, fuzzy=True)
    except Exception:
        return None

# Suppose `semantic_matches` is from your Pinecone query result
# e.g. semantic_matches = index.query(...)

print(f"Total Pinecone matches: {len(semantic_matches['matches'])}")

# Build DataFrame from metadata extracted from Pinecone matches
results_df = pd.DataFrame([m["metadata"] for m in semantic_matches["matches"]])

print(f"Rows before cleaning: {len(results_df)}")

# Replace placeholders indicating missing data with NaN for better handling
placeholder_values = [
    "deadline information not found",
    "amount information not found",
    "contact information not found",
    "procedure information not found",
    "location information not found",
    ""  # empty string too
]

results_df["deadline"] = results_df["deadline"].replace(placeholder_values, np.nan)
results_df["deadline"] = results_df["deadline"].infer_objects(copy=False)

# Parse deadlines
results_df["deadline_date"] = results_df["deadline"].apply(safe_parse_deadline)

# Convert to pandas datetime to ensure proper dtype and facilitate date operations
results_df["deadline_date"] = pd.to_datetime(results_df["deadline_date"], errors="coerce")

# Calculate days left until deadline
results_df["days_left"] = (results_df["deadline_date"] - datetime.now()).dt.days

print("Deadline parsing preview:")
print(results_df[["deadline", "deadline_date", "days_left"]].head())

# Filter rows: keep those with missing deadlines or with deadline in the future (days_left >= 0)
results_df = results_df[
    (results_df["days_left"].isna()) | (results_df["days_left"] >= 0)
]

print(f"Rows after filtering deadlines (keep missing + future): {len(results_df)}")

if results_df.empty:
    print("⚠️ Warning: No relevant rows remain after deadline filtering.")
else:
    print(results_df.head())


Total Pinecone matches: 5
Rows before cleaning: 5
Deadline parsing preview:
        deadline deadline_date  days_left
0  July 16, 2025    2025-07-16      -14.0
1            NaN           NaT        NaN
2  June 04, 2025    2025-06-04      -56.0
3            NaN           NaT        NaN
4  June 12, 2025    2025-06-12      -48.0
Rows after filtering deadlines (keep missing + future): 2
                         amount  \
1  amount information not found   
3  amount information not found   

                                             contact deadline  \
1  Name: Projektträger Jülich (PtJ), Email: ptj@f...      NaN   
3  Name: DLR Projektträger, Email: ge@dlr.de, Pho...      NaN   

                                         description  \
1  Guideline for the funding of projects on the t...   
3  Guideline for the funding of interdisciplinary...   

                                              domain  \
1  Research & Innovation (topic-specific), Health...   
3             Research & Innova

In [20]:
# %%
# Step 5: Compute custom relevance scores, including location boost
def compute_relevance_score(row, query, funding_need=200000, target_domain="AI", user_location="Rhineland-Palatinate"):
    score = 0.0

    # Domain match boost
    if target_domain.lower() in str(row.get("domain", "")).lower():
        score += 0.4

    # Funding amount boost
    try:
        amount_val = int(re.sub(r'[^\d]', '', str(row.get("amount", "0"))))
        if amount_val >= funding_need:
            score += 0.3
    except:
        pass

    # Deadline relevance boost
    if "month" in str(row.get("deadline", "")).lower() or "2025" in str(row.get("deadline", "")):
        score += 0.2

    # Keyword presence in description
    if any(word.lower() in str(row.get("description", "")).lower() for word in query.split()):
        score += 0.1

    # User location boost
    if user_location.lower() in str(row.get("location", "")).lower():
        score += 0.1

    return round(score * 100)

results_df["relevance_score"] = results_df.apply(
    lambda r: compute_relevance_score(r, query, user_location=user_location),
    axis=1
)
results_df = results_df.sort_values(by="relevance_score", ascending=False)

matches = results_df.to_dict(orient="records")

In [13]:
print(results_df.head())

                         amount  \
1  amount information not found   
3  amount information not found   

                                             contact deadline  \
1  Name: Projektträger Jülich (PtJ), Email: ptj@f...      NaN   
3  Name: DLR Projektträger, Email: ge@dlr.de, Pho...      NaN   

                                         description  \
1  Guideline for the funding of projects on the t...   
3  Guideline for the funding of interdisciplinary...   

                                              domain  \
1  Research & Innovation (topic-specific), Health...   
3             Research & Innovation (topic-specific)   

                                         eligibility  \
1  University, research institution, company, ass...   
3          Research institution, university, company   

                                 id    location  \
1  cd21459049ca19b22a72fffcda329c79  Nationwide   
3  7f11351b7489d381f3c2e474b06d25eb  Nationwide   

                                     

In [21]:
def generate_structured_funding_blocks(matches, user_query: str) -> str:
    formatted_blocks = []

    field_aliases = {
        "Amount": ["amount", "how much", "funding", "money"],
        "Deadline": ["deadline", "last date", "until", "submission date"],
        "Eligibility": ["eligible", "eligibility", "who can apply"],
        "Procedure": ["procedure", "how to apply", "application", "steps", "process"],
        "Contact": ["contact", "email", "person", "support"],
    }

    for idx, meta in enumerate(matches, start=1):
        # 'meta' is directly the dict with funding info
        name = meta.get("name", "Unnamed")
        fields = {
            "Description": meta.get("description"),
            "Domain": meta.get("domain"),
            "Eligibility": meta.get("eligibility"),
            "Amount": meta.get("amount"),
            "Deadline": meta.get("deadline"),
            "Procedure": meta.get("procedure"),
            "Location": meta.get("location"),
            "Contact": meta.get("contact"),
            "URL": meta.get("url"),
            "Source": meta.get("source", "Unknown")
        }
        block = f"**{idx}. {name}**\n"
        for key, val in fields.items():
            if key == "Deadline":
                deadline_val = val
                if deadline_val is None or pd.isna(deadline_val) or "not found" in str(deadline_val).lower():
                    deadline_str = "Not specified"
                else:
                    days_left = meta.get("days_left", None)
                    if pd.notnull(days_left):
                        deadline_str = f"{deadline_val} (🕒 {int(days_left)} days left)"
                    else:
                        deadline_str = deadline_val
                block += f"   - **Deadline**: {deadline_str}\n"
                continue
            if val and "not found" not in str(val).lower():
                block += f"   - **{key}**: {val}\n"
        if fields["URL"]:
            block += f"   - **For more information visit**: {fields['URL']}\n"
        formatted_blocks.append(block + "\n")  # <-- Add blank line between blocks
    return "\n".join(formatted_blocks)


In [25]:
semantic_output = generate_structured_funding_blocks(matches, query)
# Display the structured output
display(Markdown(semantic_output))

**1. Funding for projects on the topic of "Application of artificial intelligence (AI) in drug discovery"**
   - **Description**: Guideline for the funding of projects on the topic of "Application of artificial intelligence (AI) in drug discovery" dated: 19.02.2025 Federal Ministry of Education and Research BAnz AT 03.03.2025 B5
   - **Domain**: Research & Innovation (topic-specific), Health & Social Affairs
   - **Eligibility**: University, research institution, company, association/association
   - **Deadline**: Not specified
   - **Location**: Nationwide
   - **Contact**: Name: Projektträger Jülich (PtJ), Email: ptj@fz-juelich.de, Phone: Tel: +49 2461 61-2626, Address: 52425 Jülich
   - **URL**: https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/BMBF/anwendung-ki-wirkstoffforschung.html
   - **Source**: foerderdatenbank
   - **For more information visit**: https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/BMBF/anwendung-ki-wirkstoffforschung.html


**2. Promotion of interdisciplinary pilot projects on the topic of "Neurobiologically inspired artificial intelligence"**
   - **Description**: Guideline for the funding of interdisciplinary pilot projects on the topic of "Neurobiologically inspired artificial intelligence" dated: 25.11.2024 Federal Ministry of Education and Research BAnz AT 04.12.2024 B5
   - **Domain**: Research & Innovation (topic-specific)
   - **Eligibility**: Research institution, university, company
   - **Deadline**: Not specified
   - **Location**: Nationwide
   - **Contact**: Name: DLR Projektträger, Email: ge@dlr.de, Phone: Tel: +49 0228 3821-1210, Address: 53227 Bonn
   - **URL**: https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/BMBF/bmbf-neurobiologisch-inspirierte-ki.html
   - **Source**: foerderdatenbank
   - **For more information visit**: https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/BMBF/bmbf-neurobiologisch-inspirierte-ki.html



In [26]:
# %%
# Prepare prompt for GPT recommendation generation (you can customize further)
llm_prompt = f"""
The company described itself as:

"{query}"

Here are the top 5 most relevant public funding programs in Germany, based on a semantic search match to their needs:

{semantic_output}

Now:

Please write a concise and professional recommendation containing **only the top 2–3 most relevant funding programs** in this format:

Only select the top programs that most directly match the company’s domain, maturity stage (e.g., early-stage research), or funding needs. Ignore entries that are vague or poorly aligned.

For each recommendation, follow this format exactly:

1. <Program Name> (Source) 
**Why it fits**: <1–2 lines explaining relevance to the company’s domain (or) industry (or) field of work>  
**Description**: <Brief summary of the program’s goal and what it funds>
**Domain**: <Domain>      
**Eligibility**: <Eligibility>  
**Amount**: <Amount>  
**Deadline**: <Deadline (date or timeframe)>
**Location**: <Location or applicable regions>  
**Contact**: <Contact person, email, or organization>  
**Next Steps**:  
- Step 1: [Visit the (source) official page:]({{url}})  
- Step 2: <One key action the company must take>  
- Step 3: <Another action (e.g., submit proposal, form consortium)>  

If any field like **Amount**, **Deadline**, **Eligibility**, **Procedure**, or **Contact** is missing, either omit the line or say “Not specified”.

Use simple bullet points under **Next Steps**. Only list the top 2 or 3 programs — not all 5.
"""


In [27]:
# %%
# Generate GPT recommendation using chat completion
response = client.chat.completions.create(
    model="gpt-4-turbo",
    messages=[
        {"role": "system", "content": "You are an expert in funding opportunities."},
        {"role": "user", "content": llm_prompt}
    ]
)

In [28]:
# Print GPT recommendation output
print("\n🧾 GPT Recommendation:\n")
print(response.choices[0].message.content)


🧾 GPT Recommendation:

1. Promotion of interdisciplinary pilot projects on the topic of "Neurobiologically inspired artificial intelligence" (foerderdatenbank)  
**Why it fits**: Directly aligns with AI research with a neurobiological inspiration which could intersect with AI for robotics.  
**Description**: This program supports interdisciplinary research projects aiming to develop artificial intelligence inspired by neurobiological processes.  
**Domain**: Research & Innovation (topic-specific)  
**Eligibility**: Research institution, university, company  
**Amount**: Not specified  
**Deadline**: Not specified  
**Location**: Nationwide  
**Contact**: DLR Projektträger, Email: ge@dlr.de  
**Next Steps**:  
- Step 1: [Visit the official page](https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/BMBF/bmbf-neurobiologisch-inspirierte-ki.html)  
- Step 2: Review the program details and alignment with your company’s research objectives  
- Step 3: Prepare a detailed propo