In [None]:
# %%
import os
import re
import pandas as pd
from dotenv import load_dotenv
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from ollama import Client
from dateutil import parser
from datetime import datetime

In [2]:
# %%
# Let python-dotenv populate the process environment, then read the Pinecone
# API key from it (the value is None when the variable is not set).
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")


In [3]:
# %%
# Load full funding dataset (CSV).
# The location can be overridden through the FUNDING_DATA_CSV environment variable
# (also via .env, which is loaded earlier) so the notebook is not tied to one
# machine. The original absolute path remains the default for backward compatibility.
FUNDING_DATA_CSV = os.getenv(
    "FUNDING_DATA_CSV",
    "/Users/kiranmulawad/AI-Funding/2_preprocessing/data/merged_funding_data.csv",
)
full_df = pd.read_csv(FUNDING_DATA_CSV)


In [4]:
# %%
# Connect to the Pinecone index and load the sentence-embedding model.
PINECONE_INDEX_NAME = "funding-search-bge"
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en"

pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)
model = SentenceTransformer(EMBEDDING_MODEL_NAME)


In [15]:
# %%
# Example company description used as the search query, and its embedding
# converted to a plain Python list for the index query.
query = "We are an AI company focused on AI for robotics. We are focusing on research right now."
query_vector = model.encode(query)
query_embedding = query_vector.tolist()


In [16]:
# Location of the user; passed to compute_relevance_score as `user_location`,
# where it is compared against each program's "location" field.
user_location = "Rhineland-Palatinate"

In [17]:
# %%
# Query both CSV + PDF-upload data: the same vector search is run once per namespace.
def _query_namespace(namespace):
    """Return the top-5 matches (with metadata) for `query_embedding` in `namespace`."""
    return index.query(
        vector=query_embedding,
        top_k=5,
        include_metadata=True,
        namespace=namespace,
    )


semantic_matches = _query_namespace("open-source-v1")
pdf_matches = _query_namespace("pdf-upload")


In [18]:
# %%
# Merge + deduplicate: concatenate both result lists and keep only the first
# match encountered for each (name, description) pair.
all_matches = semantic_matches["matches"] + pdf_matches["matches"]
seen = set()
unique_matches = []
for match in all_matches:
    metadata = match["metadata"]
    fingerprint = (metadata.get("name", ""), metadata.get("description", ""))
    if fingerprint in seen:
        continue
    unique_matches.append(match)
    seen.add(fingerprint)

print(f"✅ Total unique results: {len(unique_matches)}")


✅ Total unique results: 5


In [19]:
# %%
# Create DataFrame: one row per unique match, built from each match's metadata dict.
metadata_rows = [match["metadata"] for match in unique_matches]
results_df = pd.DataFrame(metadata_rows)

# Parse deadline and calculate days left
def safe_parse_deadline(deadline_str):
    """Best-effort conversion of a free-text deadline into a ``datetime``.

    The text is parsed with ``dateutil.parser.parse`` using ``dayfirst=True``
    (so "03.04.2025" is read as 3 April) and ``fuzzy=True`` (so surrounding
    words are ignored).

    Args:
        deadline_str: Deadline text from a program's metadata. Values that are
            not text (``None``, NaN, numbers) or that are blank are treated as
            "no deadline".

    Returns:
        The parsed ``datetime``, or ``None`` when no date can be extracted.
    """
    # Non-text and blank values can never yield a date; skip the parser entirely.
    if not isinstance(deadline_str, (str, bytes)) or not deadline_str.strip():
        return None
    try:
        return parser.parse(deadline_str, dayfirst=True, fuzzy=True)
    # Only the parser's own failure modes are swallowed (ParserError is a
    # ValueError subclass); unlike a bare ``except`` this lets KeyboardInterrupt
    # and genuine programming errors surface.
    except (ValueError, OverflowError, TypeError):
        return None

# Parse the free-text deadlines. A frame without a "deadline" column (e.g. no
# matches at all) simply gets an all-NaT column instead of raising KeyError.
if "deadline" in results_df.columns:
    parsed_deadlines = results_df["deadline"].apply(safe_parse_deadline)
else:
    parsed_deadlines = pd.Series(pd.NaT, index=results_df.index, dtype="datetime64[ns]")

# Force a datetime dtype: when every deadline is unparseable the column would
# otherwise be object dtype full of None and the `.dt` accessor below would fail.
results_df["deadline_date"] = pd.to_datetime(parsed_deadlines, errors="coerce")

# Count from the start of today rather than from the current time of day.
# Date-only deadlines parse to midnight, so subtracting "now" would give -1 days
# for a deadline that is today and wrongly drop it as expired.
today = pd.Timestamp.now().normalize()
results_df["days_left"] = (results_df["deadline_date"] - today).dt.days

# Keep programs that are still open; rows without a parseable deadline (NaN) are kept too.
results_df = results_df[results_df["days_left"].fillna(0) >= 0]


In [20]:
# Query words that are too generic to indicate relevance on their own.
_QUERY_STOPWORDS = frozenset({
    "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "in", "is",
    "it", "now", "of", "on", "or", "our", "that", "the", "this", "to", "we", "with",
})


def _largest_number(text):
    """Return the largest integer found in `text`, or None when it contains no digits.

    Digit groups joined by a thousands separator ("200.000", "200,000",
    "200 000", "1'000'000") are read as one number; everything else is read as
    separate numbers, so ranges and dates are not glued into one huge value.
    """
    tokens = re.findall(r"\d{1,3}(?:[.,' \xa0]\d{3})+(?!\d)|\d+", text)
    return max((int(re.sub(r"\D", "", token)) for token in tokens), default=None)


def compute_relevance_score(row, query, funding_need="200000", target_domain="AI", user_location="Rhineland-Palatinate"):
    """Score how well one funding program fits the user, as an integer percentage.

    Args:
        row: Mapping-like object with a ``.get`` method (dict or DataFrame row)
            holding the program's "domain", "amount", "deadline", "description"
            and "location" fields.
        query: Free-text description supplied by the user.
        funding_need: Minimum amount wanted; anything ``int()`` accepts.
        target_domain: Domain keyword that must appear as a whole word in "domain".
        user_location: Location text looked up (case-insensitively) in "location".

    Returns:
        ``round(score * 100)`` where the weights are 0.4 (domain), 0.3 (amount),
        0.2 (deadline), 0.1 (keyword) and 0.1 (location), i.e. 0 to 110.
    """
    score = 0

    # 🧠 Domain match — whole-word and case-insensitive, so "AI" no longer
    # matches inside unrelated words such as "Affairs" or "Training".
    domain_text = str(row.get("domain", ""))
    if target_domain and re.search(
        rf"(?<!\w){re.escape(target_domain)}(?!\w)", domain_text, re.IGNORECASE
    ):
        score += 0.4

    # 💰 Funding amount — compare the largest number mentioned, instead of the
    # concatenation of every digit in the field.
    try:
        amount_val = _largest_number(str(row.get("amount", "0")))
        if amount_val is not None and amount_val >= int(funding_need):
            score += 0.3
    except (TypeError, ValueError):
        # funding_need is not numeric: skip this criterion rather than fail.
        pass

    # 📅 Deadline relevance (NOTE: the year "2025" is hard-coded here).
    if "month" in str(row.get("deadline", "")).lower() or "2025" in str(row.get("deadline", "")):
        score += 0.2

    # 🔍 Keyword match in description — compare whole words with punctuation
    # stripped ("robotics." → "robotics") and ignore generic stop words, which
    # would otherwise match nearly every description.
    query_terms = set(re.findall(r"\w+", query.lower())) - _QUERY_STOPWORDS
    description_terms = set(re.findall(r"\w+", str(row.get("description", "")).lower()))
    if query_terms & description_terms:
        score += 0.1

    # 📍 Location boost (an empty user_location never counts as a match).
    if user_location and user_location.lower() in str(row.get("location", "")).lower():
        score += 0.1

    return round(score * 100)

# Score every program. Building the list explicitly also works for an empty
# frame, where DataFrame.apply(axis=1) would hand back a DataFrame, not a Series.
relevance_scores = [
    compute_relevance_score(row, query, user_location=user_location)
    for _, row in results_df.iterrows()
]
# assign() returns a new frame, so no column is written into the filtered slice
# produced by the deadline filter (avoids SettingWithCopyWarning).
results_df = results_df.assign(relevance_score=relevance_scores)

# Scores are coarse, so ties are common: a stable sort keeps tied programs in
# their incoming (semantic-search) order instead of an arbitrary one.
results_df = results_df.sort_values(by="relevance_score", ascending=False, kind="mergesort")

# Re-wrap each row in the {"metadata": ...} shape the formatting step expects.
matches = [{"metadata": record} for record in results_df.to_dict("records")]


In [21]:
# %%
# Format blocks for GPT
def _missing_to_none(value):
    """Map missing scalars (None, NaN, NaT, pd.NA) to None; return anything else unchanged."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


def _is_usable(value) -> bool:
    """True when `value` is non-empty and is not a "not found" placeholder."""
    return bool(value) and "not found" not in str(value).lower()


def generate_structured_funding_blocks(matches, user_query: str) -> str:
    """Render funding matches as numbered Markdown blocks.

    Args:
        matches: Iterable of dicts, each with a "metadata" mapping that may hold
            "name", "description", "domain", "eligibility", "amount",
            "deadline", "procedure", "location", "contact", "url", "source" and
            "days_left". Missing keys, None and NaN are all treated as absent.
        user_query: The user's text. If it mentions a field (via the aliases
            below) that a program lacks, the block says so explicitly.

    Returns:
        One Markdown block per match, joined by newlines ("" for no matches).
    """
    field_aliases = {
        "Amount": ["amount", "how much", "funding", "money"],
        "Deadline": ["deadline", "last date", "until", "submission date"],
        "Eligibility": ["eligible", "eligibility", "who can apply"],
        "Procedure": ["procedure", "how to apply", "application", "steps", "process"],
        "Contact": ["contact", "email", "person", "support"],
    }
    display_order = [
        "Description", "Domain", "Eligibility", "Amount", "Deadline",
        "Procedure", "Contact", "Location", "Source",
    ]

    # The query is the same for every match, so work out once which fields it asks about.
    query_lower = user_query.lower()
    requested_fields = {
        field
        for field, aliases in field_aliases.items()
        if any(alias in query_lower for alias in aliases)
    }

    formatted_blocks = []
    for idx, match in enumerate(matches, start=1):
        meta = match["metadata"]

        def field_value(key, default=None):
            # Metadata that went through a DataFrame carries NaN for absent
            # fields; NaN is truthy and would otherwise be printed as "nan".
            value = _missing_to_none(meta.get(key))
            return default if value is None else value

        name = field_value("name", "Unnamed")
        fields = {
            "Description": field_value("description"),
            "Domain": field_value("domain"),
            "Eligibility": field_value("eligibility"),
            "Amount": field_value("amount"),
            "Deadline": field_value("deadline"),
            "Procedure": field_value("procedure"),
            "Location": field_value("location"),
            "Contact": field_value("contact"),
            "URL": field_value("url"),
            "Source": field_value("source", "Unknown"),
        }

        # Fields the user asked about that this program does not provide.
        missing_fields = [
            key
            for key, value in fields.items()
            if key in requested_fields and not _is_usable(value)
        ]

        block = f"""**{idx}. {name}**\n"""
        for key in display_order:
            val = fields[key]
            if not _is_usable(val):
                continue

            if key == "Deadline":
                days_left = meta.get("days_left", None)
                if pd.notnull(days_left):
                    # f-string instead of `+=` so a non-string deadline cannot raise TypeError.
                    val = f"{val} (🕒 {int(days_left)} days left)"

            block += f"   - **{key}**: {val}\n"

        if missing_fields:
            block += f"   - *Couldn't trace information about {', '.join(missing_fields)}.*\n"

        if _is_usable(fields["URL"]):
            block += f"   - **For more information visit**: {fields['URL']}\n"

        formatted_blocks.append(block)

    return "\n".join(formatted_blocks)


In [22]:
# %%
# Render the ranked matches as Markdown blocks and show the result.
semantic_output = generate_structured_funding_blocks(matches, user_query=query)
print(semantic_output)


**1. Funding for projects on the topic of "Application of artificial intelligence (AI) in drug discovery"**
   - **Description**: Guideline for the funding of projects on the topic of "Application of artificial intelligence (AI) in drug discovery" dated: 19.02.2025 Federal Ministry of Education and Research BAnz AT 03.03.2025 B5
   - **Domain**: Research & Innovation (topic-specific), Health & Social Affairs
   - **Eligibility**: University, research institution, company, association/association
   - **Contact**: Name: Projektträger Jülich (PtJ), Email: ptj@fz-juelich.de, Phone: Tel: +49 2461 61-2626, Address: 52425 Jülich
   - **Location**: Nationwide
   - **Source**: foerderdatenbank
   - **For more information visit**: https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/BMBF/anwendung-ki-wirkstoffforschung.html

**2. Promotion of interdisciplinary pilot projects on the topic of "Neurobiologically inspired artificial intelligence"**
   - **Description**: Guideline for

In [23]:

# %%
# Prepare LLaMA prompt
# The number of programs listed in `semantic_output` varies (deduplication and
# the expired-deadline filter both change it), so the prompt states the actual
# count instead of a hard-coded "5".
program_count = len(matches)
llm_prompt = f"""
The company described itself as:

"{query}"

Here are the top {program_count} most relevant public funding programs in Germany, based on a semantic search match to their needs:

{semantic_output}

Now:

Please write a concise and professional recommendation containing **only the top 2–3 most relevant funding programs** in this format:

Only select the top programs that most directly match the company’s domain, maturity stage (e.g., early-stage research), or funding needs. Ignore entries that are vague or poorly aligned.

For each recommendation, follow this format exactly:

1. <Program Name>  
Why it fits: <1–2 lines explaining relevance to the company’s domain (or) industry (or) field of work>  
**Description**: <1–3 sentence summary of what the program funds and its focus>  
**Domain**: <Domain>  
**Eligibility**: <Eligibility>  
**Amount**: <Amount>  
**Deadline**: <Deadline>
**Location**: <Location>
**Contact**: <Contact person, email, or organization — not the URL>  
**Next Steps**:  
- Step 1: <Visit the official call page>  
- Step 2: <Provide one or two helpful next steps, e.g., form a consortium, prepare documents>  
- Step 3: (Optional) <Any extra steps from the program's procedure>

If any field like **Amount**, **Deadline**, **Eligibility**, **Procedure**, **Contact**, or **Domain** is missing, either omit the line or say “Not specified”.

Make sure **Next Steps** are formatted as real bullet points on new lines — not in a single line. Use simple and readable language. List only the top 2 or 3 matching programs — not the full list.
"""

In [24]:
# %%
# Run LLaMA: send the prompt to the Ollama server and wait for the complete
# (non-streamed) reply.
OLLAMA_HOST = "http://localhost:11434"
LLM_MODEL_NAME = "llama3.2"

client = Client(host=OLLAMA_HOST)
response = client.generate(model=LLM_MODEL_NAME, prompt=llm_prompt, stream=False)

In [25]:
# %%
# Display output: a header line followed by the text generated by the model.
recommendation_header = "\n🧾 LLaMA 3.2 Recommendation:\n"
print(recommendation_header)
print(response["response"])


🧾 LLaMA 3.2 Recommendation:

Here is a concise and professional recommendation containing the top 2-3 most relevant funding programs:

1. **Funding for projects on the topic of "Application of artificial intelligence (AI) in drug discovery"**
Why it fits: This program directly aligns with the company's domain of AI for robotics, specifically focusing on AI applications in a relevant field.
**Description**: The program funds research projects exploring the application of AI in drug discovery, aiming to develop innovative solutions for improving pharmaceutical development processes.
**Domain**: Research & Innovation (topic-specific), Health & Social Affairs
**Eligibility**: University, research institution, company, association/association
**Amount**: Not specified
**Deadline**: Not specified
**Location**: Nationwide
**Contact**: Projektträger Jülich (PtJ)
**Next Steps**
- Step 1: Visit the official call page https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/BMBF/anwen