In [None]:
# %%
# import sys
# !{sys.executable} -m pip install pinecone sentence-transformers python-dotenv pandas tqdm matplotlib

In [4]:
# %%
import os
import re
import pandas as pd
from dotenv import load_dotenv
from pinecone import Pinecone
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from ollama import Client

In [5]:
# %%
# Load environment variables
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [6]:
full_df = pd.read_csv("/Users/kiranmulawad/AI-Funding/2_preprocessing/data/merged_funding_data.csv")

In [7]:
# %%
# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("funding-search-bge")

In [8]:
# %%
# Sample user query
query = "We are an AI company focused on AI for robotics. We are focusing on research right now."

In [9]:
# %%
# Embed query using local BGE model
model = SentenceTransformer("BAAI/bge-small-en")
query_embedding = model.encode(query).tolist()

In [10]:
# %%
# Query Pinecone using the embedded query vector
semantic_matches = index.query(
    vector=query_embedding,
    top_k=5,
    include_metadata=True,
    namespace="open-source-v1"
)

In [None]:
# Query Pinecone for uploaded PDF chunks
pdf_matches = index.query(
    vector=query_embedding,
    top_k=5,
    include_metadata=True,
    namespace="pdf-upload"
)

In [None]:
# Merge all results
all_matches = semantic_matches["matches"] + pdf_matches["matches"]

# Deduplicate based on (name, description)
seen = set()
unique_matches = []
for m in all_matches:
    key = (m["metadata"].get("name", ""), m["metadata"].get("description", ""))
    if key not in seen:
        unique_matches.append(m)
        seen.add(key)

print(f"✅ Total unique results: {len(unique_matches)}")


In [55]:
import re

top_score = max(match["score"] for match in semantic_matches["matches"])
print(f"🔍 Top semantic score: {top_score:.4f}")

def hybrid_keyword_filter(df, query: str, top_k=5):
    keywords = set(re.findall(r'\b\w+\b', query.lower()))
    matched_rows = []
    for _, row in df.iterrows():
        combined_text = " ".join([
            str(row.get(col, "")).lower()
            for col in ["name", "description", "domain", "eligibility"]
        ])
        if any(word in combined_text for word in keywords):
            matched_rows.append(row)
    return pd.DataFrame(matched_rows).head(top_k)

# Switch between semantic and keyword fallback
if top_score < 0.6:
    print("⚠️ Low semantic match, switching to keyword fallback...")
    results_df = hybrid_keyword_filter(full_df, query)
else:
    results_df = pd.DataFrame([m["metadata"] for m in semantic_matches["matches"]])


🔍 Top semantic score: 0.7489


In [57]:
from dateutil.parser import parse
from datetime import datetime

# Safe parser for each deadline field
def safe_parse_deadline(deadline_str):
    try:
        return parse(deadline_str, dayfirst=True, fuzzy=True)
    except:
        return None  # Return None for anything unparseable

# Apply parsing
results_df["deadline_date"] = results_df["deadline"].apply(safe_parse_deadline)

# Calculate days left only if parsing succeeded
results_df["days_left"] = (results_df["deadline_date"] - datetime.now()).dt.days

In [58]:
results_df = results_df[results_df["days_left"].fillna(0) >= 0]

In [None]:
def compute_relevance_score(row, query, funding_need="200000", target_domain="AI"):
    score = 0
    if target_domain.lower() in str(row.get("domain", "")).lower():
        score += 0.4
    try:
        amount_val = int(re.sub(r"[^\d]", "", str(row.get("amount", "0"))))
        if amount_val >= int(funding_need):
            score += 0.3
    except:
        pass
    if "month" in str(row.get("deadline", "")).lower() or "2025" in str(row.get("deadline", "")):
        score += 0.2
    if any(word.lower() in str(row.get("description", "")).lower() for word in query.split()):
        score += 0.1
    return round(score * 100)

results_df = pd.DataFrame([m["metadata"] for m in unique_matches])

results_df["relevance_score"] = results_df.apply(lambda r: compute_relevance_score(r, query), axis=1)

results_df = results_df.sort_values(by="relevance_score", ascending=False)

matches = [{"metadata": r} for r in results_df.to_dict("records")]

In [74]:
# %%
# Format matches into readable funding blocks
def generate_structured_funding_blocks(matches, user_query: str) -> str:
    formatted_blocks = []

    field_aliases = {
        "Amount": ["amount", "how much", "funding", "money"],
        "Deadline": ["deadline", "last date", "until", "submission date"],
        "Eligibility": ["eligible", "eligibility", "who can apply"],
        "Procedure": ["procedure", "how to apply", "application", "steps", "process"],
        "Contact": ["contact", "email", "person", "support"],
    }

    for idx, match in enumerate(matches, start=1):
        meta = match
        name = meta.get("name", "Unnamed")

        fields = {
            "Description": meta.get("description"),
            "Domain": meta.get("domain"),
            "Eligibility": meta.get("eligibility"),
            "Amount": meta.get("amount"),
            "Deadline": meta.get("deadline"),
            "Procedure": meta.get("procedure"),
            "Contact": meta.get("contact"),
            "URL": meta.get("url"),
            "Source": meta.get("source", "Unknown")
        }

        missing_fields = []
        for key, value in fields.items():
            if key in field_aliases:
                if any(alias in user_query.lower() for alias in field_aliases[key]):
                    if not value or "not found" in str(value).lower():
                        missing_fields.append(key)

        block = f"""**{idx}. {name}**\n"""
        for key in ["Description", "Domain", "Eligibility", "Amount", "Deadline", "Procedure", "Contact", "Source"]:
            val = fields[key]

        
            if key == "Deadline":
                if val and "not found" not in str(val).lower():
                    days_left = meta.get("days_left", None)
                    if pd.notnull(days_left):
                        val += f" (🕒 {int(days_left)} days left)"
                else:
                    val = "Not specified"


            if val and "not found" not in str(val).lower():
                block += f"   - **{key}**: {val}\n"

        if missing_fields:
            block += f"   - *Couldn't trace information about {', '.join(missing_fields)}.*\n"

        if fields["URL"]:
            block += f"   - **For more information visit**: {fields['URL']}\n"

        formatted_blocks.append(block)

    return "\n".join(formatted_blocks)

In [75]:
semantic_output = generate_structured_funding_blocks(results_df.to_dict(orient="records"), query)
# Display the formatted output
print(semantic_output)

**1. Funding for projects on the topic of "Application of artificial intelligence (AI) in drug discovery"**
   - **Description**: Guideline for the funding of projects on the topic of "Application of artificial intelligence (AI) in drug discovery" dated: 19.02.2025 Federal Ministry of Education and Research BAnz AT 03.03.2025 B5
   - **Domain**: Research & Innovation (topic-specific), Health & Social Affairs
   - **Eligibility**: University, research institution, company, association/association
   - **Deadline**: Not specified
   - **Contact**: Name: Projektträger Jülich (PtJ), Email: ptj@fz-juelich.de, Phone: Tel: +49 2461 61-2626, Address: 52425 Jülich
   - **Source**: foerderdatenbank
   - **For more information visit**: https://www.foerderdatenbank.de/FDB/Content/DE/Foerderprogramm/Bund/BMBF/anwendung-ki-wirkstoffforschung.html

**2. Promotion of interdisciplinary pilot projects on the topic of "Neurobiologically inspired artificial intelligence"**
   - **Description**: Guideline 

In [53]:
# # %%
# semantic_output = generate_structured_funding_blocks(semantic_matches["matches"], query)
# print(semantic_output)

In [27]:
# # Save the prompt
# with open("llm_prompt_llama.txt", "w") as f:
#     f.write(llm_prompt)

In [28]:
# # Load the prompt
# with open("llm_prompt.txt", "r") as f:
#     llm_prompt = f.read()

In [29]:
# Prepare the LLM prompt
llm_prompt = f"""
The company described itself as:

"{query}"

Here are the top 5 most relevant public funding programs in Germany, based on a semantic search match to their needs:

{semantic_output}

Now:

Please write a concise and professional recommendation containing **only the top 2–3 most relevant funding programs** in this format:

Only select the top programs that most directly match the company’s domain, maturity stage (e.g., early-stage research), or funding needs. Ignore entries that are vague or poorly aligned.

For each recommendation, follow this format exactly:

1. <Program Name>  
Why it fits: <1–2 lines explaining relevance to the company’s domain (or) industry (or) field of work>  
**Description**: <1–3 sentence summary of what the program funds and its focus>  
**Domain**: <Domain>  
**Eligibility**: <Eligibility>  
**Amount**: <Amount>  
**Deadline**: <Deadline>  
**Contact**: <Contact person, email, or organization — not the URL>  
**Next Steps**:  
- Step 1: <Visit the official call page>  
- Step 2: <Provide one or two helpful next steps, e.g., form a consortium, prepare documents>  
- Step 3: (Optional) <Any extra steps from the program's procedure>

If any field like **Amount**, **Deadline**, **Eligibility**, **Procedure**, **Contact**, or **Domain** is missing, either omit the line or say “Not specified”.

Make sure **Next Steps** are formatted as real bullet points on new lines — not in a single line. Use simple and readable language. List only the top 2 or 3 matching programs — not all 5.
"""

In [30]:
# %%
# Run the prompt using LLaMA 3.2 via Ollama
client = Client(host="http://localhost:11434")

response = client.generate(
    model="llama3.2",
    prompt=llm_prompt,
    stream=False
)

In [31]:
# %%
# Output the final result
print("\n🧾 LLaMA 3.2 Recommendation:\n")
print(response['response'])


🧾 LLaMA 3.2 Recommendation:

Here are the top 2-3 most relevant funding programs for the AI company focused on AI for robotics:

1. MASTER 2nd Open Call
Why it fits: This program focuses on developing and validating innovative XR-based educational content for robotics training, which aligns with the company's domain of AI for robotics.
**Description**: MASTER supports projects that develop practical training scenarios for industrial robotics using XR technologies, making them accessible to students and specialists. 
**Domain**: VR/XR
**Eligibility**: Universities, research institutions, educational institutions, SMEs and large companies
**Amount**: Up to 100,000 euros
**Deadline**: June 12, 2025
**Contact**: Name: DLR Projektträger, Email: ge@dlr.de

Next Steps:
• Visit the official call page at https://www.master-xr.eu/open-calls/open-call-2/
• Form a consortium with relevant partners to develop XR-based educational content for robotics training.
• Prepare detailed project proposals 