In [None]:
!pip install langchain langchain-openai python-dotenv requests gradio -q

In [None]:
from google.colab import userdata
# Endpoint and credentials for the Nebius-hosted, OpenAI-compatible API.
# All secrets are read through `userdata.get`; nothing is hard-coded here.
NEBIUS_BASE = "https://api.studio.nebius.com/v1"
NEBIUS_KEY = userdata.get("NEBIUS_API_KEY")
OPENAI_KEY = userdata.get("OPENAI_API_KEY")  # NOTE(review): not referenced in the visible cells

# GitHub token and the header dict passed along with the GitHub API requests.
GIT_API_KEY = userdata.get("GIT_API_KEY")
HEADERS = dict(Authorization=f"token {GIT_API_KEY}")

In [None]:
from langchain_openai import ChatOpenAI

# Chat model served through the Nebius endpoint. This instance is the one handed
# to the repository comparison/summarization step.
llm = ChatOpenAI(
    api_key=NEBIUS_KEY,
    base_url=NEBIUS_BASE,
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    max_tokens=1024,
    temperature=0.6,
)
print("LLM Ready")

LLM Ready


In [None]:
from openai import OpenAI
# OpenAI-SDK client bound to the Nebius endpoint; used by the query-building prompts.
client = OpenAI(base_url=NEBIUS_BASE, api_key=NEBIUS_KEY)

def classify_domain(user_input):
    """Ask the LLM which technical domain a user query belongs to.

    Args:
        user_input: Free-text description of what the user is looking for.

    Returns:
        The model's answer, lowercased, with surrounding whitespace, quotes,
        backticks and periods removed. The prompt asks for one domain from a
        fixed list, but the answer is not validated against that list.
        Returns an empty string when the response has no choices or no text.
    """
    prompt = f"""
    Identify the technical domain of the following user query.

    Possible domains:
    - web development
    - backend engineering
    - frontend engineering
    - networking
    - data science
    - machine learning
    - artificial intelligence
    - algorithms
    - operating systems
    - cybersecurity
    - devops
    - mobile development
    - databases
    - cloud computing
    - robotics
    - electronics
    - embedded systems
    - computer vision
    - software architecture
    - APIs and protocols

    User Input: "{user_input}"

    Respond with ONE domain only.
    """

    res = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        messages=[{"role": "user", "content": prompt}]
    )

    # An empty `choices` list or a `None` message content would otherwise raise
    # on indexing / `.strip()`; treat both as "no answer".
    if not res.choices:
        return ""
    answer = res.choices[0].message.content or ""

    # The model may wrap the label in quotes or end it with a period; drop that
    # decoration so the bare label is returned.
    return answer.strip().strip("\"'`.").strip().lower()

In [None]:
def technical_query(user_input, domain):
    """Rewrite free-form user text as a short, domain-flavoured search phrase.

    Args:
        user_input: Free-text description of what the user is looking for.
        domain: Technical domain label inserted into the prompt.

    Returns:
        The model's answer with surrounding whitespace, quotes, backticks and
        periods removed. Returns an empty string when the response has no
        choices or no text.
    """
    # The word-count range uses a plain ASCII hyphen: the previous en dash had
    # been corrupted into mojibake inside the prompt text.
    prompt = f"""
    Convert the user input into a concise technical query (3-5 words)
    using domain-specific terminology.

    Domain: {domain}

    RULES:
    - Output must be a technical noun phrase.
    - Do NOT produce a sentence.
    - Do NOT format the ouput in any way. Return plain text.
    - Use domain-specific words like:
        architecture, pipeline, connectivity,
        interfacing, synchronization, protocols, binding,
        backend, frontend, rendering, compute, framework.

    User Input: "{user_input}"

    Technical Query:
    """

    res = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        messages=[{"role": "user", "content": prompt}]
    )

    # An empty `choices` list or a `None` message content would otherwise raise
    # on indexing / `.strip()`; treat both as "no answer".
    if not res.choices:
        return ""
    answer = res.choices[0].message.content or ""

    # Remove wrapping quotes/backticks and a trailing period so the phrase can be
    # used as-is as a search term.
    return answer.strip().strip("\"'`.").strip()

In [None]:
def make_query(user_input):
    """Turn free-form user text into a short technical search phrase.

    The text is classified into a domain first; that domain is then passed,
    together with the original text, to the query rewriter.
    """
    return technical_query(user_input, classify_domain(user_input))

In [None]:
import requests

# Tuning knobs for the GitHub repository search.
max_results = 5  # number of results requested per page
sort = "stars"   # intended ranking field; NOTE(review): confirm the search call actually reads it
forks = 0        # lower bound used in the `forks:>N` search qualifier

def scraped_data(query):
    """Run a GitHub repository search and return the decoded JSON response.

    Args:
        query: Search phrase. It is combined with a `forks:>N` qualifier built
            from the module-level `forks` setting.

    Returns:
        The parsed JSON body of the response, returned as-is without checking
        the HTTP status.
    """
    # Passing the values through `params` lets requests URL-encode the query,
    # so spaces, '&', '#' and similar characters in the phrase cannot corrupt
    # the request the way direct string interpolation could.
    params = {
        "q": f"{query} forks:>{forks}",
        "sort": sort,  # honour the module-level setting instead of a hard-coded value
        "order": "desc",
        "per_page": max_results,
        "page": 1,
    }

    res = requests.get(
        "https://api.github.com/search/repositories",
        params=params,
        headers=HEADERS,
        timeout=10,  # without a timeout a stalled connection blocks the notebook indefinitely
    )
    return res.json()

In [None]:
def fetch_repo_urls(query):
    """Return the `html_url` of every repository found for `query`.

    Args:
        query: Search phrase forwarded to `scraped_data`.

    Returns:
        A list of repository page URLs in the order the search returned them.
        The list is empty when the response has no usable "items" entry.
    """
    data = scraped_data(query)

    # A response without "items" (or with a null value), such as an error
    # payload, previously caused a TypeError when iterated.
    items = data.get("items") or []
    return [item["html_url"] for item in items]

In [None]:
def fetch_repo_data(repo_url):
    """
    Fetch repository metadata and a README excerpt from the GitHub API.

    Args:
        repo_url: Repository URL whose last two path segments are the owner and
            the repository name; surrounding whitespace and a trailing slash
            are ignored.

    Returns:
        A `(compiled_data, error)` tuple. On success `compiled_data` is a dict
        with the keys name, description, language, stars, forks, topics and
        readme_snippet (at most 2000 characters) and `error` is None. On
        failure `compiled_data` is None and `error` is a message string.
    """
    import base64

    try:
        parts = repo_url.strip().rstrip('/').split('/')
        owner = parts[-2]
        repo_name = parts[-1]
        api_url = f"https://api.github.com/repos/{owner}/{repo_name}"

        # Timeouts keep one stalled request from hanging the whole notebook.
        response = requests.get(api_url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            return None, f"Error: Unable to fetch repository data (Status {response.status_code})"
        repo_data = response.json()

        # The README is optional: any non-200 answer simply leaves the snippet empty.
        readme_content = ""
        readme_response = requests.get(f"{api_url}/readme", headers=HEADERS, timeout=10)
        if readme_response.status_code == 200:
            encoded = readme_response.json().get("content") or ""
            # errors="replace" keeps a README with stray non-UTF-8 bytes from
            # discarding the entire repository via the except branch below.
            readme_content = base64.b64decode(encoded).decode("utf-8", errors="replace")[:2000]

        # `dict.get(key, default)` only applies the default when the key is
        # absent; a JSON null arrives as None, so `or` is used to cover both.
        compiled_data = {
            "name": repo_data.get("name") or "N/A",
            "description": repo_data.get("description") or "N/A",
            "language": repo_data.get("language") or "N/A",
            "stars": repo_data.get("stargazers_count") or 0,
            "forks": repo_data.get("forks_count") or 0,
            "topics": repo_data.get("topics") or [],
            "readme_snippet": readme_content,
        }
        return compiled_data, None
    except Exception as e:
        # Deliberately broad: callers receive an error string instead of an exception.
        return None, f"Error parsing repository: {str(e)}"

In [None]:
def list_repo_data(repo_urls):
    """Fetch the compiled data for each repository URL.

    Args:
        repo_urls: Iterable of repository URLs passed one by one to
            `fetch_repo_data`.

    Returns:
        A list of repository dicts in input order. Repositories whose fetch
        failed (data part of the result is None) are left out, so consumers
        never receive None entries.
    """
    all_repos_data = []  # list of dicts

    for repo_url in repo_urls:
        repo_data, _error = fetch_repo_data(repo_url)
        # Previously the None from a failed fetch was appended and crashed
        # downstream code that indexes into each entry.
        if repo_data is not None:
            all_repos_data.append(repo_data)

    return all_repos_data

In [None]:

from langchain.prompts import PromptTemplate
import json

def filter_and_summarize_repos(llm, user_prompt, repo_data_list):
    """Ask the LLM to pick and summarize the most relevant repository.

    Args:
        llm: Object exposing `invoke(text)` whose result has a `.content` attribute.
        user_prompt: The user's original request, inserted verbatim into the prompt.
        repo_data_list: Repository dicts. The README excerpt is read from
            `readme_snippet`, falling back to `readme`. Empty or None entries
            are skipped.

    Returns:
        The `.content` of the LLM response.
    """
    sections = []
    for repo in repo_data_list:
        # Entries for repositories that could not be fetched may be None.
        if not repo:
            continue
        # The README used to be looked up only under "readme", so an excerpt
        # stored under "readme_snippet" never reached the prompt.
        readme = repo.get('readme_snippet') or repo.get('readme') or ''
        sections.append(
            f"Repo:\nName: {repo.get('name', 'N/A')}\n"
            f"Description: {repo.get('description') or ''}\n"
            f"Stars: {repo.get('stars', 0)}\n"
            f"Language: {repo.get('language') or ''}\n"
            f"README (truncated):\n{readme[:1000]}\n\n"
        )
    context = "".join(sections)

    template = PromptTemplate(
        input_variables=["user_prompt", "context"],
        template="""You are an expert assistant comparing GitHub repositories for a user project.

User's request:
{user_prompt}

Below are summaries of a few repositories. Analyze and answer:

1. Which repository is MOST relevant to the user's goal (dont refer to it by number)?
2. Why? Give a short justification.
3. Provide a short summary of that repo:
  a. Provide a clear, concise summary of the repository
  b. Identify the main purpose and technology stack
  c. Highlight notable features or patterns
  d. Give instructions on how to use the repository
Be specific, helpful, and constructive.

Repositories data:
{context}
""",
    )

    prompt_text = template.format(user_prompt=user_prompt, context=context)
    response = llm.invoke(prompt_text)

    return response.content


In [None]:
def gradio_wrapper(user_input):
    """Run the whole search-and-summarize pipeline for one user request.

    Steps: build a search phrase from the request, search for repositories,
    fetch their data, and ask the LLM for a comparison and summary.

    Args:
        user_input: Text typed by the user.

    Returns:
        The LLM's answer, or a short explanatory message when the input is
        blank or no repository data could be collected.
    """
    if not user_input or not user_input.strip():
        return "Please describe what you are looking for."

    # The search phrase must be derived from the request here; the function
    # previously read an undefined name `query` and failed on a fresh kernel.
    query = make_query(user_input)
    repo_urls = fetch_repo_urls(query)

    # Drop entries for repositories whose data could not be fetched.
    all_repos_data = [repo for repo in list_repo_data(repo_urls) if repo]
    if not all_repos_data:
        return f"No repositories found for the query: {query}"

    return filter_and_summarize_repos(llm, user_input, all_repos_data)

In [None]:
import gradio as gr
# Single-page UI: a heading, a prompt box, a trigger button and a Markdown pane
# that shows whatever `gradio_wrapper` returns.
with gr.Blocks() as demo:
    gr.Markdown(value="""
<h1 style='text-align:center;'>
GitGud
</h1>""")
    link = gr.Textbox(label="What can I help you with?")
    btn = gr.Button(value="Analyze")
    output = gr.Markdown()
    # Clicking the button passes the textbox contents to the pipeline and
    # renders the returned text in the Markdown pane.
    btn.click(fn=gradio_wrapper, inputs=link, outputs=output)
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c7427195cc44f6f963.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


