# Download valid files from the github

In [7]:
import os
import re
import requests
import os
import json
import io
from docx import Document 
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import time
from typing import List
from pinecone import Pinecone, ServerlessSpec
from typing import List
import nbformat
from typing import List
from pathlib import Path
from pinecone import Pinecone, ServerlessSpec
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone_text.sparse import BM25Encoder
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
BM25_PATH  = "data/bm25_encoder.json"  # Path to save/load BM25 encoder state

In [4]:
SKIP_EXTENSIONS = {
    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp",
    ".ico", ".tiff", ".tif", ".mp3", ".mp4", ".wav", ".avi",
    ".mov", ".zip", ".tar", ".gz", ".rar", ".7z", ".pdf",
    ".exe", ".dll", ".so", ".csv", ".tsv"
}

def download_github_repo(repo_url, save_dir="data/repo_files"):
    # remove .git if present
    repo_url = repo_url.replace(".git", "")

    match = re.match(r"https://github.com/([^/]+)/([^/]+)", repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")

    user, repo = match.groups()
    api_url = f"https://api.github.com/repos/{user}/{repo}/contents"

    os.makedirs(save_dir, exist_ok=True)

    def download_recursive(api_path, local_path):
        response = requests.get(api_path)
        data = response.json()

        # handle API errors (rate limit, not found, etc)
        if isinstance(data, dict) and "message" in data:
            print("❌ GitHub API Error:", data["message"])
            return

        for item in data:
            name = item["name"]
            file_path = os.path.join(local_path, name)

            # skip unwanted folders
            if name.lower() in ["node_modules", ".git"]:
                continue

            if item["type"] == "dir":
                os.makedirs(file_path, exist_ok=True)
                download_recursive(item["url"], file_path)

            elif item["type"] == "file":
                ext = os.path.splitext(name)[1].lower()
                if ext in SKIP_EXTENSIONS:
                    continue

                file_data = requests.get(item["download_url"]).content
                with open(file_path, "wb") as f:
                    f.write(file_data)

                print("Downloaded:", file_path)

    download_recursive(api_url, save_dir)
    print("\n✔ Download complete!")

In [5]:
download_github_repo("https://github.com/Mageshwaran18/Micro_Doppler_Based_Target_Classification.git")

Downloaded: data/repo_files\.gitignore
Downloaded: data/repo_files\Basic_Data_Preprocessing.ipynb
Downloaded: data/repo_files\DataSet_ReadMe.txt
Downloaded: data/repo_files\Detailed End to End Documentation.md
Downloaded: data/repo_files\Micro-Doppler-Based-Target-Classification.pptx
Downloaded: data/repo_files\README.md
Downloaded: data/repo_files\app.py
Downloaded: data/repo_files\main.ipynb
Downloaded: data/repo_files\random_forest_model.pkl

✔ Download complete!


# Convert those downloaded files to a common txt format


In [6]:
TEXT_EXTENSIONS = {".txt", ".py", ".md", ".json", ".csv", ".yaml", ".yml", ".html", ".js"}

def convert_repo_to_text(input_dir="data/repo_files", output_file="data/combined_repo.txt"):
    all_texts = []

    for root, dirs, files in os.walk(input_dir):
        for file in files:
            file_path = os.path.join(root, file)
            ext = os.path.splitext(file)[1].lower()

            try:
                if ext in TEXT_EXTENSIONS:
                    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                        content = f.read()
                        all_texts.append(f"\n\n===== FILE: {file} =====\n\n{content}")

                elif ext == ".ipynb":  # handle Jupyter notebooks
                    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                        nb = nbformat.read(f, as_version=4)
                        cells_text = []
                        for cell in nb.cells:
                            if cell.cell_type == "code" or cell.cell_type == "markdown":
                                cells_text.append(cell.source)
                        all_texts.append(f"\n\n===== FILE: {file} =====\n\n" + "\n\n".join(cells_text))

                else:
                    # skip binary files / unsupported files
                    continue

            except Exception as e:
                print(f"❌ Could not read {file_path}: {e}")

    # Write all collected text to a single txt file
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n\n".join(all_texts))

    print(f"\n✔ All files combined into: {output_file}")

In [7]:
convert_repo_to_text("data/repo_files", "data/combined_repo.txt")


✔ All files combined into: data/combined_repo.txt


# Flaten the text files


In [None]:
def flatten_single_file(input_file, output_file="data/flattened_repo.txt"):
    """
    Reads a single text file, flattens the text (removes extra newlines/whitespace),
    and writes the flattened text to output_file.
    """
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            text = f.read()

        # Flatten the text
        # 1. Replace multiple newlines with one
        # 2. Replace multiple spaces/tabs with one space
        flat_text = re.sub(r"\n+", "\n", text)
        flat_text = re.sub(r"[ \t]+", " ", flat_text)
        flat_text = flat_text.strip()

        with open(output_file, "w", encoding="utf-8") as out:
            out.write(flat_text)

        print(f"\n✔ Flattened file saved as: {output_file}")

    except Exception as e:
        print(f"❌ Error flattening file {input_file}: {e}")

In [9]:
flatten_single_file("data/combined_repo.txt", "data/flattened_repo.txt")


✔ Flattened file saved as: data/flattened_repo.txt


# Flaten txt to single line String

In [10]:
def convert_to_single_line(input_file="data/flattened_repo.txt"):
    """
    Reads a flattened text file and converts all contents into a single-line string.
    Returns the single-line string.
    """
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            text = f.read()

        # Convert to single line: replace newlines with space and remove extra spaces
        single_line = " ".join(text.split())

        return single_line

    except Exception as e:
        print(f"❌ Error processing file {input_file}: {e}")
        return ""


# Initilize pinecone


In [9]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = "rbi"
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [12]:

def initialize_pinecone():
    """
    Initialize Pinecone index for hybrid search.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)
    existing = [i.name for i in pc.list_indexes()]
    
    # Delete existing index if dimension mismatch
    if INDEX_NAME in existing:
        print(f"Deleting existing index: {INDEX_NAME}")
        pc.delete_index(INDEX_NAME)
        time.sleep(5)  # Wait for deletion to complete
    
    print(f"Creating Pinecone index: {INDEX_NAME}")
    pc.create_index(
        name=INDEX_NAME,
        dimension=384,  # all-MiniLM-L6-v2 dimension
        metric='dotproduct',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )
    time.sleep(30)
    
    return pc.Index(INDEX_NAME)

In [13]:
async def store_terms_to_pinecone(path="data/flattened_repo.txt"):
    combined_text = convert_to_single_line(path)
    print("\n✔ Converted text to single line string")

    # Split text into manageable chunks
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=50,
        length_function=len,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
    )

    chunks = splitter.split_text(combined_text)
    print(f"  Split into {len(chunks)} text chunks")

    # Remove empty chunks
    chunks = [c.strip() for c in chunks if c.strip()]
    print(f"  Filtered to {len(chunks)} non-empty chunks")

    # Initialize Pinecone
    index = initialize_pinecone()

    # Embedding model
    print("\n  Setting up embedding + sparse encoders...")
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True}
    )

    # Initialize BM25 BEFORE using it
    bm25 = BM25Encoder().default()
    bm25.fit(chunks)
    bm25.dump(BM25_PATH)
    print("   ✓ BM25 parameters saved")

    # Create Hybrid retriever
    retriever = PineconeHybridSearchRetriever(
        embeddings=embeddings,
        sparse_encoder=bm25,
        index=index
    )

    # Filter chunks with empty sparse vectors
    print("\nFiltering chunks with empty BM25 vectors...")
    valid_chunks = []
    for c in chunks:
        sparse = bm25.encode_queries([c])[0]  # returns dict: {"indices":..., "values":...}

        # check if sparse vector has non-empty indices
        if sparse and len(sparse.get("indices", [])) > 0:
            valid_chunks.append(c)

    print(f"  {len(valid_chunks)} chunks have valid sparse representations")
    
    if not valid_chunks:
        raise ValueError("All chunks resulted in empty sparse vectors. Nothing to upload.")

    # Upload only valid chunks
    print("\n Uploading valid chunks to Pinecone...")
    retriever.add_texts(valid_chunks)
    print(f" Successfully stored {len(valid_chunks)} chunks in Pinecone")

    # Save config for debugging
    config = {
        "index_name": INDEX_NAME,
        "chunk_count": len(valid_chunks),
        "timestamp": time.time(),
        "initialized": True
    }

    print(config)
    return retriever


In [14]:
retriever = await store_terms_to_pinecone()


✔ Converted text to single line string
  Split into 170 text chunks
  Filtered to 170 non-empty chunks
Deleting existing index: rbi
Creating Pinecone index: rbi

  Setting up embedding + sparse encoders...


100%|██████████| 170/170 [00:00<00:00, 798.98it/s]


   ✓ BM25 parameters saved

Filtering chunks with empty BM25 vectors...
  169 chunks have valid sparse representations

 Uploading valid chunks to Pinecone...


100%|██████████| 6/6 [00:12<00:00,  2.06s/it]

 Successfully stored 169 chunks in Pinecone
{'index_name': 'rbi', 'chunk_count': 169, 'timestamp': 1763439949.949413, 'initialized': True}





# Rag chain building and Question generation


In [10]:
# LLM initialization (your config)
llm = ChatGroq(
        model="llama-3.3-70b-versatile",
        temperature=0.2,
        max_tokens=1024,
        groq_api_key=GROQ_API_KEY
    )

In [None]:
# Create prompt
prompt = ChatPromptTemplate.from_template("""
You are a senior software engineer conducting an interview. 
Use the context below (project code and documentation) to generate interview questions.

Context:
{context}

Instructions:
- Generate 3 meaningful technical questions about the project
- Questions must be specific to the code, design, architecture, or data structures
- A question should consist of only one question. Not a combined form of 2 or 3 question                                          
- Don't ask fundamental questions about the project                                        
- Ask questions like:
    - Why did you choose this data structure or algorithm?
    - Why this design pattern or component was used?
    - Explain the purpose of a specific function/module
    - Trade-offs or alternatives in the code
- Make the questions suitable for a real-world technical interview
- Number the questions from 1 to 3
- Only use information present in the context

JSON Output Format (STRICT):

{{
  "interview_questions": [
    {{
      "question_number": <number>,
      "question": "<text>"
    }}
  ]
}}

Additional Output Rules:
- Return ONLY JSON (no markdown, no explanation, no comments)
- Start question_number at 1 and increment to 5
- JSON must be valid and parsable
- No trailing commas, no extra text outside JSON


Return ONLY valid JSON in the exact format above.
                                          
""")

# Create RAG chain
rag_chain = (
    {
        "context": retriever,          # Pass the chunks from vector DB
        "question": RunnablePassthrough()  # Not needed, but keeps API consistent
    }
    | prompt
    | llm
    | StrOutputParser()
)

# Invoke the chain
question_input = "Generate interview questions based on this project."
answer = rag_chain.invoke(question_input)


# Store the questions in the json file
parsed = json.loads(answer)
with open("data/questions.json", "w", encoding="utf-8") as f:
    json.dump(parsed, f, indent=2)


# Get source documents (chunks used for generating questions)
docs = retriever.invoke(question_input)

# Output the questions
print(answer)

{
  "interview_questions": [
    {
      "question_number": 1,
      "question": "Why was the Continuous Wavelet Transform (CWT) chosen for feature extraction in the model building and training process?"
    },
    {
      "question_number": 2,
      "question": "What was the reasoning behind using a train-test split in the model training process, and what split ratio was used?"
    },
    {
      "question_number": 3,
      "question": "Why was seaborn's heatmap chosen for visualizing the Confusion Matrix, and what benefits does it provide over other visualization methods?"
    }
  ]
}


# Asking the question to the user

In [None]:
def conduct_interview_and_save():
    """Reads questions.json from data/, asks each question, 
    collects user answers, and writes evaluation.json."""
    
    input_path = os.path.join("data", "questions.json")
    output_path = os.path.join("data", "evaluation.json")

    # --- Load questions.json ---
    if not os.path.exists(input_path):
        raise FileNotFoundError("questions.json not found in /data directory")

    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    questions = data.get("interview_questions", [])
    if not questions:
        raise ValueError("No questions found in questions.json")

    # --- Collect answers from user ---
    evaluation_output = {"evaluation": []}

    print("\nStarting Interview...\n")

    for q in questions:
        print(f"Q{q['question_number']}: {q['question']}")
        answer = input("Your Answer: ")
        print()

        evaluation_output["evaluation"].append({
            "question_number": q["question_number"],
            "question": q["question"],
            "user_answer": answer
        })

    # --- Save evaluation.json ---
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(evaluation_output, f, indent=4)

    print(f"Interview completed successfully!")
    print(f"Responses saved to {output_path}")


In [3]:
conduct_interview_and_save()


Starting Interview...

Q1: Why was the Continuous Wavelet Transform (CWT) chosen for feature extraction in the model building and training process?

Q2: What was the reasoning behind using a train-test split in the model training process, and what split ratio was used?

Q3: Why was seaborn's heatmap chosen for visualizing the Confusion Matrix, and what benefits does it provide over other visualization methods?

Interview completed successfully!
Responses saved to data\evaluation.json


# Evaluate the user's answer

In [11]:
def evaluate_answers_with_llm():
    input_path = os.path.join("data", "evaluation.json")
    output_path = os.path.join("data", "final_output.json")

    # Load evaluation.json
    if not os.path.exists(input_path):
        raise FileNotFoundError("evaluation.json not found inside /data directory")

    with open(input_path, "r", encoding="utf-8") as f:
        evaluation_data = json.load(f)

    questions_list = evaluation_data.get("evaluation", [])
    if not questions_list:
        raise ValueError("No evaluation data found in evaluation.json")

    final_output = {"final_evaluation": []}

    # Process each question through LLM
    for item in questions_list:
        q_number = item["question_number"]
        question = item["question"]
        user_answer = item["user_answer"]

        prompt = f"""
You are an evaluator. 
Evaluate the user's answer to the interview question.

Question:
{question}

User Answer:
{user_answer}

Give marks out of 5. 
Only respond in the following JSON format:

{{
  "marks": <number 0-5>,
  "justification": "<short explanation>"
}}
"""

        response = llm.invoke(prompt)
        llm_output = response.content.strip()

        try:
            parsed = json.loads(llm_output)
        except:
            parsed = {"marks": 0, "justification": "LLM response not valid JSON."}

        final_output["final_evaluation"].append({
            "question_number": q_number,
            "question": question,
            "user_answer": user_answer,
            "marks": parsed.get("marks", 0),
            "justification": parsed.get("justification", "")
        })

    # Save final_output.json
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_output, f, indent=4)

    print("✔ Evaluation complete.")
    print(f"Saved to {output_path}")

    return final_output

In [14]:
evaluate_answers_with_llm()

✔ Evaluation complete.
Saved to data\final_output.json


{'final_evaluation': [{'question_number': 1,
   'question': 'Why was the Continuous Wavelet Transform (CWT) chosen for feature extraction in the model building and training process?',
   'user_answer': 'The Continuous Wavelet Transform (CWT) was chosen because it provides a rich time–frequency representation, allowing both temporal and spectral patterns to be captured simultaneously. This makes it ideal for analyzing non-stationary signals and improves the quality of extracted features for the model.',
   'marks': 5,
   'justification': 'The user provided a clear and accurate explanation of the benefits of using CWT for feature extraction, highlighting its ability to capture both temporal and spectral patterns in non-stationary signals.'},
  {'question_number': 2,
   'question': 'What was the reasoning behind using a train-test split in the model training process, and what split ratio was used?',
   'user_answer': 'A train-test split wasn’t necessary because the model can evaluate itse