# Download valid files from the github

In [8]:
import os
import re
import requests
import os
import json
import io
from docx import Document 
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import time
from typing import List
from pinecone import Pinecone, ServerlessSpec
from typing import List
import nbformat
from typing import List
from pathlib import Path
from pinecone import Pinecone, ServerlessSpec
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone_text.sparse import BM25Encoder
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
BM25_PATH  = "bm25_encoder.json"  # Path to save/load BM25 encoder state

In [7]:
SKIP_EXTENSIONS = {
    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp",
    ".ico", ".tiff", ".tif", ".mp3", ".mp4", ".wav", ".avi",
    ".mov", ".zip", ".tar", ".gz", ".rar", ".7z", ".pdf",
    ".exe", ".dll", ".so", ".csv", ".tsv"
}

def download_github_repo(repo_url, save_dir="data/repo_files"):
    # remove .git if present
    repo_url = repo_url.replace(".git", "")

    match = re.match(r"https://github.com/([^/]+)/([^/]+)", repo_url)
    if not match:
        raise ValueError("Invalid GitHub URL")

    user, repo = match.groups()
    api_url = f"https://api.github.com/repos/{user}/{repo}/contents"

    os.makedirs(save_dir, exist_ok=True)

    def download_recursive(api_path, local_path):
        response = requests.get(api_path)
        data = response.json()

        # handle API errors (rate limit, not found, etc)
        if isinstance(data, dict) and "message" in data:
            print("❌ GitHub API Error:", data["message"])
            return

        for item in data:
            name = item["name"]
            file_path = os.path.join(local_path, name)

            # skip unwanted folders
            if name.lower() in ["node_modules", ".git"]:
                continue

            if item["type"] == "dir":
                os.makedirs(file_path, exist_ok=True)
                download_recursive(item["url"], file_path)

            elif item["type"] == "file":
                ext = os.path.splitext(name)[1].lower()
                if ext in SKIP_EXTENSIONS:
                    continue

                file_data = requests.get(item["download_url"]).content
                with open(file_path, "wb") as f:
                    f.write(file_data)

                print("Downloaded:", file_path)

    download_recursive(api_url, save_dir)
    print("\n✔ Download complete!")

In [8]:
download_github_repo("https://github.com/Mageshwaran18/Music_Popularity_Prediction.git")

Downloaded: data/repo_files\Music_Popularity_Prediction.ipynb
Downloaded: data/repo_files\README.md
Downloaded: data/repo_files\requirements.txt

✔ Download complete!


# Convert those downloaded files to a common txt format


In [9]:
TEXT_EXTENSIONS = {".txt", ".py", ".md", ".json", ".csv", ".yaml", ".yml", ".html", ".js"}

def convert_repo_to_text(input_dir="data/repo_files", output_file="data/combined_repo.txt"):
    all_texts = []

    for root, dirs, files in os.walk(input_dir):
        for file in files:
            file_path = os.path.join(root, file)
            ext = os.path.splitext(file)[1].lower()

            try:
                if ext in TEXT_EXTENSIONS:
                    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                        content = f.read()
                        all_texts.append(f"\n\n===== FILE: {file} =====\n\n{content}")

                elif ext == ".ipynb":  # handle Jupyter notebooks
                    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                        nb = nbformat.read(f, as_version=4)
                        cells_text = []
                        for cell in nb.cells:
                            if cell.cell_type == "code" or cell.cell_type == "markdown":
                                cells_text.append(cell.source)
                        all_texts.append(f"\n\n===== FILE: {file} =====\n\n" + "\n\n".join(cells_text))

                else:
                    # skip binary files / unsupported files
                    continue

            except Exception as e:
                print(f"❌ Could not read {file_path}: {e}")

    # Write all collected text to a single txt file
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n\n".join(all_texts))

    print(f"\n✔ All files combined into: {output_file}")

In [10]:
convert_repo_to_text("data/repo_files", "data/combined_repo.txt")


✔ All files combined into: data/combined_repo.txt


# Flaten the text files


In [3]:
import os
import re

def flatten_single_file(input_file, output_file="data/flattened_repo.txt"):
    """
    Reads a single text file, flattens the text (removes extra newlines/whitespace),
    and writes the flattened text to output_file.
    """
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            text = f.read()

        # Flatten the text
        # 1. Replace multiple newlines with one
        # 2. Replace multiple spaces/tabs with one space
        flat_text = re.sub(r"\n+", "\n", text)
        flat_text = re.sub(r"[ \t]+", " ", flat_text)
        flat_text = flat_text.strip()

        with open(output_file, "w", encoding="utf-8") as out:
            out.write(flat_text)

        print(f"\n✔ Flattened file saved as: {output_file}")

    except Exception as e:
        print(f"❌ Error flattening file {input_file}: {e}")

In [4]:
flatten_single_file("data/combined_repo.txt", "data/flattened_repo.txt")


✔ Flattened file saved as: data/flattened_repo.txt


# Flaten txt to single line String

In [5]:
def convert_to_single_line(input_file="data/flattened_repo.txt"):
    """
    Reads a flattened text file and converts all contents into a single-line string.
    Returns the single-line string.
    """
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            text = f.read()

        # Convert to single line: replace newlines with space and remove extra spaces
        single_line = " ".join(text.split())

        return single_line

    except Exception as e:
        print(f"❌ Error processing file {input_file}: {e}")
        return ""


# Initilize pinecone


In [20]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = "rbi"
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [15]:

def initialize_pinecone():
    """
    Initialize Pinecone index for hybrid search.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)
    existing = [i.name for i in pc.list_indexes()]
    
    # Delete existing index if dimension mismatch
    if INDEX_NAME in existing:
        print(f"Deleting existing index: {INDEX_NAME}")
        pc.delete_index(INDEX_NAME)
        time.sleep(5)  # Wait for deletion to complete
    
    print(f"Creating Pinecone index: {INDEX_NAME}")
    pc.create_index(
        name=INDEX_NAME,
        dimension=384,  # all-MiniLM-L6-v2 dimension
        metric='dotproduct',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )
    time.sleep(30)
    
    return pc.Index(INDEX_NAME)

In [21]:
async def store_terms_to_pinecone(path="data/flattened_repo.txt"):
    print("Hi")
    combined_text = convert_to_single_line(path)
    print("\n✔ Converted text to single line string")

    # Split text into manageable chunks
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=50,
        length_function=len,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
    )

    chunks = splitter.split_text(combined_text)
    print(f"  Split into {len(chunks)} text chunks")

    # Initialize Pinecone with better waiting
    index = initialize_pinecone()

    # Initialize Embeddings and BM25
    print("\n  Setting up embedding + sparse encoders...")
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True}
    )
    
    bm25 = BM25Encoder().default()
    bm25.fit(chunks)
    bm25.dump(BM25_PATH)
    print("   ✓ BM25 parameters saved")

    # Create retriever and upload vectors
    retriever = PineconeHybridSearchRetriever(
        embeddings=embeddings,
        sparse_encoder=bm25,
        index=index
    )

    print("\n Uploading text chunks to Pinecone...")
    retriever.add_texts(chunks)
    print(f" Successfully stored {len(chunks)} chunks in Pinecone")
    
    config = {
        "index_name": INDEX_NAME,
        "chunk_count": len(chunks),
        "timestamp": time.time(),
        "initialized": True
    }

    print(config)
    return retriever



In [22]:
retriever = await store_terms_to_pinecone()

Hi

✔ Converted text to single line string
  Split into 37 text chunks
Deleting existing index: rbi
Creating Pinecone index: rbi

  Setting up embedding + sparse encoders...


100%|██████████| 37/37 [00:00<00:00, 1367.41it/s]


   ✓ BM25 parameters saved

 Uploading text chunks to Pinecone...


100%|██████████| 2/2 [00:04<00:00,  2.33s/it]

 Successfully stored 37 chunks in Pinecone
{'index_name': 'rbi', 'chunk_count': 37, 'timestamp': 1763363818.2685537, 'initialized': True}





In [24]:
llm = ChatGroq(
        model="llama-3.3-70b-versatile",
        temperature=0.2,
        max_tokens=1024,
        groq_api_key=GROQ_API_KEY
    )
    
    # Create prompt
prompt = ChatPromptTemplate.from_template("""
You are a senior software engineer conducting an interview. 
Use the context below (project code and documentation) to generate interview questions.

Context:
{context}

Instructions:
- Generate 10 meaningful technical questions about the project
- Questions must be specific to the code, design, architecture, or data structures
- Ask questions like:
    - Why did you choose this data structure or algorithm?
    - Why this design pattern or component was used?
    - Explain the purpose of a specific function/module
    - Trade-offs or alternatives in the code
- Make the questions suitable for a real-world technical interview
- Number the questions from 1 to 10
- Only use information present in the context

Output format:
1. Question 1
2. Question 2
...
10. Question 10
""")

# Create RAG chain
rag_chain = (
    {
        "context": retriever,          # Pass the chunks from vector DB
        "question": RunnablePassthrough()  # Not needed, but keeps API consistent
    }
    | prompt
    | llm
    | StrOutputParser()
)

# Invoke the chain
question_input = "Generate interview questions based on this project."
answer = rag_chain.invoke(question_input)

# Get source documents (chunks used for generating questions)
docs = retriever.invoke(question_input)

# Output the questions
print(answer)

1. What motivated the choice of XGBoost Regressor for modeling in this music popularity prediction project, and how does it compare to other regression algorithms like LightGBM, which is also included in the requirements?
2. Can you explain the reasoning behind removing rows with missing values during the preprocessing step, and are there any alternative strategies you would consider for handling missing data in this context?
3. How does the project's use of encoding for categorical columns impact the performance of the predictive model, and what encoding strategies were considered or implemented?
4. The project splits the data into training and testing sets; what considerations were taken into account when deciding on the split ratio, and how might this affect the model's generalizability?
5. The evaluation metrics mentioned include MSE, MAE, and R²; can you discuss why these specific metrics were chosen for assessing the model's performance, and are there any other metrics you would 