In [4]:
import pandas as pd
import re
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

def fetch_interview_links(company_to_filter, role_to_filter, pages_to_scrape):
    """
    Opens a browser, filters for company and role, and collects interview links.
    Returns a list of dictionaries, e.g., [{'title': '...', 'url': '...'}].
    """
    print("--- Step 1: Fetching interview links ---")
    target_url = "https://www.naukri.com/code360/interview-experiences"
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    wait = WebDriverWait(driver, 15)
    all_results = []

    try:
        driver.get(target_url)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "codingninjas-interview-experience-card-v2")))

        # --- Company Filter ---
        print(f"Filtering for company: {company_to_filter}...")
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#right-section-container codingninjas-ie-company-dropdown-widget > div"))).click()
        comp_input = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[placeholder='Search']")))
        comp_input.send_keys(company_to_filter)
        time.sleep(2)
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "mat-radio-button.mat-radio-button"))).click()
        time.sleep(1)

        # --- Role Filter ---
        print(f"Filtering for role: {role_to_filter}...")
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#right-section-container codingninjas-ie-roles-dropdown-widget:nth-child(2) > div"))).click()
        role_input = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "codingninjas-ie-roles-dropdown-widget input[placeholder='Search']")))
        role_input.send_keys(role_to_filter)
        time.sleep(2)
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "codingninjas-ie-roles-dropdown-widget mat-checkbox"))).click()
        time.sleep(1)

        # --- Newest Filter---
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,
        "#right-section-container .desktop-sort.newest"))).click()
        time.sleep(1)
        
        # --- Pagination and Link Collection ---
        for page in range(1, pages_to_scrape + 1):
            print(f"Collecting links from page {page}...")
            time.sleep(2)
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.interview-experiences-list-section.ng-star-inserted")))
            cards = driver.find_elements(By.TAG_NAME, "codingninjas-interview-experience-card-v2")
            for card in cards:
                try:
                    anchor = card.find_element(By.CSS_SELECTOR, "a.interview-exp-title")
                    href = anchor.get_attribute("href")
                    text = anchor.text.strip()
                    if href and text:
                        all_results.append({"title": text, "url": href})
                except NoSuchElementException:
                    continue
            
            if page >= pages_to_scrape:
                break
            
            try:
                next_page_link = wait.until(EC.element_to_be_clickable((By.XPATH, f"//codingninjas-page-nav-v2//a[normalize-space(text())='{page + 1}']")))
                driver.execute_script("arguments[0].click();", next_page_link)
            except TimeoutException:
                print(f"Could not find link for page {page + 1}. Stopping link collection.")
                break
            
    except Exception as e:
        print(f"An error occurred while fetching links: {e}")
    finally:
        driver.quit()
        print(f"Found {len(all_results)} links to scrape.")
        return all_results
    
def scrape_interview_details(url):
    driver = None
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless=new')  # Use new headless mode
        options.add_argument('--window-size=1920,1080')
        options.add_argument('--log-level=3')
        options.add_argument('user-agent=Mozilla/5.0')
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        driver.get(url)
        time.sleep(5)

        parts = []
        try:
            # Expand journey section
            try:
                btn = driver.find_element(By.CSS_SELECTOR, "#continue-reading-ie-cta-container button")
                driver.execute_script("arguments[0].click();", btn)
                time.sleep(1)
            except:
                pass

            journey = driver.find_element(By.CSS_SELECTOR, "#ie-overall-user-experience").text.strip()
            if journey:
                parts.append("## Interview Preparation Journey\n" + journey)
        except NoSuchElementException:
            pass

        # Extract rounds
        round_index = 1
        rounds_found = False
        while True:
            try:
                round_id = f"interview-round-v2-{round_index}"
                round_container = driver.find_element(By.ID, round_id)
                round_text = round_container.text.strip()

                if not rounds_found:
                    parts.append("\n\n## Interview Rounds")
                    rounds_found = True

                # Extract problem links by clicking
                links = []
                try:
                    problems = round_container.find_elements(By.CSS_SELECTOR, "codingninjas-interview-round-problem")
                    for prob in problems:
                        try:
                            link_btn = prob.find_element(By.CSS_SELECTOR, ".try-now-solve-later-container a")
                            original_window = driver.current_window_handle
                            driver.execute_script("arguments[0].click();", link_btn)
                            time.sleep(2)

                            WebDriverWait(driver, 5).until(EC.number_of_windows_to_be(2))
                            new_window = [w for w in driver.window_handles if w != original_window][0]
                            driver.switch_to.window(new_window)
                            time.sleep(2)

                            current_url = driver.current_url
                            links.append(current_url)

                            driver.close()
                            driver.switch_to.window(original_window)

                        except Exception as e:
                            links.append("null")
                except:
                    links.append("null")

                safe_links_string = ", ".join(link for link in links if link and link != "null")
                round_text += f"\n\n🔗 Problem Links: {safe_links_string if safe_links_string else 'null'}"

                parts.append(f"\n\n### Round {round_index}\n{round_text}")
                round_index += 1

            except NoSuchElementException:
                break

        # Fallback
        if not parts:
            try:
                content = driver.find_element(By.CSS_SELECTOR, "div.blog-body-content").text.strip()
                if content:
                    parts.append(content)
            except:
                return None

        return "\n".join(parts)

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None
    finally:
        if driver:
            driver.quit()

def main():
    """
    Main function to get user input, fetch links, scrape details, and return a DataFrame.
    """
    # --- Get User Input ---
    company_to_filter = input("Enter the company name to search for: ").strip()
    role_to_filter_input = input("Enter the role name (e.g., SDE 1, SDE-2): ").strip()
    
    # Normalize role input for better matching
    role_to_filter = re.sub(r'\s*-\s*', ' - ', role_to_filter_input)
    role_to_filter = " ".join(role_to_filter.split()).upper()

    try:
        pages_to_scrape = int(input("How many pages of results do you want to scrape? "))
    except ValueError:
        print("Invalid number. Defaulting to 1 page.")
        pages_to_scrape = 1

    # --- Step 1: Fetch all the links first ---
    links_to_process = fetch_interview_links(company_to_filter, role_to_filter, pages_to_scrape)

    if not links_to_process:
        print("No interview links found for the given criteria. Exiting.")
        return None # Return None if no links were found

    # --- Step 2: Scrape the details from each link ---
    print("\n--- Step 2: Scraping details from each link ---")
    scraped_data = []
    total_links = len(links_to_process)
    for i, item in enumerate(links_to_process):
        url = item.get('URL') or item.get('url')
        title = item.get('Title') or item.get('title')
        
        print(f"Scraping link {i + 1}/{total_links}: {title}...")
        description = scrape_interview_details(url)

        if description:
            try:
                parts = [part.strip() for part in title.split('|')]
                company = parts[0]
                role = parts[1]
            except (IndexError, AttributeError):
                company = company_to_filter
                role = role_to_filter_input
            
            scraped_data.append({"company": company, "role": role, "description": description})
            print(f"  -> Success.")
        else:
            print(f"  -> Failed to retrieve data.")

    # --- Step 3: Create and return the pandas DataFrame ---
    if scraped_data:
        print("\n✅ All done! Creating final pandas DataFrame.")
        # Create the DataFrame from the collected data
        output_df = pd.DataFrame(scraped_data)
        # Return the DataFrame so it can be used by other parts of your application
        return output_df
    else:
        print("\nNo data was successfully scraped.")
        return None

if __name__ == "__main__":
    # The main() function is called, and its return value (the DataFrame) is stored
    final_dataframe = main()
    
    # Check if the DataFrame was created successfully and print it
    if final_dataframe is not None:
        print("\n--- Final DataFrame ---")
        print(final_dataframe)

--- Step 1: Fetching interview links ---
Filtering for company: oracle...
Filtering for role: SDE - 1...
Collecting links from page 1...
Found 6 links to scrape.

--- Step 2: Scraping details from each link ---
Scraping link 1/6: Oracle | SDE - 1 | Experienced | Mar 2023...
  -> Success.
Scraping link 2/6: Oracle | SDE - 1 | Experienced | Mar 2023...
  -> Success.
Scraping link 3/6: Oracle | SDE - 1 | Experienced | Oct 2022...
  -> Success.
Scraping link 4/6: Oracle | SDE - 1 | Fresher | Sep 2022...
  -> Success.
Scraping link 5/6: Oracle | SDE - 1 | Experienced | Sep 2022...
  -> Success.
Scraping link 6/6: Oracle | SDE - 1 | Fresher | Aug 2022...
  -> Success.

✅ All done! Creating final pandas DataFrame.

--- Final DataFrame ---
  company     role                                        description
0  Oracle  SDE - 1  ## Interview Preparation Journey\nInterview pr...
1  Oracle  SDE - 1  ## Interview Preparation Journey\nInterview pr...
2  Oracle  SDE - 1  ## Interview Preparation Jou

In [5]:
print(final_dataframe['description'][1])

## Interview Preparation Journey
Interview preparation journey
Journey
Hi All, I'm Sanjay Sharma, I would like to share my journey. So i was looking for switch the jobs, so i've applied from linkedin, naukri for different different companies. But i didn't get any response, later I found there was my friend who is currently working in Oracle, so I applied there from his reference, and i got interview call by them, there were 4 round of interview.
Application story
In start i applied from many online portals like naukri, linkedin and other also many sites but from this i didn't get any response, later I found that one of my friend working in Oracle so i applied through his referral.
Why selected/rejected for the role?
I've given all 4 round of interviews, but 3 round coding round i found little bit difficult compare to others
Preparation
Duration: 3 months
Topics: Data structures and algorithms,System design,DBMS,Operating system
Tip
Tip 1 : solve maximum coding questions
Tip 2 : develop

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import pandas as pd
import re

def preprocess_text(text: str) -> str:
    text = text.replace('\r\n', '\n').replace('\r', '\n')  # Normalize newlines

    headers = [
        r"(## Interview Preparation Journey)", r"(## Interview Rounds)",
        r"(### Round \d+)", r"(🔗 Problem Links:)",
        r"(Tip \d+:)", r"(Resume tip)", r"(Application story)", r"(Journey)",
        r"(Why selected/rejected for the role\?)"
    ]
    for h in headers:
        text = re.sub(h, r"\n\n\1", text)

    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)  # Merge lines within paragraphs
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def chunk_dataframe_descriptions(df: pd.DataFrame, text_col: str = "description") -> list[Document]:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", " ", ""]
    )

    all_chunks = []

    for idx, row in df.iterrows():
        raw_text = row[text_col]
        if not isinstance(raw_text, str) or not raw_text.strip():
            continue

        preprocessed = preprocess_text(raw_text)
        chunks = splitter.create_documents([preprocessed], metadatas=[{"source_id": idx}])
        all_chunks.extend(chunks)

    return all_chunks



In [225]:
final_chunks = chunk_dataframe_descriptions(final_dataframe, text_col="description")

In [142]:
import pandas as pd
import re
from langchain.docstore.document import Document

def insert_intro_line(chunk: str) -> str:
    """
    Inserts an emphasis sentence after the ### Round X heading.
    """
    match = re.match(r'(### Round\s+(\d+))', chunk)
    if match:
        heading = match.group(1)
        round_num = match.group(2)
        extra_line = f"This paragraph contains information about Round {round_num}, including coding questions, duration, difficulty, and interview date.\n"
        return chunk.replace(heading, f"{heading}\n{extra_line}", 1)
    return chunk

# Function to split interview text into chunks
def split_interview_rounds(interview_text, source_id):
    """
    Splits interview text into preparation + rounds using '### Round' as marker.
    Returns list of dictionaries with page_content and metadata.
    """
    # Use regex to split before "### Round"
    chunks = re.split(r'(?=### Round)', interview_text.strip())

    docs = []

    # First chunk (before first round) = Preparation part
    intro_chunk = chunks[0].strip()
    if intro_chunk:
        docs.append({
            "page_content": intro_chunk,
            "metadata": {
                "source": source_id,
                "chunk_index": 0,
                "type": "interview_preparation"
            }
        })

    # Following chunks are the rounds
    for i, chunk in enumerate(chunks[1:], start=1):
        chunk = insert_intro_line(chunk.strip())
        if chunk:
            docs.append({
                "page_content": chunk,
                "metadata": {
                    "source": source_id,
                    "chunk_index": i,
                    "type": "interview_round",
                    "round_number": i
                }
            })

    return docs

all_docs = []

for idx, row in final_dataframe.iterrows():
    interview_text = row["description"]
    source_id = f"interview_{idx}"
    docs = split_interview_rounds(interview_text, source_id)
    all_docs.extend(docs)

NameError: name 'all_chunks' is not defined

In [143]:
print(final_dataframe['description'][0])

## Interview Preparation Journey
Interview preparation journey
Journey
While hunting for my first job, I kept track of every website that posted openings from top companies. In the process, I would apply to about 10-15 job openings a week. One of those was Walmart's, which I came across on Dare to Compete.
Application story
In the beginning, I applied through many online portals like Naukri, LinkedIn, and other sites, but I did not receive any response. Later, I found out that one of my friends works at Oracle, so I applied through his referral.
Why selected/rejected for the role?
I've given all 4 rounds of interviews, but 3rd round, the coding round, I found a little bit difficult compared to the others
Preparation
Duration: 3 months
Topics: Data structures and algorithms, System design, DBMS, Operating system
Tip
Tip 1 : solve maximum coding questions.
Tip 2 : develop problem-solving skills.
Tip 3 : do some projects.
Application process
Where: Campus
Eligibility: No
Resume tip
Tip 1 

In [144]:
chunks = chunk_dataframe_descriptions(final_dataframe, text_col="description")

for i in range(len(chunks)):
    print("this is chunk",i,chunks[i].page_content)

this is chunk 0 ## Interview Preparation 

Journey Interview preparation journey

Journey While hunting for my first job, I kept track of every website that posted openings from top companies. In the process, I would apply to about 10-15 job openings a week. One of those was Walmart's, which I came across on Dare to Compete.

Application story In the beginning, I applied through many online portals like Naukri, LinkedIn, and other sites, but I did not receive any response. Later, I found out that one of my friends works at Oracle, so I applied through his referral.

Why selected/rejected for the role? I've given all 4 rounds of interviews, but 3rd round, the coding round, I found a little bit difficult compared to the others Preparation Duration: 3 months Topics: Data structures and algorithms, System design, DBMS, Operating system Tip Tip 1 : solve maximum coding questions. Tip 2 : develop problem-solving skills. Tip 3 : do some projects. Application process Where: Campus Eligibility:

In [145]:
len(all_docs)

26

In [78]:
import pandas as pd

#
chunks = chunk_dataframe_descriptions(final_dataframe, text_col="description")

# Sample output
print(chunks[0].page_content)
print(chunks[0].metadata)


## Interview Preparation 

Journey Interview preparation journey

Journey While hunting for my first job, I kept track of every website that posted openings from top companies. In the process, I would apply to about 10-15 job openings a week. One of those was Walmart's, which I came across on Dare to Compete.

Application story In the beginning, I applied through many online portals like Naukri, LinkedIn, and other sites, but I did not receive any response. Later, I found out that one of my friends works at Oracle, so I applied through his referral.

Why selected/rejected for the role? I've given all 4 rounds of interviews, but 3rd round, the coding round, I found a little bit difficult compared to the others Preparation Duration: 3 months Topics: Data structures and algorithms, System design, DBMS, Operating system Tip Tip 1 : solve maximum coding questions. Tip 2 : develop problem-solving skills. Tip 3 : do some projects. Application process Where: Campus Eligibility: No
{'source_id'

In [47]:
result = final_dataframe['description'].str.cat(sep=' ')

In [49]:
result

'## Interview Preparation Journey\nInterview preparation journey\nJourney\nWhile hunting for my first job, I kept track of every website that posted openings from top companies. In the process, I would apply to about 10-15 job openings a week. One of those was Walmart\'s, which I came across on Dare to Compete.\nApplication story\nIn the beginning, I applied through many online portals like Naukri, LinkedIn, and other sites, but I did not receive any response. Later, I found out that one of my friends works at Oracle, so I applied through his referral.\nWhy selected/rejected for the role?\nI\'ve given all 4 rounds of interviews, but 3rd round, the coding round, I found a little bit difficult compared to the others\nPreparation\nDuration: 3 months\nTopics: Data structures and algorithms, System design, DBMS, Operating system\nTip\nTip 1 : solve maximum coding questions.\nTip 2 : develop problem-solving skills.\nTip 3 : do some projects.\nApplication process\nWhere: Campus\nEligibility: 

In [48]:
import re
import json
from collections import defaultdict

def clean_and_structure(raw_text: str):
    entries = []
    interviews = re.split(r'## Interview Preparation Journey', raw_text)
    
    for interview in interviews:
        if not interview.strip():
            continue
        
        data = defaultdict(lambda: None)
        # Extract application method
        match = re.search(r'Application process\nWhere: (.+)', interview)
        if match:
            data['application_method'] = match.group(1).strip()
        
        # Extract eligibility
        match = re.search(r'Eligibility: ([^\n]+)', interview)
        if match:
            data['eligibility'] = match.group(1).strip()

        # Extract preparation duration
        match = re.search(r'Preparation\nDuration: ([^\n]+)', interview)
        if match:
            data['preparation_duration'] = match.group(1).strip()

        # Extract preparation topics
        match = re.search(r'Topics: ([^\n]+)', interview)
        if match:
            data['topics'] = [topic.strip() for topic in match.group(1).split(',')]

        # Extract tips
        tips = re.findall(r'Tip \d+: (.+)', interview)
        if tips:
            data['tips'] = tips

        # Extract resume tips
        resume_tips = re.findall(r'Resume tip\n(?:Tip \d+: )?(.+?)(?=\n(?:Tip \d+:|$))', interview, flags=re.DOTALL)
        if resume_tips:
            data['resume_tips'] = [tip.strip().replace('\n', ' ') for tip in resume_tips]

        # Extract rounds
        rounds = []
        round_blocks = re.findall(r'### Round (\d+)(.+?)(?=### Round \d+|$)', interview, flags=re.DOTALL)
        for round_num, round_text in round_blocks:
            round_info = {
                'round_number': int(round_num),
                'mode': re.search(r'Mode[:\s]*([^\n]+)', round_text, re.IGNORECASE),
                'duration': re.search(r'Duration[:\s]*([^\n]+)', round_text, re.IGNORECASE),
                'type': None,
                'questions': []
            }

            # Parse questions inside each round
            questions = re.findall(r'\d+\.\s+(.+?)\n(?:Easy|Moderate|Hard)', round_text)
            difficulties = re.findall(r'\d+\.\s+.+?\n(Easy|Moderate|Hard)', round_text)
            approaches = re.findall(r'Problem approach\n(.+?)(?=\nSolve later|\n\d+\.\s|$)', round_text, re.DOTALL)

            for i in range(len(questions)):
                q = {
                    'title': questions[i].strip(),
                    'difficulty': difficulties[i].strip() if i < len(difficulties) else 'Unknown',
                    'approach': approaches[i].strip().replace('\n', ' ') if i < len(approaches) else ''
                }
                round_info['questions'].append(q)

            rounds.append(round_info)

        data['interview_rounds'] = rounds
        entries.append(dict(data))

    return entries




structured_data = clean_and_structure(result)
structured_data

[{'application_method': 'Campus',
  'eligibility': 'No',
  'preparation_duration': '3 months',
  'topics': ['Data structures and algorithms',
   'System design',
   'DBMS',
   'Operating system'],
  'interview_rounds': [{'round_number': 1,
    'mode': <re.Match object; span=(497, 505), match='Moderate'>,
    'duration': <re.Match object; span=(34, 53), match='Duration\n30 minutes'>,
    'type': None,
    'questions': [{'title': 'Reverse the String',
      'difficulty': 'Easy',
      'approach': '1. Initialize a variable to hold the maximum element and set it to the first element in the array.'},
     {'title': 'Sort Array', 'difficulty': 'Moderate', 'approach': ''}]},
   {'round_number': 2,
    'mode': None,
    'duration': <re.Match object; span=(36, 55), match='Duration\n40 minutes'>,
    'type': None,
    'questions': [{'title': 'Maximum Depth Of A Binary Tree',
      'difficulty': 'Easy',
      'approach': '1. Check if the root node is null, if so, return 0.'},
     {'title': 'Reve

In [5]:
result

'## Interview Preparation Journey\nInterview preparation journey\nJourney\nWhile hunting for my first job, I kept track of every website that posted openings from top companies. In the process, I would apply to about 10-15 job openings a week. One of those was Walmart\'s, which I came across on Dare to Compete.\nApplication story\nIn the beginning, I applied through many online portals like Naukri, LinkedIn, and other sites, but I did not receive any response. Later, I found out that one of my friends works at Oracle, so I applied through his referral.\nWhy selected/rejected for the role?\nI\'ve given all 4 rounds of interviews, but 3rd round, the coding round, I found a little bit difficult compared to the others\nPreparation\nDuration: 3 months\nTopics: Data structures and algorithms, System design, DBMS, Operating system\nTip\nTip 1 : solve maximum coding questions.\nTip 2 : develop problem-solving skills.\nTip 3 : do some projects.\nApplication process\nWhere: Campus\nEligibility: 

In [51]:
import json

def json_to_documents(json_data):
    documents = []
    for entry in json_data:
        doc = f"Application Method: {entry.get('application_method', '')}\n"
        doc += f"Eligibility: {entry.get('eligibility', '')}\n"
        doc += f"Preparation Duration: {entry.get('preparation_duration', '')}\n"
        doc += f"Topics: {', '.join(entry.get('topics', []))}\n"
        doc += "Tips:\n" + "\n".join(entry.get('tips', [])) + "\n"
        doc += "Resume Tips:\n" + "\n".join(entry.get('resume_tips', [])) + "\n"
        for round_info in entry.get('interview_rounds', []):
            doc += f"\nRound {round_info['round_number']}\n"
            for q in round_info['questions']:
                doc += f"Q: {q['title']} ({q['difficulty']})\nApproach: {q['approach']}\n"
        documents.append(doc.strip())
    return documents


In [None]:
results = json_to_documents(structured_data)

In [53]:
results

['Application Method: Campus\nEligibility: No\nPreparation Duration: 3 months\nTopics: Data structures and algorithms, System design, DBMS, Operating system\nTips:\n\nResume Tips:\n\n\nRound 1\nQ: Reverse the String (Easy)\nApproach: 1. Initialize a variable to hold the maximum element and set it to the first element in the array.\nQ: Sort Array (Moderate)\nApproach: \n\nRound 2\nQ: Maximum Depth Of A Binary Tree (Easy)\nApproach: 1. Check if the root node is null, if so, return 0.\nQ: Reverse Stack Using Recursion (Easy)\nApproach: \n\nRound 3',
 "Application Method: Campus\nEligibility: No\nPreparation Duration: 3 months\nTopics: Data structures and algorithms, System design, DBMS, Operating system\nTips:\ndon't put false things\nkeep ur resume short\nResume Tips:\ndon't put false things\n\nRound 1\nQ: Reverse Words In A String (Easy)\nApproach: 1. Create an empty string to hold the reversed string\nQ: Largest Element in the Array (Easy)\nApproach: 1. Initialize a variable to hold th

In [8]:
results

['Application Method: Campus\nEligibility: No\nPreparation Duration: 3 months\nTopics: Data structures and algorithms, System design, DBMS, Operating system\nTips:\n\nResume Tips:\n\n\nRound 1\nQ: Reverse the String (Easy)\nApproach: 1. Initialize a variable to hold the maximum element and set it to the first element in the array.\nQ: Sort Array (Moderate)\nApproach: \n\nRound 2\nQ: Maximum Depth Of A Binary Tree (Easy)\nApproach: 1. Check if the root node is null, if so, return 0.\nQ: Reverse Stack Using Recursion (Easy)\nApproach: \n\nRound 3',
 "Application Method: Campus\nEligibility: No\nPreparation Duration: 3 months\nTopics: Data structures and algorithms, System design, DBMS, Operating system\nTips:\ndon't put false things\nkeep ur resume short\nResume Tips:\ndon't put false things\n\nRound 1\nQ: Reverse Words In A String (Easy)\nApproach: 1. Create an empty string to hold the reversed string\nQ: Largest Element in the Array (Easy)\nApproach: 1. Initialize a variable to hold th

In [None]:
from langchain_core.documents import Document

documents = [Document(page_content=doc["page_content"], metadata=doc["metadata"]) for doc in all_docs]
documents

TypeError: 'Document' object is not subscriptable

In [147]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Set your Gemini API key
import os
os.environ["GOOGLE_API_KEY"] = "AIzaSyC93UXPXnew9OdGP1nl2sE5v6dr1yZdzwM"

In [148]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [149]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-exp-03-07")


In [150]:
from langchain_community.vectorstores import FAISS

In [151]:
documents

[Document(metadata={'source': 'interview_0', 'chunk_index': 0, 'type': 'interview_preparation'}, page_content="## Interview Preparation Journey\nInterview preparation journey\nJourney\nWhile hunting for my first job, I kept track of every website that posted openings from top companies. In the process, I would apply to about 10-15 job openings a week. One of those was Walmart's, which I came across on Dare to Compete.\nApplication story\nIn the beginning, I applied through many online portals like Naukri, LinkedIn, and other sites, but I did not receive any response. Later, I found out that one of my friends works at Oracle, so I applied through his referral.\nWhy selected/rejected for the role?\nI've given all 4 rounds of interviews, but 3rd round, the coding round, I found a little bit difficult compared to the others\nPreparation\nDuration: 3 months\nTopics: Data structures and algorithms, System design, DBMS, Operating system\nTip\nTip 1 : solve maximum coding questions.\nTip 2 : d

In [227]:
vectorstore = FAISS.from_documents(
    chunks,
    GoogleGenerativeAIEmbeddings(model="embedding-001"),  # or your model name
)

In [228]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n1.", "\n2.", "\n", "."]
)

docs = splitter.split_documents(documents)


In [170]:
len(docs)

92

In [229]:
vectorstore.index_to_docstore_id

{0: 'ccffe890-a153-44ed-84d8-856013c156e7',
 1: '575f4f5a-b08a-45b7-9f85-4cb0a1f2166e',
 2: 'dc458992-2da0-4492-87b1-107fb7d9d43b',
 3: 'a5d74240-b5a0-4001-b1ce-0126a5fd83e6',
 4: 'e00bdc9e-3af7-47ff-990f-760e580e2842',
 5: '9be8aa52-03c4-4a72-9a38-a9eeaeafa648',
 6: 'fdb6e7ed-30b8-4d8e-bbda-87522c28af0c',
 7: '2616bfcc-cf74-4593-98c0-6e55d1161d32',
 8: '4f7ce36c-8c97-44db-a273-fd822da90a9d',
 9: '632e566d-3f61-4629-b3e5-4875152adf6b',
 10: '9cf06940-13bd-43dc-9c0c-80279b2e041e',
 11: '64d2dba2-8764-45bb-9a8b-d2a1fc4595eb',
 12: 'fea5a76a-93f7-4cdd-abc2-2f4b5849c6b2',
 13: '5bdc3368-0350-4394-99ef-b862532dfc66',
 14: '8b25acf2-11ee-4c7f-884b-81f7c22f855a',
 15: 'c5ebd7e3-d537-4ce9-b3bc-f75c7ef44afa',
 16: '251261d5-69b8-4bce-b2ee-c90e2500387d',
 17: '4f0b8aaf-3e44-4213-b74d-f95bcb0b461a',
 18: '0bcbedad-f200-436e-8ebf-6d61ac68c873',
 19: 'fa0844a7-1eb3-4992-ae48-3ee8271e8e3a',
 20: 'cc93a7ec-09c1-405a-87e0-8c8de91e74ae',
 21: 'a9c11da1-6d55-4d38-acd0-daf769a8cd4c',
 22: 'b57e285b-3dbf-

In [230]:
docs[0]

Document(metadata={'source': 'interview_0', 'chunk_index': 0, 'type': 'interview_preparation'}, page_content="## Interview Preparation Journey\nInterview preparation journey\nJourney\nWhile hunting for my first job, I kept track of every website that posted openings from top companies. In the process, I would apply to about 10-15 job openings a week. One of those was Walmart's, which I came across on Dare to Compete.\nApplication story")

In [231]:
retriever = vectorstore.as_retriever(search_type = 'similarity',search_kwargs = {"k":8})

In [184]:
import re

def extract_round_number(query: str):
    match = re.search(r'\bround\s*(\d+)', query.lower())
    return int(match.group(1)) if match else None

# Final retrieval function
def get_round_specific_docs(query):
    round_num = extract_round_number(query)
    if round_num is not None:
        return vectorstore.similarity_search(
            query,
            k=5,  # or whatever you prefer
            filter={"chunk_index": round_num}
        )
    else:
        # Fallback to regular semantic search
        return vectorstore.similarity_search(query, k=8)



In [232]:
retriever.invoke("give me list of all the coding questions")

[Document(id='2616bfcc-cf74-4593-98c0-6e55d1161d32', metadata={'source_id': 0}, page_content='🔗 Problem Links: null'),
 Document(id='4f0b8aaf-3e44-4213-b74d-f95bcb0b461a', metadata={'source_id': 1}, page_content='🔗 Problem Links: null\n\n### Round 4 04 Round Medium HR Round Duration 20 minutes Interview date 2 Mar 2023 Coding problem 1 It was last round hr round. 1. Basic HR Questions What inspired you to become a developer?\n\n🔗 Problem Links: null'),
 Document(id='4229ff81-4559-4177-af7b-bf48b061ed4c', metadata={'source_id': 5}, page_content=". Basically, it is Djikstra algorithm on matrix. Solve later Try solving now 2. SQL Question We have to write a query for the test case asked. It is a general query which one can practice from coding ninjas guided path for dbms. Problem approach Tip 1 : You should have good command on SQL. Tip 2 : Practice SQL queries (I'd recommend to do it from coding ninjas guided path for DBMS). 3. Rest API I don't exactly remember this question. But it was 

In [233]:
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=1.5,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [234]:
from langchain_core.prompts import PromptTemplate
prompt = PromptTemplate(
    template = """
You are a helpful assistant.

The transcript contains details of multiple interview rounds from one or more candidates for a job.
You will be given a context containing the interview details and a question about it.
Your task is to answer the question based on the provided context.
Be creative and provide a detailed answer.
assume all the interviews given by same person so dont bifurcate the answer on person level give me generalized answer.
dont use excessive * and use emojis
If the transcript lacks sufficient details, respond with:  
**"The information is not available in the provided transcript."**

use the following context only:
{context}

Question: {question}

Answer:
    
    """,
    input_variables = ['context', 'question']
)

In [235]:
question = "Give me list of questions asked in round 1"
retrieved_docs = get_round_specific_docs(question)

In [208]:
retrieved_docs

[Document(id='f6d1ecee-dfbe-4634-a400-ea809a435151', metadata={'source': 'interview_1', 'chunk_index': 1, 'type': 'interview_round', 'round_number': 1}, page_content="### Round 1\nThis paragraph contains information about Round 1, including coding questions, duration, difficulty, and interview date.\n\n01\nRound\nEasy\nOnline Coding Test\nDuration\n30 minutes\nInterview date\n1 Mar 2023\nCoding problem\n3\nThis is my 1st round, it was held on 1st march 2023, and it was contain only basic coding questions and it seems for me easy.\n1. Reverse Words In A String\nEasy\n10m average time\n90% success\n0/40\nAsked in companies\nYou are given a string 'str' of length 'N'.\n\nYour task is to reverse the original string word by word.\n\nView more\nProblem approach\n1. Create an empty string to hold the reversed string\n2. Loop through the input string from the end to the beginning\n3. For each character, append it to the empty string\n4. Return the reversed string\nSolve later\nTry solving now\

In [236]:
context_text = '\n\n'.join(doc.page_content for doc in retrieved_docs)

In [237]:
final_prompt = prompt.invoke({"context":context_text, "question":question})

In [238]:
answer = llm.invoke(final_prompt)
print(answer)

content='**The information is not available in the provided transcript.**😔' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []} id='run--35bf2a41-6bcc-4fbb-a275-e1c12093803b-0' usage_metadata={'input_tokens': 147, 'output_tokens': 13, 'total_tokens': 160, 'input_token_details': {'cache_read': 0}}


In [212]:
print(answer.content)

Okay, based on the provided interview transcripts, here's a breakdown of the coding questions asked in Round 1 across all the interviews:

**Coding Problems Asked in Round 1** 🚀

The difficulty level varied from Easy to Moderate. The interview dates range from September 2022 to March 2023. Here's a consolidated list:

*   **Reverse Words In A String** (Easy)
*   **Largest Element in the Array** (Easy)
*   **Factorial of a Number** (Moderate)
*   **Reverse the String** (Easy)
*   **Sort Array** (Moderate)
*   **Remove Duplicates from Sorted Array** (Easy)
*   **Longest Palindromic Substring** (Moderate)
*   **Sort Array of Strings** (Easy)

From the transcripts, most of the problems were standard algorithmic questions. 🌟


# BUILDING A CHAIN


In [213]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableParallel
from langchain_core.output_parsers import StrOutputParser

In [214]:
def format_docs(retrieved_docs):
    context_text = '\n\n'.join(doc.page_content for doc in retrieved_docs)
    return context_text

In [215]:
parallel_chain = RunnableParallel({
    "context": retriever | RunnableLambda(format_docs),
    "question": RunnablePassthrough()
})

In [216]:
parallel_chain.invoke("what is the eligibility for the round 1?")

{'context': '## Interview Preparation Journey\nInterview preparation journey\nPreparation\nDuration: 6 month\nTopics: Data structures, Algorithms, Backtracking, Operating system\nTip\nTip 1 : Do read the concept and practice maximum\nTip 2 : Try to solve at least 2 code per day (If you\'re working in some job)\nApplication process\nWhere: Referral\nEligibility: As a experienced candidate, No criteria\nResume tip\nTip 1 : Do not put false things \nTip 2 : keep your resume short\n\n\n## Interview Rounds\n\n### Round 4\nThis paragraph contains information about Round 4, including coding questions, duration, difficulty, and interview date.\n\n04\nRound\nEasy\nHR Round\nDuration\n20 Minutes\nInterview date\n11 Sep 2022\nCoding problem\n1\n1. Basic HR Questions\nSalary discussion.\nWhat are your expectations.\nWhere do you see yourself in 5 years?\n\n🔗 Problem Links: null\n\n### Round 4\nThis paragraph contains information about Round 4, including coding questions, duration, difficulty, and 

In [217]:
parser = StrOutputParser()

In [218]:
main_chain = parallel_chain | prompt | llm | parser

In [219]:
output = main_chain.invoke("Give me list of questions asked roundwise")
output

"Alright, here's a breakdown of the interview questions asked in each round, based on the provided transcript. 🚀\n\n**Round 3**\n\n*   **OS Questions:**\n    *   What are semaphores? 🚦\n    *   What is virtual memory? 💾\n*   **System Design Question:**\n    *   Design a URL shortening service. 🔗\n*   **DBMS Question:**\n    *   Select all records from a table where a particular column value equals a given value. 🗄️\n    *   Define ACID properties. 🧪\n*   **Operating System Question:**\n    *   Implement a mutex lock using semaphores. 🔒\n\n**Round 4 (HR Round)**\n\n*   Basic HR Questions (across all round 4 instances):\n    *   Salary discussion. 💰\n    *   What are your expectations? 🤔\n    *   Where do you see yourself in 5 years? 🔮\n    *   What inspired you to become a developer? 💡\n*   **Project Discussions and General Conversations:**\n    *   Describe your internship project. 🏢\n    *   Describe your college project. 🎓\n*   **Behavioral Questions:**\n    *   Describe any event in

In [220]:
print(output)

Alright, here's a breakdown of the interview questions asked in each round, based on the provided transcript. 🚀

**Round 3**

*   **OS Questions:**
    *   What are semaphores? 🚦
    *   What is virtual memory? 💾
*   **System Design Question:**
    *   Design a URL shortening service. 🔗
*   **DBMS Question:**
    *   Select all records from a table where a particular column value equals a given value. 🗄️
    *   Define ACID properties. 🧪
*   **Operating System Question:**
    *   Implement a mutex lock using semaphores. 🔒

**Round 4 (HR Round)**

*   Basic HR Questions (across all round 4 instances):
    *   Salary discussion. 💰
    *   What are your expectations? 🤔
    *   Where do you see yourself in 5 years? 🔮
    *   What inspired you to become a developer? 💡
*   **Project Discussions and General Conversations:**
    *   Describe your internship project. 🏢
    *   Describe your college project. 🎓
*   **Behavioral Questions:**
    *   Describe any event in your life after which your