In [None]:
!pip install fpdf

In [None]:
!pip install groq

🔍 Web Scraper for GeeksforGeeks Interview Experiences
This script scrapes Adobe interview experiences from the GeeksforGeeks Experienced Interviews page. It performs the following:

1. Locates the section for the specified company (Adobe).

2. Extracts all interview links under that section.

3. Infers the candidate’s years of experience from the link title and maps it to a role (SDE-1, SDE-2, SDE-3).

4. Saves the data (Company, Title, Link, Years, Role) into a CSV file named adobe_experiences.csv.










In [None]:
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
import re
import csv

# Index page that is downloaded and searched for the company's section label.
BASE_URL = "https://www.geeksforgeeks.org/interview-experiences/experienced-interview-experiences-company-wise/"
# Company name stored in each row's "Company" field and used to name the output CSV.
COMPANY = "Adobe"

def infer_role_and_years(title):
    """Guess years of experience and an SDE level from an interview title.

    The first "<number> yr" / "<number> year" mention (case-insensitive) is
    read as the years of experience; a title without one counts as 0.0 years.

    Returns:
        A ``(years, role)`` tuple where ``years`` is a float and ``role`` is
        "SDE-1" (up to 2 years), "SDE-2" (up to 5 years) or "SDE-3".
    """
    found = re.search(r'(\d+(\.\d+)?)\s*(?:yr|year)', title, re.IGNORECASE)
    years = 0.0 if found is None else float(found.group(1))

    # Upper bounds are inclusive and tried in ascending order.
    for upper_bound, level in ((2, "SDE-1"), (5, "SDE-2")):
        if years <= upper_bound:
            return years, level
    return years, "SDE-3"

def scrape_amazon_experiences():
    """Collect the interview links listed under ``COMPANY``'s section.

    The function keeps its historical name for existing callers, but the
    section it scrapes is the one labelled "<COMPANY> :" on ``BASE_URL``.
    Every ``<a href>`` found after that label, up to the next "<Name> :"
    label, becomes one entry.

    Returns:
        A list of dicts with the keys "Company", "Title", "Link", "Years"
        and "Role"; an empty list when the section label is not found.

    Raises:
        requests.HTTPError: if the index page answers with an error status.
    """
    # A timeout keeps the cell from hanging forever on a stalled connection.
    resp = requests.get(BASE_URL, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    # 1) Locate the text node that labels the company's section, e.g. "Adobe :"
    label_node = soup.find(string=re.compile(rf'^\s*{re.escape(COMPANY)}\s*:$'))
    if not label_node:
        print(f"❌ Could not locate the text node “{COMPANY} :”.")
        return []

    section_label = re.compile(r'^\s*[A-Za-z0-9 &]+\s*:$')
    entries = []
    # 2) Walk everything after the label until the next company's label
    for elem in label_node.next_elements:
        if isinstance(elem, NavigableString):
            text = elem.strip()
            # text[:-1] drops the trailing colon so "Adobe :" compares as "Adobe"
            if section_label.match(text) and text[:-1].strip() != COMPANY:
                break
            continue

        # Every anchor with an href inside the section is an interview link
        if isinstance(elem, Tag) and elem.name == "a" and elem.get("href"):
            title = elem.get_text(strip=True)
            yrs, role = infer_role_and_years(title)
            entries.append({
                "Company": COMPANY,
                "Title": title,
                "Link": elem["href"],
                "Years": yrs,
                "Role": role,
            })

    return entries

def main():
    """Scrape the company's section and save the rows to ``<company>_experiences.csv``.

    Prints how many entries were found; nothing is written when there are none.
    """
    data = scrape_amazon_experiences()
    print(f"✅ Found {len(data)} entries for {COMPANY}.")

    if data:
        columns = ["Company", "Title", "Link", "Years", "Role"]
        fname = f"{COMPANY.lower()}_experiences.csv"
        with open(fname, "w", newline="", encoding="utf-8") as out_file:
            csv_out = csv.DictWriter(out_file, fieldnames=columns)
            csv_out.writeheader()
            for record in data:
                csv_out.writerow(record)

        print(f"✅ Written details to {fname}")

if __name__ == "__main__":
    main()


Summary of the second script:

---

### 📄 Extract Full Interview Experience Content from GFG Links

This script reads a list of Adobe interview experience links from `adobe_experiences.csv`, fetches each webpage, and extracts structured content:

* Uses `BeautifulSoup` to parse the interview article page.
* Identifies interview rounds based on `<strong>` tags (like “Round 1”, “Technical Round”, etc.).
* Preserves round titles and HTML formatting for better structure.
* Handles inconsistent HTML and adds fallback logic for different page layouts.
* Appends the extracted experience text to each row and writes the output to `adobe_experiences_full_text.csv`.
* Adds error handling and polite scraping via delays between requests.

At the end, it shows a preview of the first successfully extracted interview content.

---


In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time

# Substrings that mark a <strong> heading as the start of an interview round.
_ROUND_KEYWORDS = ('round', 'interview', 'telephonic', 'f2f', 'phone', 'onsite',
                   'technical', 'hr', 'managerial', 'written', 'coding', 'design',
                   'screening', 'assessment', 'test')


def _looks_like_round(heading):
    """Return True when *heading* contains any round keyword (case-insensitive)."""
    lowered = heading.lower()
    return any(keyword in lowered for keyword in _ROUND_KEYWORDS)


def _collect_round_html(strong):
    """Join the markup of the siblings after *strong*, up to the next round heading."""
    parts = []
    for sibling in strong.next_siblings:
        # Text nodes may not expose a tag name, hence getattr.
        if getattr(sibling, 'name', None) == 'strong' \
                and _looks_like_round(sibling.get_text(strip=True)):
            break
        parts.append(str(sibling))
    return ''.join(parts).strip()


def _clean_round_html(fragment):
    """Collapse whitespace and drop HTML comments and <div> wrappers from *fragment*."""
    fragment = re.sub(r'\s+', ' ', fragment)
    fragment = re.sub(r'<!--.*?-->', '', fragment, flags=re.DOTALL)
    fragment = re.sub(r'</?div[^>]*>', '', fragment)
    return fragment.strip()


def fetch_full_text(link):
    """Fetch an interview page and return its content organised by round.

    Every ``<strong>`` heading that contains a round keyword starts a section:
    its text becomes an ``<h3>`` title, followed by the markup of the siblings
    that come after it up to the next such heading. When no section can be
    built, the visible text of the page is returned with whitespace collapsed.

    Args:
        link: URL of the interview-experience page.

    Returns:
        The extracted content, or an in-band failure message: the literal
        "Content div not found", or a string starting with "Network error:"
        or "Parsing error:".
    """
    try:
        # Browser-like User-Agent to avoid being blocked
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        resp = requests.get(link, headers=headers, timeout=30)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")

        # Main content container, with fallbacks for other page layouts
        text_div = (soup.find("div", class_="text") or
                    soup.find("div", class_="entry-content") or
                    soup.find("article") or
                    soup.find("div", class_="content") or
                    soup.body)

        if not text_div:
            return "Content div not found"

        sections = []
        for strong in text_div.find_all('strong'):
            round_title = strong.get_text(strip=True)
            if not _looks_like_round(round_title):
                continue

            round_content = _clean_round_html(_collect_round_html(strong))
            if round_content:  # Only add if there's actual content
                sections.append(f"<h3>{round_title}</h3>\n{round_content}\n")

        if not sections:
            # Same plain-text fallback whether the page has no <strong> tags
            # at all or none of them produced a section.
            plain_text = text_div.get_text(separator=' ', strip=True)
            return re.sub(r'\s+', ' ', plain_text)

        result = '\n'.join(sections)
        result = re.sub(r'\n\s*\n\s*\n+', '\n\n', result)  # Remove excessive newlines
        return result.strip()

    except requests.RequestException as e:
        return f"Network error: {str(e)}"
    except Exception as e:
        return f"Parsing error: {str(e)}"

def main():
    """Add the full interview text to every row of the scraped CSV.

    Reads ``adobe_experiences.csv``, calls ``fetch_full_text`` on each row's
    ``Link`` (pausing 2 seconds between requests), stores the result in a new
    ``Interview_Experience`` column and writes
    ``adobe_experiences_full_text.csv``. Progress, success/error counts and a
    preview of the first successful extraction are printed.
    """
    INPUT_CSV = "adobe_experiences.csv"
    OUTPUT_CSV = "adobe_experiences_full_text.csv"
    # In-band failure messages: those returned by fetch_full_text() plus the
    # fallback produced below. Kept in one place so counting and preview agree.
    FAILURE_MARKERS = ('Network error:', 'Parsing error:',
                       'Unexpected error:', 'Content div not found')

    rows = []
    processed_count = 0
    error_count = 0

    with open(INPUT_CSV, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        # fieldnames is None for an empty file
        fieldnames = list(reader.fieldnames or []) + ["Interview_Experience"]

        for i, entry in enumerate(reader):
            # Short rows yield None for missing cells; normalise to ''
            link = entry.get('Link') or ''
            title = entry.get('Title') or ''

            print(f"Fetching {i+1}: {title[:50]}...")

            # Add delay to be respectful to the server
            if i > 0:
                time.sleep(2)  # 2 second delay between requests

            try:
                interview_content = fetch_full_text(link)
            except Exception as e:
                print(f"Unexpected error processing {link}: {e}")
                interview_content = f"Unexpected error: {str(e)}"

            if interview_content.startswith(FAILURE_MARKERS):
                error_count += 1
            else:
                processed_count += 1

            rows.append({**entry, 'Interview_Experience': interview_content})

    # Write to output CSV
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"\n✅ Processing complete!")
    print(f"   Total entries: {len(rows)}")
    print(f"   Successfully processed: {processed_count}")
    print(f"   Errors: {error_count}")
    print(f"   Output file: {OUTPUT_CSV}")

    # Show sample of what was extracted
    if rows:
        print(f"\nSample from first successful entry:")
        for row in rows:
            content = row.get('Interview_Experience', '')
            if not content.startswith(FAILURE_MARKERS):
                print(f"Title: {row.get('Title', 'N/A')}")
                print(f"Content preview (first 400 chars):")
                print(f"{content[:400]}...")
                break

if __name__ == '__main__':
    main()


## Filter SDE-1 Interview Experiences

- Loads interview data from a CSV file using pandas.
- Filters rows where the 'Role' column is exactly 'SDE-1' (case-insensitive, trims spaces).
- Saves filtered results to a new CSV file named `sde_one_only.csv`.
- Prints the count of SDE-1 entries and shows a preview of the filtered data

In [None]:
import pandas as pd

# Read the scraped interviews together with their full write-up text
file_path = "adobe_experiences_full_text.csv"  # Replace with the path to your CSV file
df = pd.read_csv(file_path)

# Keep the rows whose Role is 'SDE-1' once surrounding spaces and letter case are ignored
is_sde1 = df['Role'].str.strip().str.upper().eq('SDE-1')
filtered_df = df.loc[is_sde1]

# Persist the subset for the summarization step
filtered_df.to_csv("sde_one_only.csv", index=False)

# Report how many rows matched and show the first few
print(f"Total entries with Role 'SDE-1': {len(filtered_df)}")
print(filtered_df.head())


- The script processes SDE-1 interview experiences from a CSV, summarizes each using the Groq API, and saves the results.
- It uses the **gemma2-9b-it** model for summarization because this model is optimized for instruction-following tasks, making it suitable for extracting structured, concise interview summaries from complex HTML content.
- The approach ensures coding questions with links are preserved in markdown format, and all key interview aspects (technical, coding, design, behavioral, structure) are covered.
- A 1.2-second delay is added between API calls for rate limiting.
- Summaries are saved to a new CSV file for further analysis or review

In [None]:
from groq import Groq
import pandas as pd
from bs4 import BeautifulSoup
import time

# NOTE(review): "api-key" looks like a placeholder — a real Groq API key must be
# supplied before the summarization calls below can succeed; confirm.
client = Groq(api_key="api-key")

def summarize_interview_experience(html_content):
    """Summarize one interview write-up with the Groq chat API.

    The prompt carries the first 3000 characters of both the tag-stripped text
    and the raw HTML, so the model can reproduce links as markdown.

    Args:
        html_content: HTML of the write-up; a null value (None/NaN) is skipped.

    Returns:
        The model's reply, or "" for null input or when the API call raises
        (the error is printed).
    """
    if pd.isnull(html_content):
        return ""

    # Plain-text rendering of the same content; the raw HTML is kept for the links
    clean_text = BeautifulSoup(html_content, 'html.parser').get_text(separator=' ', strip=True)

    prompt = f"""
You are given a software-engineering interview write-up in HTML. Extract and summarize it in 3–5 bullet points, covering:
- Technical questions asked
- Coding challenges (further divide these into topics: Array, Tree, String, DP, Graph)
- System-design components
- Behavioral questions
- Interview structure/rounds

**Important:** For every coding question, if the original HTML contained an `<a>` anchor, include it in markdown form `[Question text](URL)`.

---
**Interview write-up (plain text):**
{clean_text[:3000]}

---
**Interview write-up (raw HTML, for link extraction):**
{html_content[:3000]}
"""

    request = dict(
        model="gemma2-9b-it",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        max_completion_tokens=400,
    )
    try:
        reply = client.chat.completions.create(**request)
        return reply.choices[0].message.content
    except Exception as e:
        print(f"Error summarizing: {e}")
        return ""
# Read the SDE-1 rows produced by the filtering step
df = pd.read_csv('sde_one_only.csv')

# Summarize one write-up per API call, pausing between calls
summaries = []
for idx, row in df.iterrows():
    print(f"Processing row {idx+1}/{len(df)}")
    summaries.append(summarize_interview_experience(row['Interview_Experience']))
    time.sleep(1.2)  # Maintain 1.2s between requests

# Attach the summaries as a new column and write everything out
df = df.assign(Interview_Summary=summaries)
df.to_csv('summarize_interviews.csv', index=False)
print("Saved summarize_interviews.csv")

This script aggregates and processes summarized SDE-1 interview data to create a structured PDF-style report. It uses the **deepseek-r1-distill-llama-70b** model through Groq's API, chosen for its capacity to handle complex technical content and generate well-structured markdown outputs required for professional documentation. Key aspects:

**Model Selection Rationale**  
- Optimized for technical content synthesis and instruction-following  
- Balances conciseness with detail retention for multi-section reports  
- Handles markdown formatting requirements natively  

**Workflow**  
1. Merges individual interview summaries into a single text corpus  
2. Uses LLM to extract patterns and organize content into:  
   - Technical topics (Array, Tree, String, DP, Graph)  
   - Behavioral questions with sample answers  
   - System design components  
   - Interview structure analysis  
3. Preserves coding challenge links in markdown format  
4. Exports both full report and question-link pairs as CSVs  

A low temperature setting (0.2) keeps the output focused and repeatable while still allowing some variability in how the advice is phrased. This step builds on the earlier pandas-based data processing and produces structured interview-preparation material.



In [None]:
import pandas as pd
import re
from groq import Groq

# Initialize Groq client with API key as a string
# NOTE(review): "api-key" looks like a placeholder — replace it with a real key
# before running this cell; confirm.
client = Groq(api_key="api-key")

# Load the per-interview summaries and merge them into one text corpus.
df = pd.read_csv('summarize_interviews.csv')
# A failed summarization is stored as an empty string, which read_csv loads
# back as NaN. Drop those rows so the literal text "nan" never reaches the model.
merged_summary = '\n'.join(df['Interview_Summary'].dropna().astype(str).tolist())

# Enhanced prompt for SDE-1 interview PDF-style summary including request for markdown links.
# Only the first 6000 characters of the merged summaries are included.
prompt = f"""
You are an expert technical interviewer and career coach.

Given a merged summary of several software engineering interview experiences, generate a detailed, well-structured summary report similar to a professional SDE-1 interview preparation PDF.

The summary should include:
- **Key technical topics** covered (Array, Tree, String, DP, Graph, System Design)
- **Common behavioral questions** and sample answers
- **Coding challenges** with examples and links if present (keep markdown links as `[Question text](URL)`)
- **System design questions** and key points
- **Interview structure and rounds** overview
- **Notable patterns, advice, and tips** for candidates

Format the output as markdown with clear headings, bullet points, and tables where appropriate. Ensure it is concise, professional, and shloud in pdf form.

---
Merged Interview Summaries:
{merged_summary[:6000]}
---
"""

def summarize_with_llm(text, prompt, max_tokens=1024):
    """Send *prompt* to the Groq chat API and return the reply text.

    Args:
        text: Accepted for the caller's convenience but not used here; only
            ``prompt`` is sent to the model.
        prompt: Full user message for the model.
        max_tokens: Upper bound on the number of completion tokens.

    Returns:
        The model's reply, or "" when the call raises (the error is printed).
    """
    chat_messages = [{"role": "user", "content": prompt}]
    try:
        response = client.chat.completions.create(
            model="deepseek-r1-distill-llama-70b",  # Or your preferred model
            messages=chat_messages,
            temperature=0.2,
            max_completion_tokens=max_tokens,
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error summarizing: {e}")
        return ""

# Ask the model for the consolidated report
final_summary = summarize_with_llm(merged_summary, prompt)

# Pull every markdown link of the form [text](url) out of the report
pattern = r'\[([^\]]+)\]\((https?://[^\)]+)\)'
matches = re.compile(pattern).findall(final_summary)

# One row per (question text, URL) pair, written as a lookup table
questions_links_df = pd.DataFrame(matches, columns=['Coding Question', 'Link'])
questions_links_df.to_csv('sde1_coding_questions_links.csv', index=False)

# The whole report goes into a single CSV cell for the markdown/PDF conversion step
output_df = pd.DataFrame({'Summary': [final_summary]})
output_df.to_csv('sde1_final_summary.csv', index=False)

print("Final summary and coding question links saved successfully.")
print(final_summary)


In [None]:
!pip install weasyprint


- Loads the SDE-1 interview summary, cleans unwanted tags, and analyzes the frequency of key topics.
- Generates a pie chart showing topic distribution and embeds it in the report.
- Converts the markdown summary (with clickable links) to styled HTML.
- Combines the chart and summary into a polished PDF, preserving all links for easy access.
- Final output: a professional SDE-1 interview preparation PDF with visual analytics and structured content

In [None]:
import pandas as pd
import re
import markdown
from weasyprint import HTML
import matplotlib.pyplot as plt
import base64
from io import BytesIO

# Step 1: Load the summary
df = pd.read_csv("sde1_final_summary.csv")
summary_md = df.iloc[0]['Summary']
# An empty summary is stored as an empty cell and read back as NaN (a float),
# which would make re.sub() below raise TypeError.
if pd.isna(summary_md):
    summary_md = ""

# Step 2: Remove all <think>...</think> sections (case-insensitive)
summary_md_cleaned = re.sub(r'<think>.*?</think>', '', summary_md, flags=re.DOTALL | re.IGNORECASE)

# Step 3: Analyze topic distribution (whole-word, case-insensitive mention counts)
topics = ['Array', 'Tree', 'String', 'DP', 'Graph', 'System Design', 'Behavioral Questions']
topic_counts = {
    topic: len(re.findall(rf'\b{topic}\b', summary_md_cleaned, flags=re.IGNORECASE))
    for topic in topics
}
topic_counts = {k: v for k, v in topic_counts.items() if v > 0}

# Step 4: Create pie chart as base64 image
if topic_counts:
    fig, ax = plt.subplots(figsize=(6, 6))
    # pie() expects a 1D array-like; dict views are materialised into lists first
    ax.pie(list(topic_counts.values()), labels=list(topic_counts.keys()),
           autopct='%1.1f%%', startangle=140)
    ax.set_title("Topic Distribution in Interview Summary")
    buf = BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight')
    plt.close(fig)
    buf.seek(0)
    img_base64 = base64.b64encode(buf.read()).decode('utf-8')
    img_html = f'<img src="data:image/png;base64,{img_base64}" alt="Topic Distribution Chart" style="max-width:100%; height:auto;">'
else:
    # Nothing to chart: show a note instead of an empty pie
    img_html = '<p>No topic keywords were found in the summary.</p>'

# Step 5: Convert markdown to HTML with clickable links
html_summary = markdown.markdown(
    summary_md_cleaned,
    extensions=['extra', 'tables', 'sane_lists']
)

# Step 6: Prepare final HTML with chart + summary
styled_html = f"""
<html>
<head>
    <meta charset="utf-8">
    <style>
        @page {{
            size: A4;
            margin: 1in;
        }}
        body {{
            font-family: 'Segoe UI', sans-serif;
            line-height: 1.6;
            padding: 0;
            font-size: 14px;
            color: #333;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
        }}
        table, th, td {{
            border: 1px solid #aaa;
        }}
        th, td {{
            padding: 8px;
            text-align: left;
        }}
        h1, h2, h3 {{
            color: #2c3e50;
        }}
        a {{
            color: #1e88e5;
            text-decoration: underline;
        }}
    </style>
</head>
<body>
    <h1>SDE-1 Interview Preparation Summary</h1>
    <h2>📊 Topic Distribution</h2>
    {img_html}
    <hr>
    {html_summary}
</body>
</html>
"""

# Step 7: Generate PDF with clickable links
HTML(string=styled_html, base_url='.').write_pdf("sde1_summary_report_with_chart.pdf")

print("✅ PDF with cleaned summary and clickable links saved as 'sde1_summary_report_with_chart.pdf'")


In [None]:
!pip install groq

In [None]:
!pip install python-dotenv
from dotenv import load_dotenv
load_dotenv()

This script implements a **synchronized pipeline** that integrates all previous code components into an automated end-to-end solution for generating company-specific SDE interview preparation materials. Key features:

**Automated Workflow**  
1. **Scrapes interview experiences** from GeeksforGeeks' company-wise listings  
2. **Infers roles** (SDE-1/2/3) using regex-based experience parsing  
3. **Summarizes content** using Groq's gemma2-9b-it (per-interview) and deepseek-r1-distill-llama-70b (aggregated) models  
4. **Generates PDF reports** with topic distribution charts and clickable question links  
5. **Preserves source links** while cleaning HTML artifacts  

**Final Outputs**  
- Individual interview summaries (CSV)  
- Aggregated markdown summary (CSV)  
- Coding question reference table (CSV)  
- Styled PDF report with visual analytics  

This pipeline unifies the earlier scraping, summarization, and PDF-generation cells into a single executable workflow.

In [None]:
import os
import time
import re
import pandas as pd
from io import BytesIO
import base64
import requests
from bs4 import BeautifulSoup, Tag, NavigableString
from groq import Groq
import matplotlib.pyplot as plt
from weasyprint import HTML
import markdown
from dotenv import load_dotenv
# Load environment variables (presumably from a local .env file, if present).
load_dotenv()
# NOTE(review): when GROQ_API_KEY is unset the client is created with the
# placeholder string "Groq_api_key"; API calls will presumably fail then — confirm.
client = Groq(api_key=os.getenv("GROQ_API_KEY", "Groq_api_key"))
# Index page fetched by scrape_company_experiences().
BASE_URL = "https://www.geeksforgeeks.org/interview-experiences/experienced-interview-experiences-company-wise/"

def infer_role_and_years(title):
    """Return ``(years, role)`` inferred from an interview title.

    ``years`` is the first "<number> yr"/"<number> year" mention in the title
    (case-insensitive), or 0.0 when there is none. ``role`` is 'SDE-1' for up
    to 2 years, 'SDE-2' for up to 5 years and 'SDE-3' beyond that.
    """
    hit = re.search(r'(\d+(?:\.\d+)?)\s*(?:yr|year)', title, re.IGNORECASE)
    years = float(hit.group(1)) if hit else 0.0

    if years > 5:
        level = 'SDE-3'
    elif years > 2:
        level = 'SDE-2'
    else:
        level = 'SDE-1'
    return years, level

def scrape_company_experiences(company):
    """Collect the interview links listed under *company*'s section of the index page.

    The section starts at the text node "<company> :" and ends at the next
    "<Name> :" label. Every ``<a href>`` in between becomes one entry.

    Args:
        company: Company name exactly as it appears in the section label.

    Returns:
        A list of dicts with the keys "Company", "Title", "Link", "Years" and
        "Role"; an empty list when the page answers with an HTTP error status
        or the section label is not found (a message is printed in both cases).
    """
    # A timeout keeps the pipeline from hanging on a stalled connection.
    resp = requests.get(BASE_URL, timeout=30)
    if not resp.ok:
        # Don't parse an error page as if it were the index
        print(f"❌ Could not fetch the company index page (HTTP {resp.status_code}).")
        return []

    soup = BeautifulSoup(resp.text, "lxml")
    label_node = soup.find(string=re.compile(rf'^\s*{re.escape(company)}\s*:$'))
    if not label_node:
        print(f"❌ Could not locate section for '{company}'.")
        return []

    section_label = re.compile(r'^\s*[A-Za-z0-9 &]+\s*:$')
    entries = []
    for elem in label_node.next_elements:
        if isinstance(elem, NavigableString):
            text = elem.strip()
            # text[:-1] drops the trailing colon so "Adobe :" compares as "Adobe"
            if section_label.match(text) and text[:-1].strip() != company:
                break
            continue
        if isinstance(elem, Tag) and elem.name == "a" and elem.get("href"):
            title = elem.get_text(strip=True)
            yrs, role = infer_role_and_years(title)
            entries.append({"Company": company, "Title": title, "Link": elem["href"], "Years": yrs, "Role": role})
    return entries

def fetch_full_text(link):
    """Download an interview page and return its main content as an HTML string.

    The content container is the first match among ``div.text``,
    ``div.entry-content``, ``<article>`` and ``<body>``.

    Args:
        link: URL of the interview-experience page.

    Returns:
        The container's markup, or '' when the request fails (the error is
        printed) or no container is found.
    """
    try:
        # The timeout bounds a stalled request; error statuses are rejected so
        # that an error page is not summarized as interview content.
        resp = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as e:
        # One bad link should not abort the whole pipeline
        print(f"❌ Could not fetch {link}: {e}")
        return ''

    soup = BeautifulSoup(resp.text, 'html.parser')
    text_div = soup.find('div', class_='text') or soup.find('div', class_='entry-content') or soup.find('article') or soup.body
    return str(text_div) if text_div else ''

def summarize_single_experience(html_content):
    """Summarize one interview write-up with the Groq chat API.

    The prompt carries the first 3000 characters of both the tag-stripped text
    and the raw HTML of the write-up.

    Args:
        html_content: HTML of the write-up; a null value (None/NaN) is skipped.

    Returns:
        The model's reply, or "" for null input or when the API call raises
        (the error is printed).
    """
    if pd.isnull(html_content):
        return ""

    clean_text = BeautifulSoup(html_content, 'html.parser').get_text(separator=' ', strip=True)

    prompt = f"""
You are given a software-engineering interview write-up in HTML. Extract and summarize it in 3–5 bullet points.

Cover:
- Technical questions asked (by topic: Array, Tree, String, DP, Graph)
- System design components
- Behavioral questions
- Interview structure/rounds

Preserve markdown links from `<a>` tags like `[Question](URL)`.

---
**Text:** {clean_text[:3000]}
---
**HTML:** {html_content[:3000]}
"""

    request = dict(
        model="gemma2-9b-it",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        max_completion_tokens=400,
    )
    try:
        reply = client.chat.completions.create(**request)
        return reply.choices[0].message.content
    except Exception as e:
        print(f"❌ Error summarizing: {e}")
        return ""

def generate_final_summary(merged_summary,role):
    """Ask the Groq chat API for a consolidated markdown prep report.

    Args:
        merged_summary: All per-interview summaries joined into one string;
            only the first 6000 characters are sent.
        role: Role label inserted, upper-cased, into the prompt.

    Returns:
        The model's reply, or "" when the call raises (the error is printed).
    """
    prompt = f"""
You are an expert technical interviewer.

Given merged {role.upper()} summaries, generate a detailed report for a prep PDF.

Include:
- Key topics (Array, Tree, String, DP, Graph, System Design)
- Behavioral questions & sample answers
- Coding questions with links
- System design topics
- Interview structure
- Common advice/tips

Format professionally in markdown.

---
{merged_summary[:6000]}
---
"""
    request = dict(
        model="deepseek-r1-distill-llama-70b",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        max_completion_tokens=2048,
    )
    try:
        reply = client.chat.completions.create(**request)
        return reply.choices[0].message.content
    except Exception as e:
        print(f"❌ Error finalizing summary: {e}")
        return ""

def export_cleaned_pdf(company, role, summary_md, output_path):
    """Render the markdown summary, plus a topic pie chart, to a PDF file.

    Args:
        company: Company name shown in the report heading.
        role: Role label shown in the report heading.
        summary_md: Markdown text of the summary. ``<think>...</think>`` blocks
            are removed before rendering; a non-string value is treated as "".
        output_path: Destination handed to WeasyPrint's ``write_pdf``.
    """
    from html import escape  # local import keeps this function self-contained

    if not isinstance(summary_md, str):
        summary_md = ""

    # Remove <think>...</think>
    summary_md_cleaned = re.sub(r'<think>.*?</think>', '', summary_md, flags=re.DOTALL | re.IGNORECASE)

    # Analyze topic distribution (whole-word, case-insensitive mention counts)
    topics = ['Array', 'Tree', 'String', 'DP', 'Graph', 'System Design', 'Behavioral Questions']
    topic_counts = {}
    for topic in topics:
        mentions = len(re.findall(rf'\b{topic}\b', summary_md_cleaned, flags=re.IGNORECASE))
        if mentions > 0:
            topic_counts[topic] = mentions

    # Pie chart
    if topic_counts:
        fig, ax = plt.subplots(figsize=(6, 6))
        # pie() expects a 1D array-like; dict views are materialised into lists first
        ax.pie(list(topic_counts.values()), labels=list(topic_counts.keys()),
               autopct='%1.1f%%', startangle=140)
        ax.set_title("Topic Distribution in Interview Summary")
        buf = BytesIO()
        plt.savefig(buf, format='png', bbox_inches='tight')
        plt.close(fig)
        buf.seek(0)
        img_base64 = base64.b64encode(buf.read()).decode('utf-8')
        img_html = f'<img src="data:image/png;base64,{img_base64}" alt="Topic Distribution Chart" style="max-width:100%; height:auto;">'
    else:
        # Nothing to chart: show a note instead of an empty pie
        img_html = '<p>No topic keywords were found in the summary.</p>'

    # Convert markdown to HTML
    html_summary = markdown.markdown(summary_md_cleaned, extensions=['extra', 'tables', 'sane_lists'])

    # company/role are typed in by the user, so escape them before embedding in HTML
    safe_company = escape(str(company))
    safe_role = escape(str(role))

    # Final HTML with styles
    styled_html = f"""
    <html>
    <head>
        <meta charset="utf-8">
        <style>
            @page {{ size: A4; margin: 1in; }}
            body {{
                font-family: 'Segoe UI', sans-serif;
                font-size: 14px;
                color: #333;
                line-height: 1.6;
            }}
            table {{ border-collapse: collapse; width: 100%; }}
            th, td {{ border: 1px solid #ccc; padding: 8px; }}
            h1, h2, h3 {{ color: #2c3e50; }}
            a {{ color: #1e88e5; text-decoration: underline; }}
        </style>
    </head>
    <body>
        <h1>{safe_company} - {safe_role} Interview Preparation Summary</h1>
        <h2>📊 Topic Distribution</h2>
        {img_html}
        <hr>
        {html_summary}
    </body>
    </html>
    """
    HTML(string=styled_html, base_url='.').write_pdf(output_path)
    print(f"✅ PDF created: {output_path}")

def orchestrate_full_pipeline():
    """Run the whole flow: scrape, filter by role, fetch, summarize, export.

    Prompts for a company and a role, then writes
    ``<company>_<role>_individual_summaries.csv``,
    ``<company>_<role>_final_summary.csv``, ``<company>_coding_questions.csv``
    and ``<company>_<role>_summary.pdf``. Returns early, with a message, when
    input is missing, nothing is scraped, no entry matches the role, or the
    final summary comes back empty.
    """
    company = input("Enter Company name: ").strip()
    role = input("Enter Role (e.g. SDE-1): ").strip()
    if not company or not role:
        print("❌ Both a company name and a role are required.")
        return

    print("🔍 Scraping...")
    entries = scrape_company_experiences(company)
    if not entries:
        return

    df = pd.DataFrame(entries)
    # .copy() so the column assignments below act on an independent frame,
    # not on a filtered view of the original
    df = df[df['Role'].str.upper() == role.upper()].copy()
    if df.empty:
        print(f"❌ No entries found for {role}")
        return

    print("📥 Fetching interview content...")
    pages = []
    for i, link in enumerate(df['Link']):
        if i > 0:
            time.sleep(2)  # same pause between page requests as the standalone scraper
        pages.append(fetch_full_text(link))
    df['Interview_Experience'] = pages

    print("🧠 Summarizing interviews...")
    summaries = []
    for i, html_content in enumerate(df['Interview_Experience']):
        if i > 0:
            time.sleep(1.2)  # same pause between API calls as the standalone summarizer
        summaries.append(summarize_single_experience(html_content))
    df['Interview_Summary'] = summaries
    df.to_csv(f"{company.lower()}_{role.lower()}_individual_summaries.csv", index=False)

    print("📄 Generating final markdown summary...")
    merged_summary = '\n'.join(df['Interview_Summary'].astype(str))
    final_summary = generate_final_summary(merged_summary,role)
    if not final_summary or not final_summary.strip():
        # generate_final_summary() returns "" on failure; don't build an empty report
        print("❌ Final summary is empty; skipping CSV and PDF export.")
        return

    # Save to CSV
    summary_csv_path = f"{company.lower()}_{role.lower()}_final_summary.csv"
    pd.DataFrame({'Summary': [final_summary]}).to_csv(summary_csv_path, index=False)
    print(f"📝 Summary saved to {summary_csv_path}")

    # Extract coding question links
    print("🔗 Extracting coding links...")
    matches = re.findall(r'\[([^\]]+)\]\((https?://[^\)]+)\)', final_summary)
    pd.DataFrame(matches, columns=['Question', 'Link']).to_csv(f"{company.lower()}_coding_questions.csv", index=False)

    # PDF with styled HTML and chart
    print("📄 Building PDF...")
    export_cleaned_pdf(company, role, final_summary, f"{company.lower()}_{role.lower()}_summary.pdf")

if __name__ == "__main__":
    orchestrate_full_pipeline()


In [None]:
import re
import base64
import matplotlib.pyplot as plt
from io import BytesIO
from weasyprint import HTML
import markdown

def export_cleaned_pdf(company, role, summary_md, output_path):
    """Render the markdown summary, plus a topic pie chart, to a PDF file.

    Args:
        company: Company name shown in the report heading.
        role: Role label shown in the report heading.
        summary_md: Markdown text of the summary. ``<think>...</think>`` blocks
            are removed before rendering; a non-string value is treated as "".
        output_path: Destination handed to WeasyPrint's ``write_pdf``.
    """
    from html import escape  # local import keeps this function self-contained

    if not isinstance(summary_md, str):
        summary_md = ""

    # Remove <think>...</think> blocks
    summary_md_cleaned = re.sub(r'<think>.*?</think>', '', summary_md, flags=re.DOTALL | re.IGNORECASE)

    # Analyze topic distribution (whole-word, case-insensitive mention counts)
    topics = ['Array', 'Tree', 'String', 'DP', 'Graph', 'System Design', 'Behavioral Questions']
    topic_counts = {}
    for topic in topics:
        mentions = len(re.findall(rf'\b{topic}\b', summary_md_cleaned, flags=re.IGNORECASE))
        if mentions > 0:
            topic_counts[topic] = mentions

    # Pie chart of topics
    if topic_counts:
        fig, ax = plt.subplots(figsize=(6, 6))
        # pie() expects a 1D array-like; dict views are materialised into lists first
        ax.pie(list(topic_counts.values()), labels=list(topic_counts.keys()),
               autopct='%1.1f%%', startangle=140)
        ax.set_title("Topic Distribution in Interview Summary")
        buf = BytesIO()
        plt.savefig(buf, format='png', bbox_inches='tight')
        plt.close(fig)
        buf.seek(0)
        img_base64 = base64.b64encode(buf.read()).decode('utf-8')
        img_html = f'<img src="data:image/png;base64,{img_base64}" alt="Topic Distribution Chart" style="max-width:100%; height:auto;">'
    else:
        # Nothing to chart: show a note instead of an empty pie
        img_html = '<p>No topic keywords were found in the summary.</p>'

    # Convert markdown to HTML
    html_summary = markdown.markdown(summary_md_cleaned, extensions=['extra', 'tables', 'sane_lists'])

    # company/role are typed in by the user, so escape them before embedding in HTML
    safe_company = escape(str(company))
    safe_role = escape(str(role))

    # Final styled HTML for PDF
    styled_html = f"""
    <html>
    <head>
        <meta charset="utf-8">
        <style>
            @page {{ size: A4; margin: 1in; }}
            body {{
                font-family: 'Segoe UI', sans-serif;
                font-size: 14px;
                color: #333;
                line-height: 1.6;
            }}
            table {{ border-collapse: collapse; width: 100%; }}
            th, td {{ border: 1px solid #ccc; padding: 8px; }}
            h1, h2, h3 {{ color: #2c3e50; }}
            a {{ color: #1e88e5; text-decoration: underline; }}
        </style>
    </head>
    <body>
        <h1>{safe_company} - {safe_role} Interview Preparation Summary</h1>
        <h2>📊 Topic Distribution</h2>
        {img_html}
        <hr>
        {html_summary}
    </body>
    </html>
    """
    HTML(string=styled_html, base_url='.').write_pdf(output_path)
    print(f"✅ PDF created: {output_path}")

Matplotlib is building the font cache; this may take a moment.


In [None]:
summary_md = 