In [None]:
!pip install pdfplumber requests google-generativeai pytesseract pillow

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-win_amd64.whl.metadata (48 kB)
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   - -------------------------------------- 0.3/5.6 MB ? eta -:--:--
   ------- -------------------------------- 1.0/5.6 MB 3.9 MB/s eta 0:00:02
   ---------------------- ----------------- 3.1/5.6 MB 6.6 MB/s eta 0:00:01
   ------------------------------------- -- 5.2/5.6 MB 7.6 MB/s eta 0:00:01
   ---------------------------------------- 5.6/5.6 MB 7.4 MB/s eta 0:00:00
Downloading pytesseract-

Import Libraries

In [3]:
# Import pdfplumber for extracting text from PDF files
import pdfplumber
# Import requests for fetching PDFs from URLs
import requests
# Import Google Gemini AI for pitch deck analysis
import google.generativeai as genai
# Import json for handling JSON data from API responses
import json
# Import BytesIO for handling binary data like PDFs in memory
from io import BytesIO
# Import pytesseract for OCR text extraction from images
import pytesseract
# Import PIL's Image module for image processing in OCR
from PIL import Image
# Import re for regular expression text preprocessing
import re
# Import os for operating-system utilities (environment variables, paths)
import os

print("Libraries imported successfully.")

Libraries imported successfully.


Configure Gemini API and Define Constants

In [4]:
# Configure the Gemini API from the environment instead of embedding the key in the notebook.
# SECURITY: never commit API keys; any key that was ever stored in this notebook
# should be treated as leaked and revoked.
_gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    # Fail fast with an actionable message rather than with an opaque API error later on
    raise RuntimeError(
        "Gemini API key not found. Set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
        "environment variable before running this notebook."
    )
genai.configure(api_key=_gemini_api_key)
# Define the specific Gemini model to use for text generation
GEMINI_MODEL = "models/gemini-1.5-flash-001-tuning"

# Define default weights for each pitch deck section for scoring
DEFAULT_WEIGHTS = {
    "Problem": 0.15, "Solution": 0.20, "Market": 0.20,
    "Business Model": 0.10, "Financials": 0.10, "Team": 0.25
}
# List of pitch deck sections to analyze
SECTIONS = list(DEFAULT_WEIGHTS.keys())
# Define evaluation criteria for each section for Gemini analysis
CRITERIA = {
    "Problem": "Clarity of the issue, significance, urgency of need",
    "Solution": "Clarity, effectiveness, uniqueness, feasibility",
    "Market": "Market size, growth potential, target audience definition, competitive landscape",
    "Business Model": "Revenue streams, pricing clarity, scalability, sales strategy",
    "Financials": "Realism of projections, funding needs, use of funds, financial clarity",
    "Team": "Relevant experience, skills, track record, team cohesion"
}
# Create a copy of weights to use in analysis (can be modified if needed)
WEIGHTS = DEFAULT_WEIGHTS.copy()

# The total score is a weighted sum scaled to 100, so the weights must add up to 1
assert abs(sum(WEIGHTS.values()) - 1.0) < 1e-9, "Section weights must sum to 1"
# Every section needs evaluation criteria, otherwise the evaluation prompt cannot be built
assert set(CRITERIA) == set(SECTIONS), "CRITERIA and SECTIONS must cover the same sections"

print("Gemini API configured and constants defined.")

Gemini API configured and constants defined.


Text Preprocessing Function

In [5]:
# Define a function to preprocess and clean extracted text
def preprocess_text(text):
    """Clean raw text extracted from a PDF before it is sent to the model.

    Steps, in order:
      1. Remove "Page N" markers.
      2. Remove list numbering at the start of a line ("1. Text", "2) Text")
         and lines that consist only of a number (bare page/slide numbers).
      3. Remove special characters, keeping word characters, whitespace,
         periods, commas and hyphens.
      4. Collapse every run of whitespace into a single space and trim.

    The line-oriented rules run *before* whitespace is collapsed; once the
    newlines are gone a ``re.MULTILINE`` pattern can only match at the very
    start of the document.

    Args:
        text: Raw extracted text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned, single-line string (``""`` for empty input).
    """
    if not text:
        return ""
    # Remove page markers (e.g., "Page 1")
    text = re.sub(r'Page \d+', '', text)
    # Remove list numbering at line starts; punctuation is required so that wrapped
    # lines beginning with a real figure ("5 million users") are left intact
    text = re.sub(r'^[ \t]*\d+[.)][ \t]+', '', text, flags=re.MULTILINE)
    # Remove lines that contain nothing but a number (typical bare page/slide numbers)
    text = re.sub(r'^[ \t]*\d+[ \t]*$', '', text, flags=re.MULTILINE)
    # Remove special characters except letters, numbers, spaces, periods, commas, and hyphens
    text = re.sub(r'[^\w\s.,-]', '', text)
    # Collapse whitespace last so the removals above cannot leave doubled or edge spaces
    return re.sub(r'\s+', ' ', text).strip()

PDF Text Extraction Function

In [6]:
# Define a function to extract text from a PDF file
def extract_pdf_text(pdf_file):
    """Extract and clean the text of a PDF, falling back to OCR when needed.

    The PDF is first parsed with pdfplumber. If parsing yields no text, each
    page is rendered to an image and read with pytesseract.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts (this notebook passes a
            ``BytesIO`` holding the PDF bytes).

    Returns:
        The preprocessed text, or ``None`` when no text could be extracted or
        an error occurred (the error is printed, not raised).
    """
    try:
        # Keep a single handle open for both parsing and the OCR fallback so the
        # document is not reopened (and leaked) a second time
        with pdfplumber.open(pdf_file) as pdf:
            # Extract text from each page and join with newlines
            text = "\n".join(page.extract_text() or "" for page in pdf.pages)
            # If no text is extracted, use OCR as a fallback
            if not text.strip():
                print("No text extracted via parsing. Trying OCR...")
                ocr_pages = []
                for page in pdf.pages:
                    # Render the page at high resolution for better OCR accuracy
                    img = page.to_image(resolution=300).original
                    ocr_pages.append(pytesseract.image_to_string(img))
                text = "\n".join(ocr_pages).strip()
        if not text:
            return None
        # Cleaning can leave nothing behind; report that as "no text" as well
        cleaned = preprocess_text(text)
        return cleaned or None
    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with None
        print(f"Text extraction failed: {e}")
        return None

Fetch PDF from URL Function

In [7]:
# Define a function to fetch a PDF from a URL and extract its text
def fetch_pdf_from_url(url):
    """Download a PDF from ``url`` and return its extracted text.

    Google Drive "view" links and GitHub "blob" links are rewritten to their
    direct-download form before the request is made.

    Args:
        url: Location of the PDF.

    Returns:
        Whatever ``extract_pdf_text`` returns for the downloaded bytes, or
        ``None`` when the HTTP request fails (the error is printed).
    """
    is_drive_view = "drive.google.com/file/d/" in url and "/view" in url
    is_github_blob = "github.com" in url and "/blob/" in url

    if is_drive_view:
        # The file id is the path segment that follows "/d/"
        drive_id = url.split("/d/")[1].split("/")[0]
        url = f"https://drive.google.com/uc?export=download&id={drive_id}"
    elif is_github_blob:
        # Point at the raw file instead of the HTML page that wraps it
        url = url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")

    try:
        resp = requests.get(url, timeout=10, stream=True)
        # Turn HTTP error statuses into exceptions handled below
        resp.raise_for_status()
        # Hand the downloaded bytes to the extractor as an in-memory file
        return extract_pdf_text(BytesIO(resp.content))
    except requests.RequestException as e:
        print(f"Failed to fetch PDF: {e}")
        return None

Extract Text from Uploaded File Function

In [8]:
# Define a function to extract text from a local PDF file
def extract_text_from_uploaded_file(file_path):
    """Read a local PDF file and return its extracted text.

    Args:
        file_path: Path of the PDF on disk.

    Returns:
        The text returned by ``extract_pdf_text``, or ``None`` when the file
        cannot be read (missing file, permission problem, ...). Returning
        ``None`` instead of raising matches the other loaders in this
        notebook, whose callers test the result for truthiness.
    """
    try:
        # Read the whole file into memory so the extractor works on a BytesIO
        with open(file_path, 'rb') as f:
            pdf_file = BytesIO(f.read())
    except OSError as e:
        print(f"Failed to read file: {e}")
        return None
    # Extract and return text from the PDF
    return extract_pdf_text(pdf_file)

Section Extraction Function (Gemini API)

In [9]:
# Define a function to extract pitch deck sections using Gemini API
def extract_sections_gemini(text):
    """Ask Gemini to split the pitch deck text into the configured sections.

    Args:
        text: Cleaned pitch deck text.

    Returns:
        A dict that contains every name in ``SECTIONS`` mapped to a string:
        the extracted text, or ``"Missing"`` when the model did not provide
        it. Any extra keys returned by the model are kept. If the call fails
        or the reply cannot be parsed, every section is ``"Missing"``.
    """
    all_missing = {section: "Missing" for section in SECTIONS}
    # Entries are comma-separated so the example shown to the model is itself valid JSON
    format_lines = ",\n".join(f'  "{section}": "<Extracted text or Missing>"' for section in SECTIONS)
    prompt = (
        f"Extract text for these sections from the pitch deck: {', '.join(SECTIONS)}.\n"
        "Return a valid JSON object in this format:\n"
        "```json\n"
        "{\n" + format_lines + "\n"
        "}\n"
        "```\n"
        "Pitch Deck Content:\n" + text
    )
    try:
        # Initialize the Gemini model and generate a response for the prompt
        model = genai.GenerativeModel(GEMINI_MODEL)
        response = model.generate_content(prompt)
        raw = response.text.strip()
        # Prefer a fenced block (with or without the "json" tag); otherwise use the whole reply
        fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', raw, re.DOTALL)
        candidate = fenced.group(1) if fenced else raw
        # Parse only the outermost {...} so stray text around the object is ignored
        start, end = candidate.find("{"), candidate.rfind("}")
        if start == -1 or end < start:
            return all_missing
        parsed = json.loads(candidate[start:end + 1])
        if not isinstance(parsed, dict):
            return all_missing
        sections = dict(parsed)
        for section in SECTIONS:
            value = parsed.get(section)
            if value is None:
                value = ""
            elif not isinstance(value, str):
                # Lists/objects/numbers are serialized so downstream string handling cannot fail
                value = json.dumps(value, ensure_ascii=False)
            value = value.strip()
            is_missing = not value or value.lower() == "missing"
            sections[section] = "Missing" if is_missing else value
        return sections
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to "Missing" everywhere
        print(f"Section extraction failed: {e}")
        return all_missing

Section Evaluation Function (Gemini API)

In [10]:
# Define a function to evaluate a pitch deck section using Gemini API
def evaluate_section_gemini(section_name, section_text):
    """Score one pitch deck section with Gemini.

    Args:
        section_name: A key of ``CRITERIA``.
        section_text: Text of the section. ``None``, an empty/blank string or
            the literal ``"Missing"`` are all treated as a missing section.

    Returns:
        A ``(score, feedback)`` tuple. ``score`` is an integer clamped to
        0-10; ``feedback`` is a human-readable string. A missing section, an
        unparseable reply or an API error all yield a score of 0 with an
        explanatory message.
    """
    # Normalize the input first so the prompt concatenation below cannot raise on None/non-strings
    if not isinstance(section_text, str):
        section_text = "" if section_text is None else str(section_text)
    section_text = section_text.strip()
    # Handle missing sections with a default score and feedback
    if not section_text or section_text == "Missing":
        return 0, "This section is missing. Consider adding it to strengthen your pitch."

    # Create a prompt instructing Gemini to evaluate the section
    prompt = (
        f"Evaluate the '{section_name}' section based on: {CRITERIA[section_name]}.\n"
        "Return a valid JSON object in this format:\n"
        "```json\n"
        "{\n"
        '  "score": <integer between 0-10>,\n'
        '  "strengths": "<Key strengths>",\n'
        '  "weaknesses": "<Areas needing improvement>",\n'
        '  "suggestions": "<Content improvements or additional data needed>"\n'
        "}\n"
        "```\n"
        "Section Content:\n" + section_text
    )
    try:
        # Initialize the Gemini model and generate a response for the prompt
        model = genai.GenerativeModel(GEMINI_MODEL)
        response = model.generate_content(prompt)
        raw = response.text.strip()
        # Prefer a fenced block (with or without the "json" tag); otherwise use the whole reply
        fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', raw, re.DOTALL)
        candidate = fenced.group(1) if fenced else raw
        # Parse only the outermost {...} so stray text around the object is ignored
        start, end = candidate.find("{"), candidate.rfind("}")
        result = json.loads(candidate[start:end + 1]) if start != -1 and end > start else None
        if not isinstance(result, dict):
            # Return default values if no JSON object is found
            return 0, "Evaluation failed: Invalid response format."
        # The model may answer with "8", 7.5 or an out-of-range value; coerce and clamp
        # so the weighted sum computed from this score stays numeric and within 0-10
        try:
            score = int(round(float(result.get("score", 0))))
        except (TypeError, ValueError, OverflowError):
            score = 0
        score = max(0, min(10, score))
        # Format feedback string with strengths, weaknesses, and suggestions
        feedback = f"Strengths: {result.get('strengths', 'N/A')}\nWeaknesses: {result.get('weaknesses', 'N/A')}\nSuggestions: {result.get('suggestions', 'N/A')}"
        return score, feedback
    except Exception as e:
        # Deliberate best-effort: report the problem and return a neutral result
        print(f"Evaluation failed for {section_name}: {e}")
        return 0, "Evaluation failed due to API error."

Pitch Deck Analysis Function

In [11]:
# Define a function to analyze the entire pitch deck
def analyze_pitch_deck(text):
    """Run the full analysis: split the deck into sections, score each, and aggregate.

    Args:
        text: Cleaned pitch deck text.

    Returns:
        ``(total_score, section_scores, feedbacks)`` where ``total_score`` is
        the weighted sum of the per-section scores scaled to 100 and the two
        dicts are keyed by section name. ``(None, None, None)`` is returned
        when ``text`` is empty.
    """
    # Nothing to analyze
    if not text:
        return None, None, None

    extracted = extract_sections_gemini(text)

    # Evaluate every section once, in SECTIONS order; each value is a (score, feedback) pair
    evaluations = {
        name: evaluate_section_gemini(name, extracted.get(name, "Missing"))
        for name in SECTIONS
    }
    section_scores = {name: pair[0] for name, pair in evaluations.items()}
    feedbacks = {name: pair[1] for name, pair in evaluations.items()}

    # Each score is out of 10; weight it, add everything up, then scale to 100
    weighted_parts = [section_scores[name] / 10 * WEIGHTS[name] for name in SECTIONS]
    total_score = sum(weighted_parts) * 100

    return total_score, section_scores, feedbacks

Execute Analysis (Local File Example)

In [12]:
# Example: Analyze a local PDF file
# Set the PITCH_DECK_PDF environment variable to analyze a different file without editing this cell.
# NOTE(review): the default below is a machine-specific absolute path; replace it with your own.
pdf_path = os.environ.get("PITCH_DECK_PDF", "C:/Users/bumba/work/task/AI_Powered_Financial_Management.pdf")
# Define the result names up front so later cells can test them even when extraction fails
total_score, section_scores, feedbacks = None, None, None
# Extract text from the PDF
text = extract_text_from_uploaded_file(pdf_path)
if text:
    # Analyze the pitch deck
    total_score, section_scores, feedbacks = analyze_pitch_deck(text)
    if total_score is not None:
        # Print the total pitch score
        print(f"Pitch Score: {total_score:.2f}/100")
        # Print detailed feedback for each section
        for section in SECTIONS:
            print(f"\n{section}:")
            print(f"Score: {section_scores[section]}/10")
            print(f"Feedback:\n{feedbacks[section]}")
    else:
        print("Analysis failed.")
else:
    print("Failed to extract text from PDF.")

Pitch Score: 65.00/100

Problem:
Score: 7/10
Feedback:
Strengths: Clearly identifies the problem (financial management challenges for small businesses), provides context with examples (cash flow issues, unexpected expenses, poor planning), and highlights the limitations of traditional tools (complexity, cost, expertise needed).
Weaknesses: The section lacks specific data to quantify the extent of the problem. While it mentions challenges, it doesn't provide concrete figures on how many small businesses struggle, the financial impact, or the percentage who lack the necessary expertise.
Suggestions: Include statistics or data points to illustrate the magnitude of the problem. For example, mention the percentage of small businesses that fail due to financial mismanagement or the average financial loss incurred by small businesses due to poor financial planning. Also, consider mentioning specific examples of the difficulties small businesses face in using traditional accounting tools (e.g.

Save Results to File

In [13]:
# Optional: Save analysis results to a text file
if total_score is not None:
    # Collect the report pieces and join them once instead of growing a string in a loop
    report_parts = [
        "Pitch Deck Analysis Results\n\n",
        f"Total Pitch Score: {total_score:.2f}/100\n\n",
        "Detailed Feedback:\n",
    ]
    for section in SECTIONS:
        report_parts.append(f"\n{section}:\n")
        report_parts.append(f"Score: {section_scores[section]}/10\n")
        report_parts.append(f"{feedbacks[section]}\n")
        report_parts.append("-" * 50 + "\n")
    output_content = "".join(report_parts)

    # Write as UTF-8 explicitly: model feedback may contain characters that the
    # platform default encoding (e.g. cp1252 on Windows) cannot represent
    with open("pitch_deck_analysis_results.txt", "w", encoding="utf-8") as f:
        f.write(output_content)
    print("Results saved to 'pitch_deck_analysis_results.txt'")

Results saved to 'pitch_deck_analysis_results.txt'
