# for slanting text

In [72]:
import fitz  # PyMuPDF
import json
import math

def extract_pdf_data(pdf_path, output_json):
    doc = fitz.open(pdf_path)
    extracted_data = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        page_data = {
            "page_number": page_num + 1,
            "width": page.rect.width,
            "height": page.rect.height,
            "text_blocks": []
        }

        text_instances = page.get_text("dict")
        
        for block in text_instances["blocks"]:
            block_data = {"lines": []}
            
            for line in block["lines"]:
                line_data = {"spans": []}
                
                # Extract the direction vector (cosine, sine)
                direction = line.get("dir", (1, 0))
                cosine, sine = direction

                # Calculate the rotation angle in radians
                angle_rad = math.atan2(sine, cosine)

                # Convert the angle to degrees
                #angle_deg = math.degrees(angle_rad)
                
                
                
                angle_deg = -math.degrees(angle_rad)  # Invert the angle


                for span in line["spans"]:
                    span_data = {
                        "text": span["text"],
                        "font": span["font"],
                        "font_size": span["size"],
                        "bbox": span["bbox"],  # (x0, y0, x1, y1)
                        "rotation": angle_deg,  # Rotation angle in degrees
                        "color": span["color"]
                    }
                    line_data["spans"].append(span_data)
                
                block_data["lines"].append(line_data)
            
            page_data["text_blocks"].append(block_data)
        
        extracted_data.append(page_data)

    # Save to JSON file
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(extracted_data, f, indent=4)

    print(f"Data extracted and saved to {output_json}")

# Run extraction
extract_pdf_data("Comic _Ghosts Book 1 Excerpt-6.pdf", "extracted_data.json")


Data extracted and saved to extracted_data.json


# without slant

##### the following is the best font processing part

In [89]:
import fitz  # PyMuPDF
import json
import math

def extract_pdf_data(pdf_path, output_json):
    doc = fitz.open(pdf_path)
    extracted_data = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        page_data = {
            "page_number": page_num + 1,
            "width": page.rect.width,
            "height": page.rect.height,
            "text_blocks": []
        }

        text_instances = page.get_text("dict")

        for block in text_instances["blocks"]:
            block_data = {"lines": []}

            # ✅ Check if "lines" key exists before accessing
            if "lines" not in block:
                continue  # Skip blocks that don't contain text lines

            for line in block["lines"]:
                line_data = {"spans": []}

                for span in line["spans"]:
                    # ✅ Extract text rotation and invert the slant
                    if "dir" in span:  # Check if direction vector exists
                        cosine, sine = span["dir"]
                        angle_rad = math.atan2(sine, cosine)
                        angle_deg = -math.degrees(angle_rad)  # Inverted angle
                    else:
                        angle_deg = 0  # Default to 0 if no direction found

                    span_data = {
                        "text": span["text"],
                        "font": span["font"],
                        "font_size": span["size"]-0.5,
                        "bbox": span["bbox"],  # (x0, y0, x1, y1)
                        "rotation": angle_deg,  # Use inverted angle
                        "color": span["color"]
                    }
                    line_data["spans"].append(span_data)

                block_data["lines"].append(line_data)

            page_data["text_blocks"].append(block_data)

        extracted_data.append(page_data)

    # Save to JSON file
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(extracted_data, f, indent=4)

    print(f"✅ Data extracted and saved to {output_json}")

# Run extraction
#extract_pdf_data("SBI Innovative Opportunities Fund_One Pager (2) (2).pdf", "extracted_data.json")
extract_pdf_data("Aatmanirbhar Org.pdf", "extracted_data.json")


✅ Data extracted and saved to extracted_data.json


#### extracting font color and determining which text is bold or not

In [96]:
import fitz  # PyMuPDF
import json
import math

def extract_pdf_data(pdf_path, output_json):
    doc = fitz.open(pdf_path)
    extracted_data = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        page_data = {
            "page_number": page_num + 1,
            "width": page.rect.width,
            "height": page.rect.height,
            "text_blocks": []
        }

        text_instances = page.get_text("dict")

        for block in text_instances["blocks"]:
            block_data = {"lines": []}

            # ✅ Check if "lines" key exists before accessing
            if "lines" not in block:
                continue  # Skip non-text blocks (like images)

            for line in block["lines"]:
                line_data = {"spans": []}

                for span in line["spans"]:
                    # ✅ Extract text rotation and invert the slant
                    if "dir" in span:  # Check if direction vector exists
                        cosine, sine = span["dir"]
                        angle_rad = math.atan2(sine, cosine)
                        angle_deg = -math.degrees(angle_rad)  # Inverted angle
                    else:
                        angle_deg = 0  # Default to 0 if no direction found

                    # ✅ Extract text color and convert to RGB
                    color_int = span["color"]
                    r = (color_int >> 16) & 0xFF  # Extract red
                    g = (color_int >> 8) & 0xFF   # Extract green
                    b = color_int & 0xFF          # Extract blue
                    text_color = (r, g, b)        # Store as RGB tuple

                    # ✅ Check if text is bold
                    is_bold = "bold" in span["font"].lower()  # Case-insensitive check

                    span_data = {
                        "text": span["text"],
                        "font": span["font"],
                        "font_size": span["size"]-0.5,
                        "bbox": span["bbox"],  # (x0, y0, x1, y1)
                        "rotation": angle_deg,  # Use inverted angle
                        "color": text_color,  # RGB format
                        "bold": is_bold  # Boolean (True if bold, else False)
                    }
                    line_data["spans"].append(span_data)

                block_data["lines"].append(line_data)

            page_data["text_blocks"].append(block_data)

        extracted_data.append(page_data)

    # Save to JSON file
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(extracted_data, f, indent=4)

    print(f"✅ Data extracted and saved to {output_json}")

# Run extraction
#extract_pdf_data("SBI Innovative Opportunities Fund_One Pager (2) (2).pdf", "extracted_data.json")
extract_pdf_data("Aatmanirbhar Org.pdf", "extracted_data.json")


✅ Data extracted and saved to extracted_data.json


# Using  PyMyPDF

# reconstruction of pdf

In [19]:
# import fitz  # PyMuPDF
# import json

# def reconstruct_pdf_from_json(json_path, output_pdf):
#     with open(json_path, "r", encoding="utf-8") as f:
#         data = json.load(f)

#     doc = fitz.open()

#     for page_data in data:
#         page = doc.new_page(width=page_data["width"], height=page_data["height"])

#         for text_block in page_data["text_blocks"]:
#             for line in text_block["lines"]:
#                 for span in line["spans"]:
#                     text = span["text"]
#                     bbox = span["bbox"]  # [x0, y0, x1, y1]
#                     font_size = span["font_size"]
#                     font_name = span.get("font", "helv")  # Default to Helvetica if font not found
#                     rotation = span.get("rotation", 30)

#                     x0, y0, x1, y1 = bbox

#                     # Calculate the bottom-left corner for the text insertion
#                     insertion_point = fitz.Point(x0, y1)

#                     # Insert text with rotation
#                     page.insert_text(
#                         insertion_point,
#                         text,
#                         #fontsize=font_size,
#                         #fontname=font_name,
#                         rotate=rotation
#                     )

#     doc.save(output_pdf)
#     print(f"✅ Reconstructed PDF saved as {output_pdf}")

# # Example usage
# reconstruct_pdf_from_json("extracted_data.json", "reconstructed.pdf")


✅ Reconstructed PDF saved as reconstructed.pdf


In [24]:
# import fitz  # PyMuPDF
# import json

# def reconstruct_pdf_from_json(json_path, output_pdf):
#     with open(json_path, "r", encoding="utf-8") as f:
#         data = json.load(f)

#     doc = fitz.open()

#     for page_data in data:
#         page = doc.new_page(width=page_data["width"], height=page_data["height"])

#         for text_block in page_data["text_blocks"]:
#             for line in text_block["lines"]:
#                 for span in line["spans"]:
#                     text = span["text"]
#                     bbox = span["bbox"]  # [x0, y0, x1, y1]
#                     font_size = span["font_size"]
#                     font_name = span.get("font", "helv")  # Default to Helvetica if font not found

#                     x0, y0, x1, y1 = bbox

#                     # Calculate the insertion point (bottom-left corner of the bounding box)
#                     insertion_point = fitz.Point(x0, y1)

#                     # Create a rotation matrix for 45 degrees
#                     rotation_matrix = fitz.Matrix(45)

#                     # Insert text with 45-degree rotation using the morph parameter
#                     page.insert_text(
#                         insertion_point,
#                         text,
#                         #fontsize=font_size,
#                         #fontname=font_name,
#                         morph=(insertion_point, rotation_matrix)
#                     )

#     doc.save(output_pdf)
#     print(f"✅ Reconstructed PDF saved as {output_pdf}")

# # Example usage
# reconstruct_pdf_from_json("extracted_data.json", "reconstructed.pdf")


✅ Reconstructed PDF saved as reconstructed.pdf


In [78]:
import fitz  # PyMuPDF
import json

def reconstruct_pdf_from_json(json_path, output_pdf):
    # Load extracted data from JSON
    with open(json_path, "r", encoding="utf-8") as f:
        extracted_data = json.load(f)

    # Create a new PDF document
    doc = fitz.open()

    # Iterate through each page's data
    for page_data in extracted_data:
        # Create a new page with the same dimensions
        page = doc.new_page(width=page_data["width"], height=page_data["height"])

        # Iterate through text blocks
        for block in page_data["text_blocks"]:
            # Iterate through lines
            for line in block["lines"]:
                # Iterate through spans
                for span in line["spans"]:
                    text = span["text"]
                    font_name = span["font"]
                    font_size = span["font_size"]
                    x0, y0, x1, y1 = span["bbox"]
                    rotation = span.get("rotation", 0)

                    # Calculate the insertion point (bottom-left corner of the bbox)
                    insert_point = fitz.Point(x0, y1)

                    # Create a transformation matrix for rotation around the insertion point
                    transform_matrix = fitz.Matrix(rotation)

                    # Insert the text with the transformation
                    #page.insert_text(insert_point, text, fontsize=font_size, fontname=font_name, morph=(insert_point, transform_matrix))

                    page.insert_text(
                        insert_point,
                        text,
                        fontsize=font_size,
                        #fontname=font_name,
                        morph=(insert_point, transform_matrix)
                    )
    # Save the reconstructed PDF
    doc.save(output_pdf)
    print(f"✅ Reconstructed PDF saved as {output_pdf}")

# Example usage
reconstruct_pdf_from_json("extracted_data.json", "reconstructed.pdf")


✅ Reconstructed PDF saved as reconstructed.pdf


# Extracting image boxes and structure

## preserve struct a little better sol

In [81]:
import fitz  # PyMuPDF

def remove_text_from_pdf(input_pdf, output_pdf):
    # Open the PDF
    doc = fitz.open(input_pdf)
    
    # Iterate through each page
    for page_num in range(len(doc)):
        page = doc[page_num]
        
        # Get all text blocks on the page
        text_instances = page.get_text("dict")
        
        # Iterate through each block
        for block in text_instances["blocks"]:
            if "lines" in block:  # This block contains text
                # Get the bounding box of the text block
                rect = fitz.Rect(block["bbox"])
                
                # Create a redaction annotation over the text block
                page.add_redact_annot(rect)
        
        # Apply the redactions
        page.apply_redactions()
    
    # Save the modified PDF
    doc.save(output_pdf)
    print(f"✅ Text removed. New PDF saved as: {output_pdf}")

# Example usage
#remove_text_from_pdf("Comic _Ghosts Book 1 Excerpt-6.pdf", "comic_no_text.pdf")
#remove_text_from_pdf("SBI Innovative Opportunities Fund_One Pager (2) (2).pdf", "sbi_output_no_text.pdf")
remove_text_from_pdf("Aatmanirbhar Org.pdf", "sbi_output_no_text.pdf")

✅ Text removed. New PDF saved as: sbi_output_no_text.pdf


# mapping 

In [90]:
import fitz  # PyMuPDF
import json

def overlay_text_on_pdf(input_pdf, json_path, output_pdf):
    # Load extracted text data
    with open(json_path, "r", encoding="utf-8") as f:
        extracted_data = json.load(f)

    # Open the existing PDF (output_no_text.pdf)
    doc = fitz.open(input_pdf)

    for page_index, page_data in enumerate(extracted_data):
        if page_index >= len(doc):  # Ensure we don't go out of bounds
            break

        page = doc[page_index]  # Get the existing page instead of creating a new one

        for block in page_data["text_blocks"]:
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"]
                    font_size = span["font_size"]
                    font_name = span.get("font", "helv")  # Default to Helvetica if missing
                    x0, y0, x1, y1 = span["bbox"]
                    rotation = span.get("rotation", 0)  # Extract rotation

                    # Calculate the insertion point (bottom-left corner of the bounding box)
                    insert_point = fitz.Point(x0, y1)

                    # Apply the extracted rotation angle
                    transform_matrix = fitz.Matrix(rotation)

                    # Insert text on the existing page with transformation
                    page.insert_text(
                        insert_point,
                        text,
                        fontsize=font_size,
                        #fontname=font_name,
                        morph=(insert_point, transform_matrix)
                    )

    # Save the modified PDF
    doc.save(output_pdf)
    print(f"✅ Text successfully mapped onto {output_pdf}")

# Example usage
overlay_text_on_pdf("sbi_output_no_text.pdf", "extracted_data.json", "final_output.pdf")


✅ Text successfully mapped onto final_output.pdf


#### the new extracting is mapped (including the font color and boldness)

In [97]:
import fitz  # PyMuPDF
import json

def overlay_text_on_pdf(input_pdf, json_path, output_pdf):
    # Load extracted text data
    with open(json_path, "r", encoding="utf-8") as f:
        extracted_data = json.load(f)

    # Open the existing PDF (output_no_text.pdf)
    doc = fitz.open(input_pdf)

    for page_index, page_data in enumerate(extracted_data):
        if page_index >= len(doc):  # Ensure we don't go out of bounds
            break

        page = doc[page_index]  # Get the existing page instead of creating a new one

        for block in page_data["text_blocks"]:
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"]
                    font_size = span["font_size"]
                    font_name = span.get("font", "helv")  # Default to Helvetica if missing
                    x0, y0, x1, y1 = span["bbox"]
                    rotation = span.get("rotation", 0)  # Extract rotation
                    
                    # Extract font color
                    text_color = span.get("color", (0, 0, 0))  # Default to black
                    r, g, b = text_color  # Extract individual RGB values
                    
                    # Check if the font is bold
                    is_bold = span.get("bold", False)
                    if is_bold:
                        font_name += "-Bold"  # Append '-Bold' for bold fonts

                    # Calculate the insertion point (bottom-left corner of the bounding box)
                    insert_point = fitz.Point(x0, y1)

                    # Apply the extracted rotation angle
                    transform_matrix = fitz.Matrix(rotation)

                    # Insert text on the existing page with transformation
                    page.insert_text(
                        insert_point,
                        text,
                        fontsize=font_size,
                        #fontname=font_name,
                        color=(r / 255, g / 255, b / 255),  # Normalize RGB (0-1)
                        morph=(insert_point, transform_matrix)
                    )

    # Save the modified PDF
    doc.save(output_pdf)
    print(f"✅ Text successfully mapped onto {output_pdf}")

# Example usage
overlay_text_on_pdf("sbi_output_no_text.pdf", "extracted_data.json", "final_output.pdf")


✅ Text successfully mapped onto final_output.pdf


# markdown of problematic pdf

# Introducing aatmanirbhar SIP  

Aims to make you Aatmanirbhar to live the life you deserve  

Systematic Investment Plan  

![](https://cdn-mineru.openxlab.org.cn/extract/dc7c987f-552c-4156-941e-6f0e3af7b596/4a0d90beab56aae5c20ba1e9c173715f30e806d52de300d748dff6e3c4560453.jpg)  

![](https://cdn-mineru.openxlab.org.cn/extract/dc7c987f-552c-4156-941e-6f0e3af7b596/f5e18922858f1494da94876d34ac9cb7d1959e88c27b99f9286d704e99d5a336.jpg)  

## Illustration to explain concept of Aatmanirbhar SIP  

simran began her journey towards financial independence in 2003 with a monthly SIP of  30,000 in Nifty 500 TRI benchmark she stayed invested for 10 years consistently, irrespective of market conditions and accumulated a corpus of approx 68 Lakl  

![](https://cdn-mineru.openxlab.org.cn/extract/dc7c987f-552c-4156-941e-6f0e3af7b596/4fd6f84e71961a0ae8267a3a206d268a47a9d4bcce53366f79119d067dab0039.jpg)  

![](https://cdn-mineru.openxlab.org.cn/extract/dc7c987f-552c-4156-941e-6f0e3af7b596/f60feca5356145f50acb9d07a29b0d9eeb4a2a9bcc2c0bf4de286a6e66600c2c.jpg)  

![](https://cdn-mineru.openxlab.org.cn/extract/dc7c987f-552c-4156-941e-6f0e3af7b596/38e1987f62bc19beb77427007ed49c9f2cdabe6bd455df3ce6397722689cfe55.jpg)  

## SWP  

SimrandecidedtogowithSWPbecause it would provideher with passive income whilestillleavingherwith availablefunds.  

<html><body><table><tr><td>InvestingSince</td><td>01/04/2003</td></tr><tr><td>SIPPermonth</td><td>30,000</td></tr><tr><td>InvestingPeriod(Yrs)</td><td>10</td></tr><tr><td>InvestedAmount</td><td>36,00,000</td></tr><tr><td>MarketValue(Apr1,2013)</td><td>68,87,718</td></tr><tr><td>SIPReturns(%XIRR)</td><td>12.40%</td></tr></table></body></html>  

![](https://cdn-mineru.openxlab.org.cn/extract/dc7c987f-552c-4156-941e-6f0e3af7b596/a89d3f51061d01430abdbd24f40a599dcbad81abe2ef9642e7404138a93b8159.jpg)  
Source:Bloomberg/axismfresearch.Dataason:30thMay,2023. Past performance may ormay not be sustainedinfuture.Thecalculation is a historicalcalculationofahypothetical 30000 monthlySIPinitiated on01/04/2003for10 years.Upon completion the corpusistransferredvia switchto NIFTY50 Hybrid CompositeDebt65:35Indexwith asubsequent systematicwithdrawal of $\yen300000$ forthenext10years.Valueatth end ofthe period as of 30th April 2023.Expenses forthe execution, maintenance ofthefund and taxation have been ignored in thiscalculation. Theabove calculationsareonlyforilustrationpurposesand are subject tomarketrisksbased oncorpus at the end of the investment period actualmarket returnsand periodicity a cashflows.Thisfeature doesnotinanywaygive assuranceoftheperformance of any ofthe Schemes of Axis Mutual Fund or provide any guarantee of withdrawals through SWP mode. Investors are advised to consult their investment/taxadvisors beforeinvesting.  

<html><body><table><tr><td rowspan="5">Withdrawalsince MonthlyPayout Marketvalueatthestartofwithdrawal</td><td>01/04/2013</td></tr><tr><td>30,000</td></tr><tr><td>68,87,718</td></tr><tr><td>10</td></tr><tr><td>36,00,000</td></tr><tr><td>TotalMoneyWithdrawn Balancecorpusafter10years ofmonthlypayout</td><td>1,44,03,464</td></tr></table></body></html>  

# pdfplumber

In [8]:
import pdfplumber
import json

def extract_text_info(pdf_path):
    extracted_data = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for block in page.extract_words():
                data = {
                    "left": block["x0"],
                    "top": block["top"],
                    "end_left": block["x1"],
                    "end_top": block["bottom"],
                    "total_texts": 1,
                    "is_bullet": False,
                    "type": "normal",
                    "text1": "",
                    "text": block["text"],
                    "font_family": "Unknown",  # pdfplumber doesn't extract font family
                    "font_size": 8.0,  # Adjust if needed
                    "font_color": 0,  # Requires extra processing
                    "font_color_hex": "#000000",
                    "font_style": "normal",
                    "is_vertical": False,
                    "angle": 0,
                    "superscript": False
                }
                extracted_data.append(data)
    
    return extracted_data

pdf_path = "Aatmanirbhar Org.pdf"  # Replace with the actual file path
data = extract_text_info(pdf_path)

# Save as JSON
with open("extracted_text.json", "w") as f:
    json.dump(data, f, indent=4)

print("Extraction complete. Data saved to extracted_text.json")


Extraction complete. Data saved to extracted_text.json


# MEET THE INNOVATORS WHO WILL POWER NEXT GEN INDIA  

![](https://cdn-mineru.openxlab.org.cn/extract/c0da4e40-b104-4816-9e2d-01cc8618cf3f/21bdc3896e18f5c84f4d7a94c16be36c00969f2aa6ff49378e2bdabf53d9ca84.jpg)  

नवाचार नए विचारों, प्रौद्योगिकियों और प्रक्रियाओं को प्रस्तुत करके विकास को गति देता है, जो परंपरागत मानदंडों को चुनौती देते हैं। जो कंपनियां नवाचार को अपनाती हैं, वे प्रतिस्पर्धात्मक बढ़त प्राप्त करती हैं और अपने क्षेत्रों के भविष्य को आकार देती हैं।


![](https://cdn-mineru.openxlab.org.cn/extract/c0da4e40-b104-4816-9e2d-01cc8618cf3f/d79ea84e2ea7f1df0dee5ddb9afe75ac45b9324507f6441077bbeb1551a2e2a5.jpg)  

## FACTORSINFLUENCINGINDIAINNOVATIONSTORY  

$\odot$ Booming Startup Ecosystem $\circledcirc$ Inherent Talent Pool OGovernment Initiatives  

$\odot$ Growing Consumer Market $\pmb{\odot}$ Strong Digital Infrastructure $\pmb{\odot}$ Strong Funding Ecosystem  

ndia's position in the Global Innovation Index\* has improved to $40^{\mathrm{th}}$ rank in 2023 from $81^{\mathrm{st}}$ in 2015. Due to the above favorable tailwinds, we believe ndia is currently at the cusp of an innovation cycle.  

Source: SBIMF Research.\* Published by the World Intellectual Property Organization  

## भारत की नवाचार कहानी को प्रभावित करने वाले कारक  

$\odot$ फलता-फूलता स्टार्टअप इकोसिस्टम \\  
$\circledcirc$ स्वाभाविक प्रतिभा पूल \\  

Oसरकारी पहल \\  
$\odot$ बढ़ता उपभोक्ता बाजार \\  

$\pmb{\odot}$ मजबूत डिजिटल अवसंरचना \\  
$\pmb{\odot}$ मजबूत वित्तीय पारिस्थितिकी तंत्र \\  

भारत की वैश्विक नवाचार सूचकांक\* में स्थिति 2015 में $81^{\mathrm{st}}$ स्थान से सुधार कर 2023 में $40^{\mathrm{th}}$ स्थान पर पहुंच गई है।  

उपरोक्त अनुकूल परिस्थितियों के कारण, हमें विश्वास है कि भारत वर्तमान में एक नवाचार चक्र के कगार पर है।  

स्रोत: SBIMF अनुसंधान।\* विश्व बौद्धिक संपदा संगठन द्वारा प्रकाशित  

## भारत की नवाचार कहानी को प्रभावित करने वाले कारक  

$\odot$ फलता-फूलता स्टार्टअप इकोसिस्टम \ \ $\circledcirc$ स्वाभाविक प्रतिभा पूल \\  

Oसरकारी पहल \ \ $\odot$ बढ़ता उपभोक्ता बाजार \\  

$\pmb{\odot}$ मजबूत डिजिटल अवसंरचना \ \ $\pmb{\odot}$ मजबूत वित्तीय पारिस्थितिकी तंत्र \\  

भारत की वैश्विक नवाचार सूचकांक\* में स्थिति 2015 में $81^{\mathrm{st}}$ स्थान से सुधार कर 2023 में $40^{\mathrm{th}}$ स्थान पर पहुंच गई है।  

उपरोक्त अनुकूल परिस्थितियों के कारण, हमें विश्वास है कि भारत वर्तमान में एक नवाचार चक्र के कगार पर है।  

स्रोत: SBIMF अनुसंधान।\* विश्व बौद्धिक संपदा संगठन द्वारा प्रकाशित  

## भारत की नवाचार कहानी को प्रभावित करने वाले कारक  

$\odot$ फलता-फूलता स्टार्टअप इकोसिस्टम \ \ $\circledcirc$ स्वाभाविक प्रतिभा पूल \\  

Oसरकारी पहल \ \ $\odot$ बढ़ता उपभोक्ता बाजार \\  

$\pmb{\odot}$ मजबूत डिजिटल अवसंरचना \ \ $\pmb{\odot}$ मजबूत वित्तीय पारिस्थितिकी तंत्र \\  

भारत की वैश्विक नवाचार सूचकांक\* में स्थिति 2015 में $81^{\mathrm{st}}$ स्थान से सुधार कर 2023 में $40^{\mathrm{th}}$ स्थान पर पहुंच गई है।  

उपरोक्त अनुकूल परिस्थितियों के कारण, हमें विश्वास है कि भारत वर्तमान में एक नवाचार चक्र के कगार पर है।  

स्रोत: SBIMF अनुसंधान।\* विश्व बौद्धिक संपदा संगठन द्वारा प्रकाशित  
 




## O INVESTMENTSTRATEGY  

![](https://cdn-mineru.openxlab.org.cn/extract/c0da4e40-b104-4816-9e2d-01cc8618cf3f/f7cb8897434809ed9a071735adeecf1023221f4a05a07176bf313e556f6b5159.jpg)  

<span>
<bold>Product / Service Innovators :<bold> Companies that develop new products or services or significantly invest in R&D for new innovations.   
They challenge existing markets or create entirely new categories.  

Process Innovators : Companies that innovate new processes, potentially disrupting existing business models and gaining market share through technological and process advancements.  

InnovationAdaptors $:$ Incumbent companies that adapt to innovative business models, products, or services within their industry, showing agility in response to emergent trends. These adaptive innovators may not necessarily overhaul their entire business model but exhibit innovative strategies in specific segments or verticals that has potential to meaningfully impact the business.  

Each category presents opportunities and rsks, which wil guide investment decisions. For detailed investmentstrategy please refer Scheme Information Document carefully.  

### PORTFOLIOCONSTRUCTIONAPPROACH  

·Min $80\%$ of net assets investing into companies falling into innovation theme buckets   
$\bullet$ Upto $35\%$ of net assets investing into global stocks aligned with the underlying theme   
$\bullet$ True to its label diversified portfolio investing across sectors & Market cap   
$\bullet$ Aims to have a portfolio of \~35-40 stocks with bottom-up stock selection approach Companies with long runway for growth, competitive advantage, potential for generating strong ROE & cashflows etc  

Further, to achieve diversification the Scheme may also invest residual net assets i.e. up to $20\%$ of the net assets in companies other than the companies following innovation theme This is based on the prevailing market conditions & current views and is subject to change within the limits of the SiD basis the fund manager's view.  

### O WHOCANINVESTINSBIINNOVATIVEOPPORTUNITIESFUND?  

![](https://cdn-mineru.openxlab.org.cn/extract/c0da4e40-b104-4816-9e2d-01cc8618cf3f/bea6b342ed91c5ae2031fafc4b8a8c6fae9c2e96b343f8072ee3a3b36d6da4a3.jpg)  

#  

Investors looking for a fund that consistently seeks out investment opportunities in upcoming trends and disruptive business ideas will find this fund true to label. Investors seeking capital appreciation through investments in forward-looking business ideas and a growth-oriented long-term portfolio 2 willfind this fund appealing. Investors seeking diversification through portfolio allocation across sectors and market caps, offering a unique avenue to tap into emerging trends and markets.  

### ABOUTSBIINNOVATIVEOPPORTUNITIESFUND  

#### Investment Objective:  

The investment objective of the scheme is to provide investors with opportunities for long term capital appreciation by investing in equity and equity related instruments of companies that seeks to benefit from adoption of innovative strategies & theme. However, there can be no assurance that the investment objective of the Scheme will be realized.  

· Fund Manager&: Mr. Prasad Padala · Category: Thematic · Minimum Application^: 50o0/- & in multiples of $\yen1$ thereafter · First Tier Banchmark Index: NIFTY 500 TRI $\bullet$ MinimumMonthly ${\mathsf{s l P}}^{\star}$ :?500/-& inmultiples of 1  

#### Exit Load:  

·ForOngoingbasis: $1\%$ of theapplicableNAv $\cdot$ If units purchased or switched in from another scheme of the fund are redeemed or switched out onorbefore1yearfromthedateof allotment. · NIL - If units purchased or switched in from another scheme of the fund are redeemed or switched out after 1 year from the date of allotment.  

For details, please refer to the Scheme Information Document (SiD). &Mr. Pradeep Kesavan is the dedicated fund manager for overseas securities. ^Additional Purchase: Rs. 1000 and in multiples of Re.1 thereafter. \*For detailed minimum amount of SIP across frequencies & number of installments, please refer to SID/KIM.  

## SBI INNOVATIVE OPPORTUNITIES FUND  

![](https://cdn-mineru.openxlab.org.cn/extract/c0da4e40-b104-4816-9e2d-01cc8618cf3f/a8f17133806929132e9e018f3f86a56f0071c7539257951d39851fdf380df56a.jpg)  

An open-ended equity scheme following the innovationtheme.  

This leaflet is for information purposes only and is not an offer to sell or a solicitation to buy any mutual fund units/securities. The views expressed herein are based on the basis of internal data, publicly available information & other sources believed to be reliable. Any calculations made are approximations meant as guidelines only, which need to be confirmed before relying on them. These views alone are not sufficient and should not be used for the development or implementation of an investment strategy. It should not be construed as investment advice to any party. All opinions and estimates included here constitute our view as of this date and are subject to change without notice. Neither SBl Funds Management Limited, SBI Mutual Fund nor any person connected with it, accepts any liability arising from the use of this information. The recipient of this material should rely on their investigations and take their own professional advice.  

# Contact your MFD/RIA Visit: www.sbimf.com Follow us: +×回 in  

Mutual Fund investments are subject to market risks, read all scheme related documents carefully.  

# miner u api

In [6]:
x = "eyJ0eXBlIjoiSldUIiwiYWxnIjoiSFM1MTIifQ.eyJqdGkiOiIyMTAwMjc3NSIsInJvbCI6IlJPTEVfUkVHSVNURVIiLCJpc3MiOiJPcGVuWExhYiIsImlhdCI6MTc0MTg2ODk5NSwiY2xpZW50SWQiOiJsa3pkeDU3bnZ5MjJqa3BxOXgydyIsInBob25lIjoiIiwidXVpZCI6ImM2YWExZjk2LTY5ZjMtNDNlMC1hYzcyLTU1MWM4YzA1Y2ZmNyIsImVtYWlsIjoiIiwiZXhwIjoxNzQzMDc4NTk1fQ.oSJ_jubwoX14Pji1gOi2kUaYCTv4GWRSmyNp4O5jkJ8jXG-wYFN_s03n5Iy8JOUGn9hVOuzVP4QZMdCBSHXIvA"

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


In [7]:
x

'eyJ0eXBlIjoiSldUIiwiYWxnIjoiSFM1MTIifQ.eyJqdGkiOiIyMTAwMjc3NSIsInJvbCI6IlJPTEVfUkVHSVNURVIiLCJpc3MiOiJPcGVuWExhYiIsImlhdCI6MTc0MTg2ODk5NSwiY2xpZW50SWQiOiJsa3pkeDU3bnZ5MjJqa3BxOXgydyIsInBob25lIjoiIiwidXVpZCI6ImM2YWExZjk2LTY5ZjMtNDNlMC1hYzcyLTU1MWM4YzA1Y2ZmNyIsImVtYWlsIjoiIiwiZXhwIjoxNzQzMDc4NTk1fQ.oSJ_jubwoX14Pji1gOi2kUaYCTv4GWRSmyNp4O5jkJ8jXG-wYFN_s03n5Iy8JOUGn9hVOuzVP4QZMdCBSHXIvA'

using the api key

In [17]:
import requests

url='https://mineru.net/api/v4/extract/task'
header = {
    'Content-Type':'application/json',
    "Authorization":'eyJ0eXBlIjoiSldUIiwiYWxnIjoiSFM1MTIifQ.eyJqdGkiOiIyMTAwMjc3NSIsInJvbCI6IlJPTEVfUkVHSVNURVIiLCJpc3MiOiJPcGVuWExhYiIsImlhdCI6MTc0MTg2OTQxNiwiY2xpZW50SWQiOiJsa3pkeDU3bnZ5MjJqa3BxOXgydyIsInBob25lIjoiIiwidXVpZCI6IjlmMWQ4ZmJiLTk5MzgtNDFiMy05MDZjLTcxNWYzNDFkN2U2YSIsImVtYWlsIjoiIiwiZXhwIjoxNzQzMDc5MDE2fQ.lIh5brncncJgioW3d_oyTgTy5PTOh7aNGlEP_uLgpFYCndSZYp0aqrnnnbdtO2WPPcKGyTp3BYCaUP_nMV5AAA'
}
data = {
    'url':'https://cdn-mineru.openxlab.org.cn/demo/example.pdf',
    'is_ocr':True,
    'enable_formula': False,
}

res = requests.post(url,headers=header,json=data)
print(res.status_code)
print(res.json())
print(res.json()["data"])

401
{'traceId': 'e7f6db99b052', 'msgCode': 'A0202', 'msg': 'user authenticate failed', 'data': None, 'success': False, 'total': 0}
None


eyJ0eXBlIjoiSldUIiwiYWxnIjoiSFM1MTIifQ.eyJqdGkiOiIyMTAwMjc3NSIsInJvbCI6IlJPTEVfUkVHSVNURVIiLCJpc3MiOiJPcGVuWExhYiIsImlhdCI6MTc0MTg2OTQxNiwiY2xpZW50SWQiOiJsa3pkeDU3bnZ5MjJqa3BxOXgydyIsInBob25lIjoiIiwidXVpZCI6IjlmMWQ4ZmJiLTk5MzgtNDFiMy05MDZjLTcxNWYzNDFkN2U2YSIsImVtYWlsIjoiIiwiZXhwIjoxNzQzMDc5MDE2fQ.lIh5brncncJgioW3d_oyTgTy5PTOh7aNGlEP_uLgpFYCndSZYp0aqrnnnbdtO2WPPcKGyTp3BYCaUP_nMV5AAA

In [8]:
import requests

url='https://mineru.net/api/v4/file-urls/batch'
header = {
    'Content-Type':'application/json',
    "Authorization":x
}
data = {
    "enable_formula": True,
    "language": "en",
    "layout_model":"doclayout_yolo",
    "enable_table": True,
    "files": [
        {"name":"demo.pdf", "is_ocr": True, "data_id": "abcd"}
    ]
}
file_path = r"demo.pdf"
try:
    response = requests.post(url,headers=header,json=data)
    if response.status_code == 200:
        result = response.json()
        print('response success. result:{}'.format(result))
        if result["code"] == 0:
            batch_id = result["data"]["batch_id"]
            urls = result["data"]["file_urls"]
            print('batch_id:{},urls:{}'.format(batch_id, urls))
            with open(file_path, 'rb') as f:
                res_upload = requests.put(urls[0], data=f)
            if res_upload.status_code == 200:
                print("upload success")
            else:
                print("upload failed")
        else:
            print('apply upload url failed,reason:{}'.format(result.msg))
    else:
        print('response not success. status:{} ,result:{}'.format(response.status_code, response))
except Exception as err:
    print(err)

response not success. status:401 ,result:<Response [401]>


# Tesseract ocr

In [1]:
# from google.cloud import vision
# import io
# import requests

# def google_vision_ocr(image_url):
#     """Extract text from an image using Google Cloud Vision API"""
#     client = vision.ImageAnnotatorClient()

#     # Download the image
#     response = requests.get(image_url)
#     image = vision.Image(content=response.content)

#     # Perform text detection
#     result = client.text_detection(image=image)
#     texts = result.text_annotations

#     extracted_text = texts[0].description if texts else "No text found"

#     print("Extracted Text:")
#     print(extracted_text)

# # 🔹 Call the function with your image URL
# image_url = "https://cdn-mineru.openxlab.org.cn/extract/c0da4e40-b104-4816-9e2d-01cc8618cf3f/21bdc3896e18f5c84f4d7a94c16be36c00969f2aa6ff49378e2bdabf53d9ca84.jpg"
# google_vision_ocr(image_url)


# Example

In [5]:
import json

# Given JSON objects
json_objects = [
    {
        "left": 227.0359000000003,
        "top": 475.2403999999999,
        "end_left": 275.6119000000004,
        "end_top": 483.2403999999999,
        "total_texts": 1,
        "is_bullet": False,
        "type": "normal",
        "text1": "",
        "text": "technologies,",
        "font_family": "Unknown",
        "font_size": 8.0,
        "font_color": 0,
        "font_color_hex": "#000000",
        "font_style": "normal",
        "is_vertical": False,
        "angle": 0,
        "superscript": False
    },
    {
        "left": 278.17430000000047,
        "top": 475.2403999999999,
        "end_left": 291.7583000000004,
        "end_top": 483.2403999999999,
        "total_texts": 1,
        "is_bullet": False,
        "type": "normal",
        "text1": "",
        "text": "and",
        "font_family": "Unknown",
        "font_size": 8.0,
        "font_color": 0,
        "font_color_hex": "#000000",
        "font_style": "normal",
        "is_vertical": False,
        "angle": 0,
        "superscript": False
    },
    {
        "left": 294.32150000000047,
        "top": 475.2403999999999,
        "end_left": 331.73750000000047,
        "end_top": 483.2403999999999,
        "total_texts": 1,
        "is_bullet": False,
        "type": "normal",
        "text1": "",
        "text": "processes",
        "font_family": "Unknown",
        "font_size": 8.0,
        "font_color": 0,
        "font_color_hex": "#000000",
        "font_style": "normal",
        "is_vertical": False,
        "angle": 0,
        "superscript": False
    }
]

# Given sentences
sentences = ["technologies, and processes", "some other sentence"]

# Step 1: Extract words from JSON
word_sequence = " ".join([obj["text"] for obj in json_objects])

# Step 2: Find which words belong to a sentence
matching_sentence = None
for sentence in sentences:
    if sentence in word_sequence:
        matching_sentence = sentence
        break

# Step 3: Create new JSON object using attributes of the first word
if matching_sentence:
    first_obj = json_objects[0]  # Take attributes from the first word object
    new_json_object = first_obj.copy()  # Clone it
    new_json_object["text"] = matching_sentence  # Replace text with the full sentence

    # Output result
    print(json.dumps(new_json_object, indent=4))


{
    "left": 227.0359000000003,
    "top": 475.2403999999999,
    "end_left": 275.6119000000004,
    "end_top": 483.2403999999999,
    "total_texts": 1,
    "is_bullet": false,
    "type": "normal",
    "text1": "",
    "text": "technologies, and processes",
    "font_family": "Unknown",
    "font_size": 8.0,
    "font_color": 0,
    "font_color_hex": "#000000",
    "font_style": "normal",
    "is_vertical": false,
    "angle": 0,
    "superscript": false
}


# Text returns


<span>
<bold>Product / Service Innovators :<bold> Companies that develop new products or services or significantly invest in R&D for new innovations.   
They challenge existing markets or create entirely new categories.  

Process Innovators : Companies that innovate new processes, potentially disrupting existing business models and gaining market share through technological and process advancements.  

InnovationAdaptors $:$ Incumbent companies that adapt to innovative business models, products, or services within their industry, showing agility in response to emergent trends. These adaptive innovators may not necessarily overhaul their entire business model but exhibit innovative strategies in specific segments or verticals that has potential to meaningfully impact the business.  

Each category presents opportunities and rsks, which wil guide investment decisions. For detailed investmentstrategy please refer Scheme Information Document carefully.  

### PORTFOLIOCONSTRUCTIONAPPROACH  

·Min $80\%$ of net assets investing into companies falling into innovation theme buckets   
$\bullet$ Upto $35\%$ of net assets investing into global stocks aligned with the underlying theme   
$\bullet$ True to its label diversified portfolio investing across sectors & Market cap   
$\bullet$ Aims to have a portfolio of \~35-40 stocks with bottom-up stock selection approach Companies with long runway for growth, competitive advantage, potential for generating strong ROE & cashflows etc  

Further, to achieve diversification the Scheme may also invest residual net assets i.e. up to $20\%$ of the net assets in companies other than the companies following innovation theme This is based on the prevailing market conditions & current views and is subject to change within the limits of the SiD basis the fund manager's view.  

# LLm processing

In [None]:
sentences = [
    "Product / Service Innovators: Companies that develop new products or services or significantly invest in R&D for new innovations. They challenge existing markets or create entirely new categories.",
    "Process Innovators: Companies that innovate new processes, potentially disrupting existing business models and gaining market share through technological and process advancements.",
    "Innovation Adaptors: Incumbent companies that adapt to innovative business models, products, or services within their industry, showing agility in response to emergent trends. These adaptive innovators may not necessarily overhaul their entire business model but exhibit innovative strategies in specific segments or verticals that have the potential to meaningfully impact the business.",
    "Each category presents opportunities and risks, which will guide investment decisions.",
    "For a detailed investment strategy, please refer to the Scheme Information Document carefully.",
    "Portfolio Construction Approach: At least 80% of net assets will be invested in companies falling into innovation theme buckets.",
    "Up to 35% of net assets will be invested in global stocks aligned with the underlying theme.",
    "The portfolio will be diversified, investing across sectors and market capitalization.",
    "The fund aims to maintain a portfolio of approximately 35-40 stocks with a bottom-up stock selection approach.",
    "Companies with a long runway for growth, competitive advantage, and potential for generating strong ROE and cash flows will be prioritized.",
    "To achieve diversification, the scheme may invest up to 20% of net assets in companies outside the innovation theme based on prevailing market conditions and fund manager discretion."
]


![](https://cdn-mineru.openxlab.org.cn/extract/c0da4e40-b104-4816-9e2d-01cc8618cf3f/d79ea84e2ea7f1df0dee5ddb9afe75ac45b9324507f6441077bbeb1551a2e2a5.jpg) 

In [None]:
import pytesseract
from PIL import Image
import requests
from io import BytesIO

# Function to perform OCR
def extract_text_from_image(image_url):
    # Download image from URL
    response = requests.get(image_url)
    image = Image.open(BytesIO(response.content))

    # Perform OCR
    extracted_text = pytesseract.image_to_string(image)

    print("Extracted Text:")
    print(extracted_text)

# Test with your image URL
image_url = "https://cdn-mineru.openxlab.org.cn/extract/c0da4e40-b104-4816-9e2d-01cc8618cf3f/d79ea84e2ea7f1df0dee5ddb9afe75ac45b9324507f6441077bbeb1551a2e2a5.jpg"
extract_text_from_image(image_url)

Automobiles

Electric Vehicles
Self Drive cars
Hybrid vehicles
Energy

Hydrogen
Battery Storage
Grid integration
Media & Entertainment

OTT
Digital Content & Ads
Music streaming
Industrials

Robots & Drones
3D printing
Nanotechnology
Financial Services

UPI
Block chain
Payment Aggregator
Consumption

Quick Commerce
Augmented reality
Omnichannel integration
Technology

Cloud computing
AI & IoT
Data centers
Healthcare

Medtech
E-Pharmacy
Biotechnology

In [7]:
data = [
    "Automobiles: Electric Vehicles, Self Drive cars, Hybrid vehicles",
    "Energy: Hydrogen, Battery Storage, Grid integration",
    "Media & Entertainment: OTT, Digital Content & Ads, Music streaming",
    "Industrials: Robots & Drones, 3D printing, Nanotechnology",
    "Financial Services: UPI, Block chain, Payment Aggregator",
    "Consumption: Quick Commerce, Augmented reality, Omnichannel integration",
    "Technology: Cloud computing, AI & IoT, Data centers",
    "Healthcare: Medtech, E-Pharmacy, Biotechnology"
]

print(data)


['Automobiles: Electric Vehicles, Self Drive cars, Hybrid vehicles', 'Energy: Hydrogen, Battery Storage, Grid integration', 'Media & Entertainment: OTT, Digital Content & Ads, Music streaming', 'Industrials: Robots & Drones, 3D printing, Nanotechnology', 'Financial Services: UPI, Block chain, Payment Aggregator', 'Consumption: Quick Commerce, Augmented reality, Omnichannel integration', 'Technology: Cloud computing, AI & IoT, Data centers', 'Healthcare: Medtech, E-Pharmacy, Biotechnology']
