In [9]:
from PIL import Image
import pytesseract
import fitz  # PyMuPDF
import pandas as pd
import re

In [17]:
def ocr_pdf_to_dataframe(pdf_path, dpi=300):
    """
    Extracts text from a scanned PDF and stores it in a pandas DataFrame.
    Args:
        pdf_path (str): Path to the scanned PDF file.
        dpi (int): Resolution for rendering PDF pages (higher DPI = better OCR).

    Returns:
        pd.DataFrame: A DataFrame with pages and their extracted content.
    """
    data = []  # Store page number and content

    with fitz.open(pdf_path) as pdf:
        for page_num in range(len(pdf)):
            # Convert the page to an image
            pix = pdf[page_num].get_pixmap(dpi=dpi)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            
            # Perform OCR on the image
            page_text = pytesseract.image_to_string(img)
            
            # Append to the data list
            data.append({"page": page_num + 1, "content": page_text})
    
    # Convert to DataFrame
    df = pd.DataFrame(data)
    return df


NameError: name 'data' is not defined

In [6]:
# Specify the path to your scanned PDF
pdf_path = "data/toc.pdf"  # Replace with your file path

# Extract text from the scanned PDF
df = ocr_pdf_to_dataframe(pdf_path)

# Display the first few rows of the DataFrame
df.head()



Unnamed: 0,page,content
0,1,CHAPTER 2 - STANDING\n\nDEFENSES FROM BEHIND\n...
1,2,CHAPTER 4 - THE MOUNT\n\n160\n162\n164\n166\n1...


In [8]:
df.to_csv("data/extracted_text.csv", index=False)

In [10]:
# clean up the DataFrame by removing any rows with empty content
def clean_text(text):
    """
    Cleans OCR-extracted text by removing extra whitespace, line breaks, and special characters.
    Args:
        text (str): Raw text extracted from OCR.

    Returns:
        str: Cleaned text.
    """
    # Remove extra whitespace and line breaks
    text = re.sub(r'\s+', ' ', text).strip()
    # Optionally remove special characters (adjust this as needed)
    text = re.sub(r'[^\w\s.,]', '', text)
    return text

In [19]:
# Apply the cleaning function to the content column
df['cleaned_content'] = df['content'].apply(clean_text)

In [23]:
df

Unnamed: 0,page,content,cleaned_content
0,1,CHAPTER 2 - STANDING\n\nDEFENSES FROM BEHIND\n...,CHAPTER 2 STANDING DEFENSES FROM BEHIND 98 R...
1,2,CHAPTER 4 - THE MOUNT\n\n160\n162\n164\n166\n1...,CHAPTER 4 THE MOUNT 160 162 164 166 168 Upwa...


In [21]:
df.to_csv("data/cleaned_extracted_text.csv", index=False)

In [30]:
# Function to parse the DataFrame content
def parse_defenses(df):
    data = []  # List to store the structured data
    current_defense = None  # Track the current ALL CAPS defense category

    # Split content into lines for processing
    lines = df['content'].str.split('\n').explode().dropna().reset_index(drop=True)
    
    buffer = ""  # Buffer to handle multiline techniques
    
    for line in lines:
        line = line.strip()  # Remove leading/trailing whitespace

        if not line:  # Skip empty lines
            continue

        # Check for ALL CAPS lines (Defense category)
        if line.isupper():
            current_defense = line
        else:
            # Match techniques with page numbers (e.g., "98 — Rear choke defense")
            match = re.match(r"(\d+)\s*[—~\-]\s*(.+)", line)
            if match:
                # If there's buffered text, append it to the previous technique
                if buffer:
                    data[-1]['Technique'] += f" {buffer}"
                    buffer = ""

                # Extract page number and technique
                page = int(match.group(1))
                technique = match.group(2).strip()

                # Add to data
                data.append({
                    "Defense": current_defense,
                    "Technique": technique,
                    "Page": page
                })
            else:
                # If the line doesn't match a page number, buffer it (multiline technique)
                buffer += f" {line.strip()}"

    # If there's leftover buffer, append it to the last technique
    if buffer and data:
        data[-1]['Technique'] += f" {buffer}"

    # Convert to a DataFrame
    return pd.DataFrame(data)

In [31]:
df = pd.read_csv("data/extracted_text.csv")

# Parse the content
parsed_df = parse_defenses(df)

# Display the structured DataFrame
print(parsed_df)

                  Defense                                          Technique  \
0    DEFENSES FROM BEHIND                                 Rear choke defense   
1    DEFENSES FROM BEHIND  Rear choke defense pulled back  (with side throw)   
2    DEFENSES FROM BEHIND  Rear choke defense pulled back  (with overhead...   
3    DEFENSES FROM BEHIND                        Rear two-hand choke defense   
4    DEFENSES FROM BEHIND               Rear bear hug over the arms  defense   
..                    ...                                                ...   
75  CHAPTER 4 - THE MOUNT                         Cross choke (thumb inside)   
76  CHAPTER 4 - THE MOUNT                                   Nutcracker choke   
77  CHAPTER 4 - THE MOUNT                                         Neck crank   
78  CHAPTER 4 - THE MOUNT    Keeping the mount (opponent  pushing the chest)   
79  CHAPTER 4 - THE MOUNT  Keeping the mount (opponent  — Elbow escape dr...   

    Page  
0     98  
1    100  
2    1

In [32]:
parsed_df.to_csv("data/toc.csv", index=False)

In [33]:
from PIL import Image
import pytesseract

# Load the image
image_path = "data/toc.png"  # Replace with your image file path
image = Image.open(image_path)

# Perform OCR
extracted_text = pytesseract.image_to_string(image)

# Print the extracted text
print(extracted_text)



58 — Headlock defense (pressure on the nose

60 — Headlock defense against a wall
(attacker outside)

62 — Headlock defense against a wall
(attacker inside)

64 — Guillotine defense (sitting back)

66 — Guillotine defense (sliding down)

68 — Guillotine defense (with trip takedown)

70 — Guillotine choke

72 — Guillotine choke (if taken down)

74 — Tackle defense (with elbow strike)

75 — Tackle defense (with knee strike)

76 — Front bear hug over the arms defense

78 — Front bear hug under arms defense
(with frame)

79 — Front bear hug under arms defense
(with chin push)

80 — Sucker punch defense

82 — Haymaker punch defense

84 — Front kick defense

86 — Front kick defense (with pivot)

87 — Front kick defense (high)

88 — Roundhouse kick defense

90 — Standing up in base

92 — Hand chop

93 — Punch

94 — Elbow strike

95 — Side kick (surprise attack)

95 — Side kick

)



In [52]:
lines = extracted_text.split('\n')

data_1 = []  # List to store the structured data
for line in lines:
    line = line.strip()
    data_1.append(line)
    

for index, line in enumerate(data_1):
    print(f"{index + 1}: {line}")
    

1: 58 — Headlock defense (pressure on the nose
2: 
3: 60 — Headlock defense against a wall
4: (attacker outside)
5: 
6: 62 — Headlock defense against a wall
7: (attacker inside)
8: 
9: 64 — Guillotine defense (sitting back)
10: 
11: 66 — Guillotine defense (sliding down)
12: 
13: 68 — Guillotine defense (with trip takedown)
14: 
15: 70 — Guillotine choke
16: 
17: 72 — Guillotine choke (if taken down)
18: 
19: 74 — Tackle defense (with elbow strike)
20: 
21: 75 — Tackle defense (with knee strike)
22: 
23: 76 — Front bear hug over the arms defense
24: 
25: 78 — Front bear hug under arms defense
26: (with frame)
27: 
28: 79 — Front bear hug under arms defense
29: (with chin push)
30: 
31: 80 — Sucker punch defense
32: 
33: 82 — Haymaker punch defense
34: 
35: 84 — Front kick defense
36: 
37: 86 — Front kick defense (with pivot)
38: 
39: 87 — Front kick defense (high)
40: 
41: 88 — Roundhouse kick defense
42: 
43: 90 — Standing up in base
44: 
45: 92 — Hand chop
46: 
47: 93 — Punch
48: 
49

(attacker outside)


In [34]:
lines = extracted_text.split('\n')

# Step 2: Parse lines to extract page and technique
data = []
buffer = ""
current_page = None

for line in lines:
    line = line.strip()
    
    if not line:
        continue
    
    # Match lines with page numbers and techniques
    match = re.match(r"(\d+)\s*[—~\-]\s*(.+)", line)
    if match:
        if buffer:
            data[-1]['Technique'] += f" {buffer.strip()}"
            buffer = ""
        
        page = int(match.group(1))
        technique = match.group(2).strip()
    else:
        # Buffer multiline techniques
        buffer += f" {line.strip()}"
        
        data.append({
            "Page": page,
            "Technique": technique
        })
        
if buffer and data:
    data[-1]['Technique'] += f" {buffer.strip()}"

In [38]:
print(data)
print("#########")
len(data)
len(lines)

[{'Page': 60, 'Technique': 'Headlock defense against a wall (attacker outside)'}, {'Page': 62, 'Technique': 'Headlock defense against a wall (attacker inside)'}, {'Page': 78, 'Technique': 'Front bear hug under arms defense (with frame)'}, {'Page': 79, 'Technique': 'Front bear hug under arms defense (with chin push)'}, {'Page': 95, 'Technique': 'Side kick )'}]
#########


56

In [36]:
# Step 3: Convert to DataFrame
df = pd.DataFrame(data)

# Display the structured DataFrame
print(df)


   Page                                          Technique
0    60  Headlock defense against a wall (attacker outs...
1    62  Headlock defense against a wall (attacker inside)
2    78     Front bear hug under arms defense (with frame)
3    79  Front bear hug under arms defense (with chin p...
4    95                                        Side kick )
