1: Convert pdf to an image. Store the page as an image  

2: img processing techniques like grayscale and diagram removal  

3: bounding boxes per word  

4: combine boxes into lines of text  


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import cv2
import pandas as pd
from PIL import Image
import PyPDF2
import pytesseract
from pdf2image import convert_from_path


Must install poppler: https://github.com/oschwartz10612/poppler-windows/releases  
Follow directions to add poppler:  
- Download poppler as .zip, then extract all
- Press Win + R, type sysdm.cpl, hit Enter.
- Go to Advanced → Environment Variables.
- Under User variables, find Path → Edit → New.
- Paste: C:\pasteYourPathHere\poppler-23.10.0\bin

Restart the terminal, then verify the installation: pdftoppm -h

In [3]:
def load(pdf_path):
    """Render a PDF into page images and print the selectable page indices.

    Args:
        pdf_path: Path of the PDF, handed directly to ``convert_from_path``.

    Returns:
        list: Whatever ``convert_from_path`` yields, materialised as a list
        (one entry per rendered page).
    """
    pages = list(convert_from_path(pdf_path))
    print('Page Options:')
    # One line of 0-based indices, each followed by a space, so the caller
    # can see which page numbers are valid for retreive_page.
    print("".join(f"{index} " for index in range(len(pages))))
    return pages

#convert page to numpy array in order to apply filters
def retreive_page(pages, page_number):
    """Return one page as a BGR NumPy array so OpenCV filters can be applied.

    Args:
        pages: Indexable collection of page images (e.g. the list from ``load``).
        page_number: 0-based index of the page to extract.

    Returns:
        numpy.ndarray: The selected page with its channels converted RGB -> BGR.
    """
    rgb_pixels = np.array(pages[page_number])
    # OpenCV routines work in BGR channel order, so swap from RGB here once.
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
    
#For a given pdf, replace the path. Then, select the page you want to extract
pdf_path = r"C:\Users\dkhun\UC Davis\AISC Github repository\BeginnerProjectFallQuarter2025\data\raw_pdfs\textbook_pdf_3_includes_diagrams.pdf"
pages = load(pdf_path)
opencv_pageN = retreive_page(pages, 1)
#opencv_pageN is in BGR order, but PIL interprets arrays as RGB. Convert back
#before previewing, otherwise red and blue are swapped in the shown image.
pageN = Image.fromarray(cv2.cvtColor(opencv_pageN, cv2.COLOR_BGR2RGB))
pageN.show()


Page Options:
0 1 2 3 4 5 6 7 8 9 10 11 


# Step 2: Image processing   
### A typical pipeline looks like this:

1) Grayscale conversion → Reduces to single channel.

2) Noise reduction (Gaussian or bilateral filter).

3) Morphological operations / small artifact removal → Fill gaps, remove small holes.

4) Sharpening / high-frequency mask → Enhances the text edges.

5) Deskew / rescale / thresholding → Final normalization for OCR.

In [5]:
def apply_grayscale(image):
    """Collapse a BGR image to a single grayscale channel.

    Args:
        image: BGR image array as used by OpenCV.

    Returns:
        The single-channel grayscale version of ``image``.
    """
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

def apply_bilateral(image, d=9, sigma_color=75, sigma_space=75):
    """Smooth an image with a bilateral filter, which preserves edges.

    Args:
        image: Image array accepted by ``cv2.bilateralFilter``.
        d: Diameter of the pixel neighbourhood used for filtering.
        sigma_color: Filter sigma in colour space. Lower it (e.g. to 50)
            if the result looks over-blurred.
        sigma_space: Filter sigma in coordinate space. Lower it together
            with ``sigma_color`` if the result looks over-blurred.

    Returns:
        The smoothed image.
    """
    return cv2.bilateralFilter(
        image, d=d, sigmaColor=sigma_color, sigmaSpace=sigma_space
    )

def apply_gaussian_blur(image):
    """Smooth an image uniformly with a 3x3 Gaussian kernel.

    Args:
        image: Image array accepted by ``cv2.GaussianBlur``.

    Returns:
        The blurred image.
    """
    kernel_size = (3, 3)
    # A sigma of 0 asks OpenCV to derive the standard deviation from the kernel size.
    return cv2.GaussianBlur(image, kernel_size, 0)

def apply_morphological_closing(image, ksize):
    """Apply morphological closing (dilation followed by erosion).

    Args:
        image: Image array to process.
        ksize: Side length of the square all-ones kernel; larger values
            give a stronger effect.

    Returns:
        The closed image.
    """
    square_kernel = np.ones(shape=(ksize, ksize), dtype=np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_CLOSE, square_kernel)

def apply_dilation(image, ksize):
    """Dilate an image once with a square all-ones kernel.

    Args:
        image: Image array to process.
        ksize: Side length of the square kernel; larger values give a
            stronger effect.

    Returns:
        The dilated image.
    """
    square_kernel = np.ones(shape=(ksize, ksize), dtype=np.uint8)
    # A single pass; strength is controlled through ksize rather than iterations.
    return cv2.dilate(image, square_kernel, iterations=1)
    
def apply_sharpen(image, sharpen_strength=15):
    """Sharpen edges with an unsharp mask built on the bilateral filter.

    The bilateral-filtered image is treated as the low-frequency part.
    The high-frequency part (edges) is ``image - low_freq``, and the result
    is ``image + sharpen_strength * (image - low_freq)``.

    Args:
        image: Image array to sharpen.
        sharpen_strength: Weight of the high-frequency detail added back
            to the original. Defaults to the previously hard-coded 15.

    Returns:
        The sharpened image, saturated to the input's value range.
    """
    low_freq = apply_bilateral(image)
    # Evaluate image + s * (image - low_freq) as ONE weighted sum. A separate
    # cv2.subtract on uint8 data saturates negative differences to 0, which
    # throws away the dark side of every edge and only ever brightens pixels.
    return cv2.addWeighted(
        image, 1.0 + sharpen_strength, low_freq, -sharpen_strength, 0
    )

def apply_binary(image):
    """Binarise a grayscale image with mean adaptive thresholding.

    Should be the last filter applied in a pipeline.

    Tuning notes:
        - ``block_size`` is the neighbourhood used for the local average;
          increasing it increases smoothing.
        - ``offset`` is the threshold constant; decreasing it makes more
          of the image black.

    Args:
        image: Grayscale image array.

    Returns:
        The binary image (pixel values 0 or 255).
    """
    max_value = 255
    block_size = 15
    offset = 5
    return cv2.adaptiveThreshold(
        image,
        max_value,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        block_size,
        offset,
    )

def save_image(opencv_array, label):
    """Save an OpenCV image as ``<label>.png``, open a preview, and report it.

    Args:
        opencv_array: Image array in OpenCV layout: single-channel, or
            3/4-channel in BGR(A) order.
        label: File name without extension; the image is written to
            ``f'{label}.png'``.
    """
    file_name = f'{label}.png'
    pixels = np.asarray(opencv_array)
    # PIL reads colour arrays as RGB(A), while OpenCV stores BGR(A). Reorder
    # colour input so the saved file is not red/blue swapped. Single-channel
    # (grayscale / binary) images pass through unchanged.
    if pixels.ndim == 3 and pixels.shape[2] == 3:
        pixels = cv2.cvtColor(pixels, cv2.COLOR_BGR2RGB)
    elif pixels.ndim == 3 and pixels.shape[2] == 4:
        pixels = cv2.cvtColor(pixels, cv2.COLOR_BGRA2RGBA)
    save_img = Image.fromarray(pixels)
    save_img.save(file_name)
    save_img.show()
    print(f'Saved: {file_name}')

def filter(image, label):
    """Run the grayscale -> binary -> gaussian pipeline and save the result.

    NOTE: this name shadows Python's built-in ``filter``; it is kept because
    existing cells call it by this name.

    Args:
        image: BGR page image to process.
        label: Label handed to ``save_image`` for the output file.

    Returns:
        The filtered image that was saved.
    """
    gray = apply_grayscale(image)
    binary = apply_binary(gray)
    filtered_image = apply_gaussian_blur(binary)
    save_image(filtered_image, label)
    return filtered_image

#Filters of choice:
#best combination so far has been: gray --> binary --> gaussian
pdf3_gray = apply_grayscale(opencv_pageN)
gray_binary_gaussian = apply_gaussian_blur(apply_binary(pdf3_gray))

#Same pipeline again through filter(), which also saves and previews the result
test = filter(opencv_pageN, 'save test')

#pdf3_gaussian = apply_gaussian_blur(opencv_pageN)
#NOTE(review): despite its name, pdf3_gray_binary re-thresholds the already
#blurred binary image (gray_binary_gaussian), not pdf3_gray - confirm intended
pdf3_gray_binary = apply_binary(gray_binary_gaussian)

save_image(pdf3_gray, 'pdf3 grayscale')
save_image(pdf3_gray_binary, 'pdf3 binary')




Saved: save test.png
Saved: pdf3 grayscale.png
Saved: pdf3 binary.png


### Image is now filtered for clarity, but we want to remove any remaining diagrams now.  
Going to leave this alone for now  
Input to mser.detectRegions is a grayscale OpenCV image

In [None]:
#Requires the image-processing cell above to have been run first; otherwise
#gray_binary_gaussian is undefined and this line raises NameError.
#NOTE(review): the earlier comment called this the txtbk_pdf_6 page, but in this
#notebook gray_binary_gaussian is built from page index 1 of textbook_pdf_3 - confirm
img = gray_binary_gaussian

#the following is txtbk_pdf_3 which includes diagrams
img2_path = r"C:\Users\dkhun\UC Davis\AISC Github repository\BeginnerProjectFallQuarter2025\data\raw_pdfs\textbook_pdf_3_includes_diagrams.pdf"
pdf_image2 = convert_from_path(img2_path)
pages = list(pdf_image2)
print(len(pages))
#index 2 is the third page; retreive_page does the RGB -> BGR numpy conversion
opencv_page3 = retreive_page(pages, 2)
cv2.imshow('Page 3',opencv_page3)
cv2.waitKey(0)
cv2.destroyAllWindows()
img2 = apply_binary(apply_grayscale(opencv_page3))     #applying binary filter is creating a ton of noise atm
#img2 = apply_gaussian_blur(apply_binary(apply_grayscale(opencv_page3)))
save_image(img2,'pdf_3_with_diagrams')
print(type(img2))



NameError: name 'gray_binary_gaussian' is not defined

In [None]:

def MSER(image):
    """Detect MSER regions, display them as green boxes, and return the boxes.

    The boxes are drawn on a copy, so the caller's ``image`` is left
    untouched and can be reused for later processing.

    Args:
        image: Image array passed to ``mser.detectRegions`` (a grayscale
            OpenCV image in this notebook).

    Returns:
        The bounding boxes from ``detectRegions``; each unpacks as
        ``x, y, w, h``.
    """
    mser = cv2.MSER_create(delta=4, min_area = 50)
    _, bboxes = mser.detectRegions(image)

    # Draw on a 3-channel copy: on a single-channel image the (0,255,0) colour
    # collapses to its first component (black), and drawing in place would
    # permanently stamp the boxes into the caller's array.
    if image.ndim == 2:
        canvas = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    else:
        canvas = image.copy()

    for box in bboxes:
        # Plain ints keep cv2.rectangle happy regardless of the numpy int type.
        x, y, w, h = (int(value) for value in box)
        cv2.rectangle(canvas, (x, y), (x + w, y + h), (0,255,0),1)

    cv2.imshow('MSER Regions', canvas)
    #cv2.resizeWindow('MSER Regions', 1500, 900)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return bboxes

#Run region detection on the grayscale page (pdf3_gray comes from the image-processing cell)
MSER(pdf3_gray)




# Monday 10/20 Meeting  
Want bounding boxes merged into lines. Each line is a specific png image  
- Use pandas dataframe to represent data (makes it more efficient)  
- Use pd.read_csv() ??  
- csv (excel) file has data for things like left_bounding, width, height, confidence, and text PER WORD BOX  
- Important: print(df.columns) gives the names of the columns  
- Important: print(df.head(n)) shows the data for the first n rows  
To access all the values in certain columns (ex: left bounding), use df_left = df['left_bounding'][][]  
- Or use df_column_names = df[['col1','col2','coln']]  
For loops:  
- "for idx,row in df.iterrows()" is used to iterate through the rows  
- Initialize right pixel as negative infinity  
- if idx == 0: right_pixel = row['left'] + row['width']
     - right pixel = left bounding + width  
     - create threshold for if two words are close enough (sequential) and test vertical distance (same line)  
- Make else contained within the for loop: save segment of pdf (segment=boxed line of text), specify them by combined values

# To Do:   
1) Filtering: get text clear and remove images/diagrams  
- sharp edges for text  
2) Find and implement a library for bounding boxes  
3) write pandas logic to create single-line boxes (merging of individual boxes)