In [37]:
import cv2
import pytesseract
from docx import Document

In [39]:
def preprocess_image(image):
    """Binarize and clean up an image before handing it to the OCR engine.

    Pipeline: grayscale conversion -> Otsu threshold -> 3x3 morphological
    opening -> 3x3 erosion -> 5x5 Gaussian blur.

    Args:
        image: Image array such as the one returned by ``cv2.imread``.
            3-channel BGR, 4-channel BGRA and already-grayscale 2-D arrays
            are accepted. Objects that do not expose ``.shape`` are passed
            to OpenCV as a BGR image.

    Returns:
        The preprocessed single-channel image.

    Raises:
        ValueError: If ``image`` is ``None`` or has a zero-sized dimension.
    """
    if image is None:
        raise ValueError("preprocess_image() received None instead of an image")

    shape = getattr(image, "shape", None)
    if shape is not None and 0 in tuple(shape):
        raise ValueError("preprocess_image() received an empty image")

    # Pick the grayscale conversion from the channel layout. BGR2GRAY rejects
    # anything that is not 3-channel, so grayscale and BGRA inputs need their
    # own paths.
    if shape is not None and len(shape) == 2:
        gray = image  # already single-channel
    elif shape is not None and len(shape) == 3 and shape[2] == 4:
        gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # With THRESH_OTSU the threshold is computed from the histogram; the
    # literal 0 passed here is only a placeholder.
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # The same 3x3 rectangular kernel is used for the opening and the erosion.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
    eroded = cv2.erode(opening, kernel, iterations=1)

    # sigma=0 lets OpenCV derive the Gaussian sigma from the 5x5 kernel size.
    preprocessed = cv2.GaussianBlur(eroded, (5, 5), 0)
    return preprocessed

In [41]:
def ocr(image, config='--psm 1', lang=None):
    """Run Tesseract on an image and return the recognized text.

    Args:
        image: Image to recognize, passed straight to
            ``pytesseract.image_to_string``.
        config: Extra Tesseract command-line options. The default
            ``'--psm 1'`` selects page segmentation mode 1; pass a different
            ``--psm`` value here to tune layout analysis.
        lang: Tesseract language code(s). ``None`` leaves the choice to
            pytesseract's own default.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(image, lang=lang, config=config)


In [43]:
def save_to_doc(text, output_file):
    """Write ``text`` as a single paragraph into a new Word document.

    Characters that XML 1.0 does not allow are removed first. OCR output can
    contain such characters (for example a form feed used as a page
    separator), and the document's XML serializer is expected to reject
    them with ``ValueError``.

    Args:
        text: Text to store. ``None`` is treated as an empty string.
        output_file: Destination passed to ``Document.save``.
    """
    import re  # local import so this definition does not depend on other cells

    # Drop C0 controls except tab (\x09), LF (\x0a) and CR (\x0d), plus the
    # two non-characters U+FFFE / U+FFFF; none of these are legal in XML 1.0.
    cleaned = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\ufffe\uffff]", "", text or "")

    doc = Document()
    doc.add_paragraph(cleaned)
    doc.save(output_file)

In [45]:
def main(image_path="try.jpg", output_file="output.docx"):
    """Read an image, OCR it, and save the recognized text as a .docx file.

    Args:
        image_path: Path of the image to read. Defaults to ``"try.jpg"``.
        output_file: Path of the Word document to write. Defaults to
            ``"output.docx"``.

    Returns:
        None. Progress and failure are reported with ``print``.
    """
    frame = cv2.imread(image_path)
    # cv2.imread reports a missing or unreadable file by returning None
    # rather than raising, so check before going further.
    if frame is None:
        print(f"Failed to read image at path: {image_path}")
        return

    preprocessed_image = preprocess_image(frame)
    recognized_text = ocr(preprocessed_image)
    save_to_doc(recognized_text, output_file)
    print(f"Text saved to {output_file}")

# Run the pipeline only when this code is executed directly (e.g. as a
# notebook cell or a script), not when it is imported as a module.
if __name__ == "__main__":
    main()

Text saved to output.docx
