In [2]:
import cv2
import numpy as np
import pandas as pd
import pytesseract as tess
from matplotlib import pyplot as plt

In [3]:
import Levenshtein as lev
from skimage.filters.rank import entropy
from pytesseract import Output

In [4]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/indra25/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def getCoordinates(row):
    left, top, width, height = row['left'], row['top'], row['width'], row['height']
    coord = (int(left + (width / 2)), int(top + (height / 2)))
    
    return coord

In [6]:
def getNeighbors(word, img, df , window_size):
    y_coord,x_coord = df[df['text'] == word].iloc[0][['y_coord','x_coord']]
    c1 = df['x_coord'].isin(range(x_coord-window_size,x_coord+window_size))
    c2 = df['y_coord'].isin(range(y_coord-window_size,y_coord+window_size))
    
    return list(df[c1 & c2].text)

In [7]:
def plot_patch(word, img, df , window_size):
    y_coord,x_coord, neighbors = df[df['text'] == word].iloc[0][['y_coord','x_coord','neighbors']]    
    patch_img = img[x_coord-window_size:x_coord+window_size,y_coord-window_size:y_coord+window_size]
    plt.imshow(patch_img)

In [8]:
def GenerateBigramTokens(img,min_conf=0,min_word_len=3,window=200):
    data = tess.image_to_data(img, output_type=tess.Output.DICT)
    df = pd.DataFrame(data)
    
    df.dropna(subset=['text'],inplace=True)
    
    filt = (df['conf'] > min_conf) & (df['text'].str.len() > min_word_len)
    idx = df[filt]['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words])).index
    df = df.iloc[idx]
    df.reset_index(inplace=True, drop=True)
    
    df[['y_coord','x_coord']] = df.apply(lambda x: getCoordinates(x),axis=1,result_type='expand')
    df['neighbors'] = df.apply(lambda x: getNeighbors(x.loc['text'],imgPDF,df,window),axis=1)

    TokenPairs = [(row['text'], item) for _, row in df.iterrows() for item in row['neighbors']]
    TokenPairs = [(x, y) for x, y in TokenPairs if x != y]    
    
    return set(TokenPairs)

In [9]:
imgPDF = cv2.imread('./../../samples/imgPDF.png')
imgPDF.shape

(1770, 1270, 3)

In [10]:
OriginalTokenPairs = GenerateBigramTokens(imgPDF)

In [11]:
img1 = cv2.imread('./../../samples/img3.jpeg')
img1.shape

(1280, 960, 3)

In [12]:
CapturedTokenPairs1 = GenerateBigramTokens(img1)

In [13]:
c = set( OriginalTokenPairs).intersection(set(CapturedTokenPairs1))
len(c),c

(2201,
 {('Figure', 'during'),
  ('solubility', 'limit'),
  ('data', 'Baseline'),
  ('levofloxacin,', 'biofilm'),
  ('showed', 'experienced'),
  ('taken', 'against'),
  ('rifampin', 'increase'),
  ('determining', 'Efforts'),
  ('Phenotype', 'antimicrobial'),
  ('biofilms', 'during'),
  ('planktonic', 'between'),
  ('attributed', 'biofilms'),
  ('reduction', 'aBL.'),
  ('shows', 'each'),
  ('while', 'determined'),
  ('substantial', '(2.4-4.8'),
  ('mg/mL', 'reduction'),
  ('while', 'controls'),
  ('each', 'reduction'),
  ('trend', 'different'),
  ('mg/ml.', 'treatments.'),
  ('biofilm', 'between'),
  ('trend', 'during'),
  ('decreases', 'between'),
  ('nafcillin', 'effects'),
  ('between', 'showed'),
  ('against', 'biofilm'),
  ('contrast,', 'Effect'),
  ('limit', 'dose'),
  ('while', 'trend'),
  ('data', 'experienced'),
  ('Percutaneous', 'antibiotic'),
  ('dosage', 'attributed'),
  ('experienced', 'side'),
  ('against', 'levofloxacin,'),
  ('when', '(2.4-4.8'),
  ('pattern,', 'Both'),

In [None]:
def apply_rules(word):
    rules = {
        'ceoCDQ0': 'c',
        '7Zz': 'z',
        'YVyv': 'v',
        'IijJLl1': 'i',
        '9gq': 'g',
        'li': 'h',
        'f,t': 'f',
        '68': '6',
        'BEF': 'B',
        'ri': 'n',
        'tn': 'm',
        'rn': 'm',
        'nn': 'm',
        'cl': 'd',
        'el': 'd'
    }
    
    for rule, replacement in rules.items():
        word = word.replace(rule, replacement)
    return word