In [None]:
!pip install PyMuPDF Pillow

In [None]:
!sudo apt install tesseract-ocr-rus

In [None]:
!pip install pytesseract

In [14]:
!python -m spacy download ru_core_news_sm

In [None]:
!pip install pandas

In [16]:
import fitz # PyMuPDF
import io
from PIL import Image
import re
import pytesseract
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import sys

In [17]:
class DocInfo:

  """Extracts the text layer, the embedded images and OCR text from one PDF.

  The Get* methods store their results on the instance and append to the
  existing lists, so use a fresh instance for every document.
  """

  # Tesseract options shared by every OCR call: English + Russian language
  # data, page segmentation mode 6 (image treated as one uniform text block).
  OCR_CONFIG = r'-l eng+rus --psm 6'

  # GetImages names its files "image<page>_<index>.<ext>"; the trailing
  # underscore keeps pages 20-29 from being mistaken for page 2.
  DESCRIPTION_IMAGE_PREFIX = 'image2_'

  NO_DESCRIPTION_TEXT = "Во вложении нет описания проблемы"

  def __init__(self, filename):
    """Open the PDF at `filename` with PyMuPDF and reset all result holders."""
    # NOTE: despite its name, this attribute holds the opened PyMuPDF
    # document, not the path string.
    self.filename = fitz.open(filename)
    self.text = []           # text layer, one string per page
    self.images = []         # file names written by GetImages
    self.other_pages = []    # page texts selected by CheckOtherPages
    self.date = []           # OCR tokens produced by GetDate
    self.disc_from_img = []  # OCR text produced by GetInfoFromPages


  def GetText(self):

    """Append the text layer of every page to self.text and return that list."""

    self.text.extend(
      self.filename.load_page(page_index).get_text("text")
      for page_index in range(len(self.filename))
    )

    return self.text


  def CheckOtherPages(self, text):

    """Return the page texts that follow the first page, always as a list.

    For a document with a single page (or none) every collected page text is
    returned instead. The result used to be a joined string in that case,
    which made AllInfo fail when it concatenated it with a list.

    `text` is accepted for backward compatibility but is not read; the pages
    come from self.text.
    """

    if len(self.text) > 1:
      self.other_pages = self.text[1:]
    else:
      self.other_pages = list(self.text)

    return self.other_pages


  def GetImages(self):

    """Save every embedded image into the working directory.

    Files are named "image<page>_<index>.<ext>" with 1-based page and index.
    The names are appended to self.images, and that list is returned.
    """

    for page_index in range(len(self.filename)):
      page = self.filename[page_index]

      for image_index, img in enumerate(page.get_images(), start=1):

        xref = img[0]  # first item of the image entry is its xref number
        base_image = self.filename.extract_image(xref)
        image_ext = base_image["ext"]
        image_name = f"image{page_index+1}_{image_index}.{image_ext}"

        # Saving by path lets Pillow open and close the file itself; the
        # previous open(...) handle was never closed.
        with Image.open(io.BytesIO(base_image["image"])) as image:
          image.save(image_name)

        self.images.append(image_name)

    return self.images


  def GetDate(self, images):

    """OCR the first extracted image and return its whitespace-separated tokens.

    The caller treats the first two tokens as the complaint date and number.
    Returns an empty list when no image has been extracted. `images` is
    accepted for backward compatibility but is not read.
    """

    if not self.images:
      self.date = []
      return self.date

    # split() without arguments also drops newlines, form feeds and the empty
    # tokens that repeated spaces would otherwise produce.
    self.date = self._ocr(self.images[0]).split()

    return self.date


  def GetInfoFromPages(self, images):

    """OCR every image that came from page 2 and collect the recognised text.

    Each recognised text is appended to self.disc_from_img as a single line.
    When page 2 has no images a placeholder message is appended instead.
    `images` is accepted for backward compatibility but is not read.
    """

    page_images = [
      img for img in self.images
      if str(img).startswith(self.DESCRIPTION_IMAGE_PREFIX)
    ]

    if page_images:
      for image_name in page_images:
        # Line breaks become spaces so that words on neighbouring lines are
        # not glued together.
        self.disc_from_img.append(re.sub(r"\n", " ", self._ocr(image_name)))
    else:
      self.disc_from_img.append(self.NO_DESCRIPTION_TEXT)

    return self.disc_from_img


  def AllInfo(self, text_disc, img_disc):

    """Join both text collections into one space-separated string.

    Each argument may be a list of strings or a single string.
    """

    parts = []
    for chunk in (text_disc, img_disc):
      if isinstance(chunk, str):
        parts.append(chunk)
      else:
        parts.extend(chunk)

    return ' '.join(parts)


  def _ocr(self, image_name):

    """Run Tesseract on an image saved in the working directory."""

    image_path_in_colab = './' + str(image_name)
    with Image.open(image_path_in_colab) as image:
      return pytesseract.image_to_string(image, config=self.OCR_CONFIG)

In [18]:
class AnalyzeText:

  """Picks the most frequent nouns and verbs out of a Russian text."""

  # Cyrillic words of five or more letters. Ё/ё are listed explicitly because
  # they lie outside the contiguous А-я code point range; without them a word
  # such as "самолёт" was cut at the "ё".
  WORD_PATTERN = '[А-Яа-яЁё]{5,}'

  def __init__(self, disciption):
    """Keep the text to analyse and load the Russian spaCy pipeline."""
    self.disciption = disciption
    self.spacy = spacy.load("ru_core_news_sm")

  def CommonWords(self, top_n=5):
    """Return up to `top_n` nouns/verbs with the highest total TF-IDF weight.

    Every noun or verb token is passed to the vectorizer as its own document,
    and the weights of each vocabulary term are summed over all of them, so a
    term's score grows with the number of times it occurs. The previous
    ranking read only the first document's row, in ascending order, and so did
    not reflect the rest of the text.

    Terms come from the vectorizer's vocabulary. Terms with equal scores keep
    the vocabulary's iteration order. Returns an empty list when the text
    contains no noun or verb to rank.
    """
    clear_text = re.findall(self.WORD_PATTERN, self.disciption)
    doc = self.spacy(" ".join(clear_text))

    filtered_tokens = [token.text for token in doc if token.pos_ in ('NOUN', 'VERB')]
    if not filtered_tokens:
      # fit_transform raises ValueError on an empty corpus.
      return []

    vectorizer = TfidfVectorizer()
    # Column-oriented copy: the per-term column sums below are cheap on CSC.
    tfidf = vectorizer.fit_transform(filtered_tokens).tocsc()

    totals = {
      term: tfidf[:, column].sum()
      for term, column in vectorizer.vocabulary_.items()
    }

    return sorted(totals, key=totals.get, reverse=True)[:top_n]

In [19]:
def create_dict(file):

  """Parse one complaint PDF into a flat dict (one row of the report).

  Args:
    file: path of the PDF; it is formatted into a string before opening.

  Returns:
    dict with the keys 'Дата', 'Номер_жалобы', 'Организация', 'Кому',
    'email', 'Исполнитель', 'email_исполнителя', 'маршруты' and
    'Ключевые слова'. A field whose pattern finds nothing is an empty string.
    When a pattern matches several times the matches are joined with ', '
    ('маршруты' keeps its single-space separator) instead of being glued
    together without a separator.
  """

  info = DocInfo(f"{file}")
  try:
    text = info.GetText()
    other_pages_text = info.CheckOtherPages(text)
    if isinstance(other_pages_text, str):
      # A single string cannot be concatenated with the list of OCR texts.
      other_pages_text = [other_pages_text]
    all_imgs = info.GetImages()
    # Without extracted images there is nothing to OCR for the date/number.
    date = info.GetDate(all_imgs) if info.images else []
    disc_img_text = info.GetInfoFromPages(all_imgs)

    all_text = info.AllInfo(other_pages_text, disc_img_text)
  finally:
    # info.filename is the opened PyMuPDF document; release it once all
    # pages and images have been read.
    info.filename.close()

  most_common_words = AnalyzeText(all_text).CommonWords()

  # The patterns below run on the page texts joined without a separator or
  # with a single space, as noted per field.
  glued_text = ''.join(text)
  spaced_text = ' '.join(text)

  # OCR may return fewer than two tokens; pad so both fields always exist.
  date_tokens = list(date or []) + ['', '']

  # Reused by both e-mail fields.
  email = r"[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+"

  info2dict = {}

  info2dict['Дата'] = date_tokens[0]
  info2dict['Номер_жалобы'] = date_tokens[1]
  # Patterns are raw strings so that \s, \w and \. reach the regex engine
  # without going through invalid string escapes.
  info2dict['Организация'] = ', '.join(re.findall(r"(?<=ООО\s«)[А-Яа-яЁё]+.[А-Яа-яЁё]+", glued_text))
  info2dict['Кому'] = ', '.join(re.findall(r"(?<=\n)\w.{1}\w.{1}[\-\s][А-ЯЁ][а-яё]+", glued_text))
  # An address that starts right at the beginning of a line.
  info2dict['email'] = ', '.join(re.findall(r"(?<=\n)(" + email + r")", spaced_text))
  info2dict['Исполнитель'] = ', '.join(re.findall(r"(?<=Исп.)\W.+", spaced_text))
  # An address that is immediately followed by a dot.
  info2dict['email_исполнителя'] = ', '.join(re.findall(r"(" + email + r")(?=\.)", spaced_text))
  info2dict['маршруты'] = ' '.join(re.findall(r"№\s[0-9]+", spaced_text))
  info2dict['Ключевые слова'] = ', '.join(most_common_words)

  return info2dict

In [36]:
class CollectInfo:

  """Interactive collector: asks for complaint PDFs and saves the parsed rows to Excel."""

  PATH_PROMPT = 'Укажите путь к жалобе '
  CONTINUE_PROMPT = 'Введите Y/N если хотите продолжить или остановить запись '

  def __init__(self):
    """Ask for the first path and start with an empty list of parsed rows."""
    self.user_input = input(self.PATH_PROMPT)
    self.all = []


  def CheckInp(self):
    """Parse PDFs until the user answers "n", then write complains.xlsx.

    A path that does not end in ".pdf" (case-insensitive, surrounding
    whitespace ignored) is rejected and asked for again; previously such
    input made the loop spin forever. The Y/N question is repeated until it
    gets a valid answer, so one file is never parsed twice.
    """
    while True:
      path = self.user_input.strip()
      if not path.lower().endswith('.pdf'):
        self.user_input = input(self.PATH_PROMPT)
        continue

      self.all.append(create_dict(path))

      check = input(self.CONTINUE_PROMPT).strip().lower()
      while check not in ('y', 'n'):
        check = input(self.CONTINUE_PROMPT).strip().lower()

      if check == 'y':
        self.user_input = input(self.PATH_PROMPT)
        continue

      # check == 'n': one spreadsheet row per parsed complaint.
      df = pd.DataFrame(self.all)
      df.to_excel('complains.xlsx', sheet_name='info', index=True)
      print('Запись завершена ')
      break



In [39]:
# Start the interactive session: the constructor prompts for the first PDF
# path, and CheckInp writes complains.xlsx once the user answers "n".
CollectInfo().CheckInp()

Укажите путь к жалобе /content/6464.pdf
Введите Y/N если хотите продолжить или остановить запись n
Запись завершена 
