In [1]:
!pip install transformers
!pip install PyMuPDF
!pip install langdetect
!pip install rake-nltk
!pip install sentence-transformers
!pip install langchain
!pip install PyPDF2

Collecting PyMuPDF
  Downloading PyMuPDF-1.23.15-cp310-none-manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.23.9 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.23.15 PyMuPDFb-1.23.9
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langd

In [4]:
import torch

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [6]:
from transformers import BartTokenizer, BartForConditionalGeneration

# load bart tokenizer and model from huggingface
tokenizer = BartTokenizer.from_pretrained('vblagoje/bart_lfqa')
generator = BartForConditionalGeneration.from_pretrained('vblagoje/bart_lfqa').to(device)


In [32]:
import logging as logger
import math
import os
import sqlite3
from pprint import pprint

import fitz
import matplotlib.pyplot as plt
import numpy as np
from langdetect import detect
from rake_nltk import Rake
from scipy.signal import argrelextrema
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import nltk
nltk.download(['stopwords', 'punkt'])
from transformers import AutoTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')


logger.basicConfig(level=logger.INFO)
# TODO: implement function to get matched documents based in user prompt
# TODO: add logging to file
# TODO: check for syntax and formatting
# TODO: add comments
# TODO: implement method to preprocess also .txt files (currently only pdf, compare line 35)
# TODO: fix the bug that the filename is currently not written to db (compare line 288)


class FileProcessor:
    def __init__(self, file):
        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        logger.info("Initializing FileProcessor")
        self.document = None
        # read in pdf document
        try:
            file_content = file.getvalue()
            self.document = fitz.open("pdf", file_content)
            logger.info("PDF document loaded")
        except Exception as e:
            logger.error(f"Could not open PDF file: {e}")
        self.document_content = {"text": str(), "images": dict()}

    @staticmethod
    def extract_text(
        document: fitz.fitz.Document, extract_text_from_image: bool = False
    ) -> str:
        """extract text from a pdf page
        Args:
            page (fitz.fitz.Page): page from pdf document
            extract_from_image (bool, optional): extract text from image using OCR (not possible yet). Defaults to False.
        Returns:
            str: text from page
        """

        if extract_text_from_image:
            logger.info("Extracting text from image")
            pass

        else:
            logger.info("Extracting text from pages")
            text = str()
            for page in document:
                try:
                    text += page.get_text() + " "
                except Exception as e:
                    print(f"Error processing file on page {page.number}: {e}")
                    continue
            return text
    @staticmethod
    def calc_rev_sigmoid(x: float) -> float:
        """calculate reverse sigmoid function

        Args:
            x (float): input value

        Returns:
            float: output value
        """
        return 1 / (1 + math.exp(0.5 * x))

    @staticmethod
    def activate_similarities(similarities: np.array, p_size=10) -> np.array:
        """calculate activated similarities using reverse sigmoid function

        Args:
            similarities (np.array): similarities between sentences
            p_size (int, optional): size of sigmoid function. Defaults to 10.

        Returns:
            np.array: activated similarities
        """
        x = np.linspace(-10, 10, p_size)
        y = np.vectorize(FileProcessor.calc_rev_sigmoid)
        activation_weights = np.pad(y(x), (0, similarities.shape[0] - p_size))
        diagonals = [
            similarities.diagonal(each) for each in range(0, similarities.shape[0])
        ]
        diagonals = [
            np.pad(each, (0, similarities.shape[0] - len(each))) for each in diagonals
        ]
        diagonals = np.stack(diagonals)
        diagonals = diagonals * activation_weights.reshape(-1, 1)
        activated_similarities = np.sum(diagonals, axis=0)
        return activated_similarities


    def split_text_into_chunks(
      self,
      text: str,
      filename: str,
      visualize_splitting: bool = False) -> list:
      doc_lang = detect(text)
      stop_words = set(stopwords.words('german')) if doc_lang == "de" else set(stopwords.words('english'))

      rake = Rake(stopwords=stop_words)

      logger.info("Loading model")
      model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
      sentences = text.split(". ")

      logger.info("Encoding sentences")
      embeddings = model.encode(sentences)
      similarities = cosine_similarity(embeddings, embeddings)
      logger.info("Calculating activated similarities")
      activated_similarities = self.activate_similarities(similarities, p_size=5)
      minima = argrelextrema(activated_similarities, np.less, order=2)
      split_points = [each for each in minima[0]]

      if visualize_splitting:
          self.plot_chunk_points(activated_similarities, split_points)

      logger.info("Creating chunks list")
      chunks = []
      text_chunk = str()
      for split_point, sentence in enumerate(sentences):
          text_chunk += sentence + ". "
          if split_point in split_points:
              rake.extract_keywords_from_text(text_chunk)
              extracted_keywords = rake.get_ranked_phrases()[:5]
              chunks.append(text_chunk)
              text_chunk = str()
      if text_chunk != str():
          rake.extract_keywords_from_text(text_chunk)
          chunks.append(text_chunk)

      final_chunks = []
      for chunk in chunks:
            if len(self.tokenizer.tokenize(chunk)) > 512:
                final_chunks.extend(self.divide_and_conquer(chunk))
            else:
                final_chunks.append(chunk)

      return final_chunks


    def divide_and_conquer(self, chunk: str) -> list:
      sentences = nltk.sent_tokenize(chunk)
      new_chunk = ""
      sub_chunks = []

      for sentence in sentences:
          # Wenn der aktuelle Satz zu lang ist, aufteilen
          if len(self.tokenizer.tokenize(sentence)) > 512:
              # Teilen des langen Satz in kleinere Teile
              sub_chunks.extend(self.divide_long_sentence(sentence))
          elif len(self.tokenizer.tokenize(new_chunk + sentence)) > 512:
              sub_chunks.append(new_chunk.strip())
              new_chunk = sentence
          else:
              new_chunk += " " + sentence

      if new_chunk:
          sub_chunks.append(new_chunk.strip())

      return sub_chunks

    def divide_long_sentence(self, sentence: str) -> list:
      words = sentence.split()
      sub_sentences = []
      current_chunk = []

      for word in words:
          current_chunk.append(word)
          # Prüfen, ob die Länge des aktuellen Chunks die Grenze erreicht hat
          if len(self.tokenizer.tokenize(' '.join(current_chunk) + ' ' + word)) > 512:
              sub_sentences.append(' '.join(current_chunk))
              current_chunk = []

      # Fügen letzten Chunk hinzu, falls vorhanden
      if current_chunk:
          sub_sentences.append(' '.join(current_chunk))

      return sub_sentences

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
# PDF-Dokument öffnen
with fitz.open('Databases - 10-IndexingHashing.pdf') as doc:
  processor = FileProcessor(doc)
  text = processor.extract_text(doc)

ERROR:root:Could not open PDF file: 'Document' object has no attribute 'getvalue'


In [34]:
filename = 'Databases - 10-IndexingHashing.pdf'
chunks= processor.split_text_into_chunks(text, filename)

Token indices sequence length is longer than the specified maximum sequence length for this model (3865 > 512). Running this sequence through the model will result in indexing errors


In [35]:
def format_query(query, context):
    # contcatinate the query and context passages
    query = f"question: {query} context: {context}"
    return query

In [36]:
query=format_query("What kind of secondary indexes exist?", chunks[5])
pprint(query)

('question: What kind of secondary indexes exist? context: A secondary index '
 'which is created on a key (unique) ﬁeld\n'
 '(secondary key) has one index entry for each record\n'
 '–\xa0represents dense index and has pointer to the block of the record or\n'
 'to the record itself\n'
 '–\xa0this corresponds to any UNIQUE key attribute\n'
 '2. A secondary index which is created on a non-key ﬁeld can have for\n'
 'each index entry (indexing ﬁeld) more corresponding records in the\n'
 'data ﬁle\n'
 '–\xa0pointer of the index entry has address of the location of the block\n'
 'with record pointers to the actual records in the data ﬁle\n'
 '–\xa0represents sparse secondary index\n'
 '17\n'
 ' Secondary index on a key ﬁeld\n'
 'Secondary index on a key ﬁeld\n'
 'Dense secondary index on a secondary key attribute\n'
 '18\n'
 ' Secondary index on a non-key ﬁeld\n'
 'Secondary index on a non-key ﬁeld\n'
 'Sparse secondary index on a non-key attribute\n'
 '19\n'
 ' Searching for a record in ﬁle

In [37]:
def generate_answer(query):
    # tokenize the query to get input_ids
    inputs = tokenizer([query], max_length=1024, return_tensors="pt").to(device)
    # use generator to predict output ids
    ids = generator.generate(inputs["input_ids"], num_beams=2, min_length=20, max_length=40)
    # use tokenizer to decode the output ids
    answer = tokenizer.batch_decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return pprint(answer)

In [38]:
generate_answer(query)

('There are two kinds of secondary indexes. The first is a "dense" secondary '
 'index, where each record has its own index entry. The second is a "sparse" '
 'secondary index')


In [None]:
# Auch wenn BART bis zu 1024 Tokens verarbeiten kann, funktioniert es zumindest bei den meisten zufaöölig ausgewählten Fragen besser auf 512 Tokens