# Lecture Preprocessing

## Imports

In [None]:
from pdf2image import convert_from_path
from PIL import Image
import math
import matplotlib.pyplot as plt
import openai
import fitz  # PyMuPDF
import io
import pytesseract
import base64
import requests
from io import BytesIO

from langdetect import detect
import PyPDF2
from tqdm import tqdm
import configparser
from openai import OpenAI
import tiktoken

## Key and path variables



In [None]:
# OpenAI API Key
# Read the key from the [openai] section of a local INI file.
# NOTE: ConfigParser.read() skips files it cannot open, so a wrong path
# surfaces as a KeyError on the lookup below rather than as a file error.
config = configparser.ConfigParser()
config.read('../Edu_Llama/config.ini')
api_key = config['openai']['api_key']

In [None]:

# Path to the PDF file: `pdf` is the bare lecture name (it is reused for the
# output .txt file names in later cells), `pdf_path` the absolute source path.
pdf = 'ML_fundamentals'
pdf_path = f'C:/Users/a829727/OneDrive - Atos/Dokumente/Uni/Semester 5/NLP/Vorlesungen/{pdf}.pdf'



# Picture control

In [None]:
def extract_and_plot_images(pdf_path, start_page=428):
    """Display every embedded image from `start_page` onwards and collect them.

    Args:
        pdf_path: Path of the PDF to scan.
        start_page: 0-based index of the first page to scan. The default
            keeps the value that used to be hard-coded here.
            NOTE(review): pass 0 to scan the whole document; confirm whether
            428 is still the intended starting point.

    Returns:
        A tuple ``(images, pages, num_pages)``:
        images    -- PIL images in the order they were found,
        pages     -- ascending, de-duplicated 1-based numbers of the pages
                     that contain at least one image,
        num_pages -- total number of pages in the document.
    """
    images = []
    pages = []
    # Open the PDF document
    doc = fitz.open(pdf_path)
    try:
        # Total page count of the document
        num_pages = len(doc)
        # Walk through the requested pages
        for i in range(start_page, num_pages):
            page = doc.load_page(i)

            # Extract the images embedded on this page
            for img in page.get_images(full=True):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]

                # Decode the raw bytes with PIL
                image = Image.open(io.BytesIO(image_bytes))
                images.append(image)
                # Show the image together with its 1-based page number
                plt.imshow(image)
                print(f"Seite: {i+1}")
                pages.append(i + 1)
                plt.axis('off')
                plt.show()
    finally:
        # Release the document even when extraction or plotting fails
        doc.close()
    # A page with several images is listed once; sorting matters because
    # set() has no guaranteed order and callers walk the pages in sequence.
    return images, sorted(set(pages)), num_pages
    	
# Scan the lecture PDF; `pages` and `num_pages` are passed to pre_process_pipeline later on.
images,pages,num_pages = extract_and_plot_images(pdf_path)

## Functions

### convert to base64

In [None]:
# Helper that encodes a PIL image for transport in a JSON payload
def encode_pil_image(pil_image):
    """Return `pil_image` as a base64-encoded JPEG string.

    Args:
        pil_image: A PIL image (anything exposing ``mode``, ``convert`` and
            ``save`` like ``PIL.Image.Image``).

    Returns:
        The JPEG bytes of the image, base64-encoded and decoded as UTF-8 text.
    """
    # JPEG has no alpha channel or palette; saving e.g. an RGBA or P image
    # fails, so such images are converted to RGB first. Modes the JPEG writer
    # accepts are left untouched.
    if pil_image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        pil_image = pil_image.convert("RGB")
    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode('utf-8')

### Extracting the text

In [None]:
def extract_text_from_range(file_path, start_page, end_page, OCR=''):
    """Extract the text of the 0-based, half-open page range [start_page, end_page).

    Args:
        file_path: Path of the PDF file.
        start_page: 0-based index of the first page to read.
        end_page: 0-based index one past the last page to read.
        OCR: Pass ``'OCR'`` to render each page and run pytesseract on it;
            any other value reads the PDF text layer with PyPDF2.

    Returns:
        OCR mode: the page texts joined by single spaces.
        Text-layer mode: the page texts, each followed by a newline; pages
        without text contribute a ``'Kein Text auf Seite <n>'`` line. If the
        range is invalid the string ``'Ungültiger Seitenbereich.'`` is
        returned instead (the OCR branch performs no such check).
    """
    if OCR == 'OCR':
        extracted_texts = []
        # pdf2image counts pages from 1, so 0-based page p is requested as
        # p + 1. The upper bound is end_page + 1 so that both branches cover
        # exactly the same pages (the last page used to be dropped here).
        for image_num in tqdm(range(start_page + 1, end_page + 1), desc='Texextraktion_OCR'):
            images = convert_from_path(file_path, first_page=image_num, last_page=image_num)
            extracted_texts.append(pytesseract.image_to_string(images[0]))
        return " ".join(extracted_texts)

    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        # Number of pages in the PDF file
        num_pages = len(pdf_reader.pages)

        # Reject page numbers outside the valid range
        if start_page < 0 or end_page > num_pages or start_page > end_page:
            return "Ungültiger Seitenbereich."

        page_texts = []
        for page_number in tqdm(range(start_page, end_page), desc='Texextraktion'):
            page_text = pdf_reader.pages[page_number].extract_text()
            if page_text:
                page_texts.append(page_text)
            else:
                # Keep a visible marker for pages without a text layer
                page_texts.append(f'Kein Text auf Seite {page_number + 1}')

    # Every page entry is terminated by a newline
    return ''.join(entry + '\n' for entry in page_texts)

### Extracting Images

In [None]:

def extract_images_from_pdf(file_path, page_number):
    """Render one PDF page as an image and return it at most 600 px wide.

    Despite its name this does not pull embedded images out of the page; it
    rasterises the whole page via ``convert_from_path``.

    Args:
        file_path: Path of the PDF file.
        page_number: Page to render, passed as both ``first_page`` and
            ``last_page`` to ``convert_from_path``.

    Returns:
        The rendered page, resized to a width of ``min(width, 600)`` with the
        height scaled by the same ratio (truncated to an int).
    """
    print("Extrahiere Bilder...")
    rendered_page = convert_from_path(
        file_path, first_page=page_number, last_page=page_number
    )[0]

    width, height = rendered_page.size
    # Never upscale: narrow pages keep their own width
    target_width = min(width, 600)
    target_height = int((height / width) * target_width)

    return rendered_page.resize((target_width, target_height))

### Picture to text

In [None]:
# Describe a picture with the OpenAI vision chat endpoint
def image_to_text(pil_image, timeout=120):
    """Ask the OpenAI chat completions API to describe `pil_image`.

    Args:
        pil_image: PIL image; it is sent as a base64 JPEG data URL.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The text content of the first choice in the API response.

    Raises:
        requests.HTTPError: The API answered with an error status (e.g. bad
            key, rate limit). Previously this surfaced as a KeyError on
            'choices'.
        requests.Timeout: No answer within `timeout` seconds.
    """
    print("Image to text")
    # Getting the base64 string
    base64_image = encode_pil_image(pil_image)
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # NOTE(review): the model id is fixed here; confirm it is still served.
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Beschreibe den inhalt des Bildes?"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 300
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    # Fail with the real HTTP error instead of an opaque KeyError below
    response.raise_for_status()

    return response.json()['choices'][0]['message']['content']

### Final pipeline

In [None]:
def pre_process_pipeline(file_path, pages, num_pages, OCR=''):
    """Build one text for the whole PDF, replacing image pages by descriptions.

    Pages listed in `pages` are rendered and described via image_to_text
    (inserted as ``'Bild:<description>'``); all other pages go through
    extract_text_from_range.

    Args:
        file_path: Path of the PDF file.
        pages: 1-based numbers of the pages that contain images. They are
            de-duplicated and processed in ascending order.
        num_pages: Total number of pages in the PDF.
        OCR: Forwarded to extract_text_from_range for every text range
            (``'OCR'`` selects OCR, anything else the PDF text layer).

    Returns:
        The concatenated text of the document.
    """
    if len(pages) == 0:
        return extract_text_from_range(file_path, 0, num_pages, OCR=OCR)

    final_text = ''
    # 0-based index of the first page not handled yet. Because image pages
    # are 1-based, the number of an image page equals the 0-based index of
    # the page that follows it.
    page_old = 0

    # The range arithmetic below needs ascending, unique page numbers
    for page in tqdm(sorted(set(pages)), desc='Processing Pages'):
        # Text pages between the previous image page and this one:
        # 0-based half-open range [page_old, page - 1)
        if page_old < page - 1:
            final_text += extract_text_from_range(file_path, page_old, page - 1, OCR=OCR)
        image = extract_images_from_pdf(file_path, page)
        img_text = image_to_text(image)
        final_text += f'Bild:{img_text}'
        page_old = page

    # Remaining pages after the last image page. Starting at page_old (not
    # page_old + 1) keeps the page right after the last image, and OCR is
    # forwarded so the tail is extracted the same way as the rest.
    if page_old != num_pages:
        final_text += extract_text_from_range(file_path, page_old, num_pages, OCR=OCR)

    return final_text

## Final Preprocessing

In [None]:
# Build the lecture text; 'OCR' makes extract_text_from_range use pytesseract instead of the PDF text layer.
text = pre_process_pipeline(pdf_path,pages,num_pages,'OCR')

# Save data

In [None]:
# save the preprocessed lecture text
# NOTE(review): this writes '../../{pdf}.txt' while the following section
# reads '../Vorlesungen/{pdf}.txt' — confirm the two paths are meant to differ.
# NOTE(review): no encoding is given, so the platform default applies; confirm
# it can represent every character of the extracted text.
with open(f'../../{pdf}.txt', 'w') as file:
    file.write(text)

## Splitting the text into chunks

In [None]:
# Open the file in read mode
with open(f'../Vorlesungen/{pdf}.txt', 'r') as file:
    # Read the entire content of the file
    content = file.read()

In [None]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return the number of tokens `string` encodes to.

    `encoding_name` is handed to ``tiktoken.encoding_for_model``, so despite
    its name it is used as a model name (e.g. ``'gpt-3.5-turbo'``).
    """
    tokens = tiktoken.encoding_for_model(encoding_name).encode(string)
    return len(tokens)

In [None]:
# Token count of the lecture text and the number of chunks derived from it:
# one chunk per started 1000 tokens. `n` and `rounded_up` get the same value.
# NOTE: n is 0 when `content` has no tokens.
num_token_content=num_tokens_from_string(content, 'gpt-3.5-turbo')
n=rounded_up = math.ceil(num_token_content/1000)

In [None]:
def split_sentences_into_equal_parts(content, n):
    """Split `content` into chunks that hold the same number of sentences.

    Sentences are delimited by ``'. '``. Each chunk holds
    ``max(1, sentence_count // n)`` sentences (the last one may be shorter),
    so more than `n` chunks can be returned.

    The sentence-ending periods are kept: ``' '.join(result)`` reproduces
    `content` exactly. (Joining the sentences with a bare space used to drop
    every period inside a chunk.)

    Args:
        content: The text to split.
        n: Target number of chunks. Values below 1 yield a single chunk
            instead of a ZeroDivisionError.

    Returns:
        List of chunk strings, in document order.
    """
    # Split the text into sentences; str.split always yields at least ['']
    sentences = content.split('. ')
    total_sentences = len(sentences)

    # Sentences per chunk; n < 1 means "do not split"
    if n < 1:
        sentences_per_part = total_sentences
    else:
        sentences_per_part = max(1, total_sentences // n)

    parts = []
    for start in range(0, total_sentences, sentences_per_part):
        stop = start + sentences_per_part
        part = '. '.join(sentences[start:stop])
        # The last sentence of a non-final chunk was followed by '. ' in the
        # source text, so give it its period back.
        if stop < total_sentences:
            part += '.'
        parts.append(part)

    return parts


In [None]:
# Cut the lecture text into chunks that are translated one request at a time.
string_parts = split_sentences_into_equal_parts(content, n)

In [None]:
# Number of chunks actually produced (may differ from n).
len(string_parts)

## Translation to English

In [None]:
def chat_with_gpt(prompt):
    """Send `prompt` to gpt-3.5-turbo for translation and return the reply.

    The system message instructs the model to translate the text to English
    and to write formulas in LaTeX syntax. A new OpenAI client is created on
    every call, using the notebook-level `api_key`.

    Args:
        prompt: The text chunk to translate (sent as the user message).

    Returns:
        The message content of the first completion choice.
    """
    client = OpenAI(api_key=api_key)

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant.Translate all the texts you get in english language and write all formulas in latex syntax.",
    }
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Translate the lecture chunk by chunk. Every answer is appended to the output
# file right away (so finished chunks survive a failure part-way through) and
# is also collected in full_text_eng, which used to stay empty.
# NOTE: the file is opened in append mode, so re-running this cell adds the
# translation a second time unless the file is removed first.
full_text_eng=[]
for string in tqdm(string_parts, desc='Chatting'):
    response = chat_with_gpt(string)
    full_text_eng.append(response)
    with open(f'../Vorlesungen/{pdf}_eng.txt', "a") as file:
        file.write(response)
    print(response)

In [None]:
# Write the complete translation in one go. file.write() needs a str, so the
# collected parts are joined first (passing the list raised a TypeError after
# 'w' mode had already emptied the file). When nothing was collected the file
# is left alone, so the text appended by the translation loop is not wiped.
if full_text_eng:
    with open(f'../Vorlesungen/{pdf}_eng.txt', 'w') as file:
        file.write(''.join(full_text_eng))