In [11]:
from dotenv import load_dotenv
import os
import openai
import shutil
import PyPDF2
import openai
from pydantic import BaseModel, ValidationError, constr
from colorama import Fore, Style, init

# Load variables from a .env file into the process environment. With
# override=True, values from .env replace variables that are already set.
load_dotenv(override=True)
# NOTE(review): this value is not referenced anywhere else in the visible
# code, and the name looks like a typo for OPENAI_API_KEY -- confirm which
# variable the .env file actually defines.
OPENAPI_API_KEY = os.getenv("OPENAPI_API_KEY")
# The client is created without an explicit api_key; presumably the openai
# library reads the key from the environment -- verify.
client = openai.Client()

class TitleModel(BaseModel):
    """Pydantic model used to validate a generated document title."""

    # The title must be a string of 1 to 100 characters.
    # NOTE(review): the prompt in get_pdf_title asks for at most 50
    # characters, so this limit is looser than the request -- confirm intent.
    title: constr(min_length=1, max_length=100)

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages joined without a separator. A page
        for which extraction yields nothing contributes an empty string.

    Raises:
        OSError: If the file cannot be opened.
        Exception: Whatever PyPDF2 raises for a file it cannot parse.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Build the result while the file is still open, as the original
        # loop did. `or ""` guards against a page whose extraction returns
        # a falsy value, so the join never sees a non-string.
        return "".join(page.extract_text() or "" for page in reader.pages)

def get_pdf_title(pdf_text):
    """Ask the chat model for a short, filename-friendly title for a document.

    Only the first 2000 characters of ``pdf_text`` are included in the
    prompt.

    Args:
        pdf_text: Text extracted from the document.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply has no text content.
    """
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'user', 'content': f'Extract the title from the following document:\n\n{pdf_text[:2000]}\n\n Do not use only capital letters and do not include and slashes forward backward etc. Make sure the title can be used as a filename. Keep it short and concise but clear e.g. not only operating manual or annual report. Title should not be longer than 50 characters. do not use acronyms\n\nTitle:'},
        ],
        max_tokens=150,
    )
    content = response.choices[0].message.content
    # The reply is used to build a filename, so stray leading/trailing
    # whitespace or newlines must not survive. A missing reply becomes ""
    # so callers always receive a string.
    return (content or "").strip()

def validate_title(title):
    """Check a candidate title against ``TitleModel``.

    Args:
        title: The candidate title to validate.

    Returns:
        The validated title on success; ``None`` if validation fails, in
        which case the validation error is printed.
    """
    try:
        model = TitleModel(title=title)
    except ValidationError as err:
        print(f"Invalid title: {err}")
        return None
    else:
        return model.title

def _safe_filename_stem(title):
    """Return ``title`` reduced to text that is safe to use as a filename stem."""
    # Path separators, characters reserved on Windows, and ASCII control
    # characters are replaced by spaces.
    unsafe = '<>:"/\\|?*' + "".join(map(chr, range(32)))
    cleaned = title.translate(str.maketrans(unsafe, " " * len(unsafe)))
    # Collapse whitespace runs, then drop leading/trailing dots and spaces so
    # the stem can never be "." or "..", or start with a dot.
    return " ".join(cleaned.split()).strip(". ")


def _unused_path(directory, stem, suffix=".pdf"):
    """Return a path for ``stem`` inside ``directory`` that does not exist yet."""
    candidate = os.path.join(directory, f"{stem}{suffix}")
    counter = 1
    while os.path.exists(candidate):
        counter += 1
        candidate = os.path.join(directory, f"{stem} ({counter}){suffix}")
    return candidate


def rename_and_move_pdf(folder_path, renamed_folder_path):
    """Rename each PDF in a folder after its generated title and move it.

    For every entry of ``folder_path`` whose name ends in ``.pdf`` (in any
    letter case), the text is extracted, a title is requested and validated,
    and the file is moved into ``renamed_folder_path`` under that title.
    The title is sanitised before it is used as a filename, and a numeric
    suffix such as `` (2)`` is appended when the target name is already
    taken, so an existing file is never overwritten.

    Files whose title is invalid, or empty after sanitising, are left in
    place. An error while processing one file is printed and the loop moves
    on to the next file.

    Args:
        folder_path: Folder containing the PDFs to process.
        renamed_folder_path: Existing folder that receives the renamed PDFs.

    Returns:
        None.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
    """
    for filename in os.listdir(folder_path):
        if not filename.lower().endswith(".pdf"):
            continue
        try:
            file_path = os.path.join(folder_path, filename)
            pdf_text = extract_text_from_pdf(file_path)
            title = get_pdf_title(pdf_text)

            valid_title = validate_title(title)
            # The title is model-generated text, so it is treated as
            # untrusted before being used as a path component.
            safe_stem = _safe_filename_stem(valid_title) if valid_title else ""
            if not safe_stem:
                print(f"Skipping renaming for {filename} due to invalid title.")
                continue

            # shutil.move may overwrite an existing destination file, so a
            # free name is picked first.
            new_file_path = _unused_path(renamed_folder_path, safe_stem)
            shutil.move(file_path, new_file_path)
            print(f"Renamed and moved {filename} to {os.path.basename(new_file_path)}")
        except Exception as e:
            print(f"{Fore.RED}An error occurred with {filename}: {e}{Style.RESET_ALL}")
            # Continue to the next file

# Folder paths
folder_path = './papers'  # Folder containing the PDFs
# The destination sits inside the source folder. rename_and_move_pdf lists
# only the top level of folder_path and only handles names ending in .pdf,
# so the 'renamed' subfolder itself is not picked up.
renamed_folder_path = './papers/renamed'  # Folder to move renamed PDFs to

# Create the destination folder if it doesn't exist (this also creates the
# parent ./papers folder when it is missing)
os.makedirs(renamed_folder_path, exist_ok=True)

# Process the PDFs. Per-file errors are handled inside rename_and_move_pdf;
# this outer handler only reports an error that escapes it (for example a
# failure to list the folder) instead of letting it propagate.
try:
    rename_and_move_pdf(folder_path, renamed_folder_path)
except Exception as e:
    print(f"{Fore.RED}An error occurred: {e}{Style.RESET_ALL}")