In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import openai
import json
import os
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage
from langchain import LLMChain
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import ChatPromptTemplate
import fitz  # PyMuPDF
import re
import time

In [4]:
"""openai_api_base=<openai_api_base>
openai_api_version=<openai_api_version>
deployment_name=<deployment_name>
openai_api_key = <openai_api_key>
openai_api_type=<openai_api_type>
"""

'openai_api_base=<openai_api_base>\nopenai_api_version=<openai_api_version>\ndeployment_name=<deployment_name>\nopenai_api_key = <openai_api_key>\nopenai_api_type=<openai_api_type>\n'

In [5]:
llm = AzureChatOpenAI(
    openai_api_base=openai_api_base,
    openai_api_version=openai_api_version,
    deployment_name=deployment_name,
    openai_api_key=openai_api_key,
    openai_api_type=openai_api_type,
)

  warn_deprecated(


In [6]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

def clean_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text


response_schemas = [
    ResponseSchema(name="Language", description="Language used in the article"),
    ResponseSchema(name="Length", description="Length of the article in words"),
    ResponseSchema(name="Grade_Level", description="Grade level of the content"),
    ResponseSchema(name="Discipline", description="Discipline or subject of the article"),
    ResponseSchema(name="Blooms_Level", description="Bloom's taxonomy level of the content"),
    ResponseSchema(name="Learning_Objectives", description="Learning objectives covered in the article"),
    ResponseSchema(name="Learning_Facets", description="Learning facets addressed in the article")
]


output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()


example = '''{
    "Language": "English",
    "Length": "1500 words",
    "Grade_Level": "High School",
    "Discipline": "Chemistry",
    "Blooms_Level": "Application",
    "Learning_Objectives": "Understand acid-base theories",
    "Learning_Facets": "Experimental analysis, problem-solving"
}'''


template = """\
You are an educational curriculum expert system tasked to find specific information for the article. 
Identify the following items from the review text:
- Language used in the article
- Length of the article in words
- Grade level of the content (e.g., 10th grade, 12th grade, University)
- Discipline or subject of the article
- Bloom's taxonomy level of the content
- Learning objectives covered in the article
- Learning facets addressed in the article

The review text is delimited with triple backticks.

Take your time to analyze the review text and find the required information.
Output the response in the desired format as given in {example} but don't use the same value.

Make your response as short as possible.
Make sure to answer in the correct format.
Review text: ```{article}```

{format_instructions}

If the information isn't present retry again two times and then use 'unknown' as the value, not null or None.
"""


prompt = ChatPromptTemplate.from_template(template=template)

def process_pdfs_in_directory(directory):
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                pdf_path = os.path.join(root, file)
                print(f"Processing file: {pdf_path}")
                try:
                    # Extract and clean text from the PDF
                    article_text = extract_text_from_pdf(pdf_path)
                    article_text = clean_text(article_text)
                    
                    # Use the LLMChain to process the prompt
                    chain = prompt | llm
                    inputs = {
                        "article": article_text,
                        "example": example,
                        "format_instructions": format_instructions
                    }
                    response = chain.invoke(inputs)
                    json_string = response.content.strip('```json\n').strip('\n```')
                    data = json.loads(json_string)
                    language = data.get("Language")
                    length = data.get("Length")
                    grade_level = data.get("Grade_Level")
                    discipline = data.get("Discipline")
                    blooms_level = data.get("Blooms_Level")
                    Learning_Objectives = data.get("Learning_Objectives")
                    Learning_Facets = data.get("Learning_Facets")
                    print(f"Length: {length}")
                    print(f"Grade Level: {grade_level}")
                    print(f"Discipline: {discipline}")
                    print(f"Blooms Level: {blooms_level}")
                    print(f"Learning_Objectives: {Learning_Objectives}")
                    print(f"Learning_Facets: {Learning_Facets}")
                    print("----------------------------------------------------------------------------------------")
                except Exception as e:
                    print(f"Error processing {pdf_path}: {e}")
                    print("----------------------------------------------------------------------------------------")
                time.sleep(3)
                
main_directory = r"C:\Users\ual-laptop\Downloads\resources"
process_pdfs_in_directory(main_directory)

Processing file: C:\Users\ual-laptop\Downloads\resources\Article\2024-02-genome-snakefly-evolutionary-history.pdf
Length: 800 words
Grade Level: High School
Discipline: Biology
Blooms Level: Analysis
Learning_Objectives: Understand the evolutionary history of snakeflies
Learning_Facets: Genomic analysis, phylogenetic analysis
----------------------------------------------------------------------------------------
Processing file: C:\Users\ual-laptop\Downloads\resources\Article\2024-03-magnetic-avalanche-triggered-quantum-effects.pdf
Length: 700 words
Grade Level: High School
Discipline: Physics
Blooms Level: Application
Learning_Objectives: Understand quantum Barkhausen noise
Learning_Facets: Experimental analysis, quantum mechanics
----------------------------------------------------------------------------------------
Processing file: C:\Users\ual-laptop\Downloads\resources\Article\2024-03-reveal-evolutionary-path-important-proteins.pdf
Length: 550 words
Grade Level: High School
Disc