In [1]:
import json
import os
import fitz
import re
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import StructuredOutputParser
from langchain.prompts import ChatPromptTemplate
from langdetect import detect

# Read the API key from the environment so the secret is never stored in the notebook.
# Export GOOGLE_API_KEY before starting the kernel; a missing variable raises KeyError here.
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
# Chat model client shared by the rest of the notebook.
llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)

In [2]:
from langchain.output_parsers import ResponseSchema

In [3]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The document is opened with ``fitz.open`` as a context manager, each page's
    ``get_text()`` result is collected in page order, and the pieces are joined
    with no separator.
    """
    with fitz.open(pdf_path) as pdf_document:
        page_texts = [pdf_page.get_text() for pdf_page in pdf_document]
    return "".join(page_texts)

In [4]:
def clean_text(text):
    """Normalise ``text`` for prompting.

    Every character that is not an ASCII letter, a digit, or whitespace is
    removed; each run of whitespace is then collapsed to a single space and
    leading/trailing whitespace is trimmed.
    """
    disallowed_chars = re.compile(r'[^a-zA-Z0-9\s]')
    whitespace_run = re.compile(r'\s+')
    alnum_only = disallowed_chars.sub('', text)
    return whitespace_run.sub(' ', alnum_only).strip()

In [5]:
# One ResponseSchema per field the model is asked to return, in output order.
response_schemas = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in (
        ("Language", "Language used in the article"),
        ("Length", "Length of the article in words"),
        ("Grade_Level", "Grade level of the content"),
        ("Discipline", "Discipline or subject of the article"),
        ("Blooms_Level", "Bloom's taxonomy level of the content"),
    )
]

In [6]:
# Build a structured output parser from the field schemas, then ask it for the
# formatting instructions that get embedded in the prompt.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

In [7]:
# Sample of the expected JSON shape, with placeholder values; it is passed to
# the prompt as the `example` variable.
example = '''{
    "Language": "language used",
    "Length": "length of the article in words",
    "Grade_Level": "grade level",
    "Discipline": "discipline",
    "Blooms_Level": "bloom's level"
}'''

In [8]:
# Prompt text with three placeholders filled in at invoke time:
# `example`, `article`, and `format_instructions`.
# NOTE(review): the wording alternates between "article" and "review text" for
# the same input -- confirm whether "review text" is intentional.
template = """\
You are an educational curriculum expert system tasked to find specific information for the article. 
Identify the following items from the review text:
- Language used in the article
- Length of the article in words
- Grade level of the content (e.g., 10th grade, 12th grade, University)
- Discipline or subject of the article
- Bloom's taxonomy level of the content

The review text is delimited with triple backticks.

Take your time to analyze the review text and find the required information.
Output the response in the desired format as given in {example} but don't use the same value.

Make your response as short as possible.
Make sure to answer in the correct format.
Review text: ```{article}```

{format_instructions}

If the information isn't present, use 'unknown' as the value, not null or None.
"""

In [9]:
# Wrap the template string in a chat prompt; its input variables are the
# placeholders named in the template.
prompt = ChatPromptTemplate.from_template(template=template)

In [11]:
# Compose the prompt and the chat model into a single object that is invoked
# with the prompt's input variables.
chain = prompt | llm

In [12]:
chain

ChatPromptTemplate(input_variables=['article', 'example', 'format_instructions'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['article', 'example', 'format_instructions'], template="You are an educational curriculum expert system tasked to find specific information for the article. \nIdentify the following items from the review text:\n- Language used in the article\n- Length of the article in words\n- Grade level of the content (e.g., 10th grade, 12th grade, University)\n- Discipline or subject of the article\n- Bloom's taxonomy level of the content\n\nThe review text is delimited with triple backticks.\n\nTake your time to analyze the review text and find the required information.\nOutput the response in the desired format as given in {example} but don't use the same value.\n\nMake your response as short as possible.\nMake sure to answer in the correct format.\nReview text: ```{article}```\n\n{format_instructions}\n\nIf the information isn't present, us

In [15]:
# Location of the PDF to analyse. Set the ARTICLE_PDF_PATH environment variable
# to point at another file; the default keeps the path this notebook was
# originally run against, which only exists on that machine.
PDF_PATH = os.environ.get(
    "ARTICLE_PDF_PATH",
    r"C:\Users\ual-laptop\Downloads\resources\Chapter\10-Sc-Acid-Bases-and-Salts-Notes.pdf",
)
# Raw page text first, then the normalised form that is sent to the model.
text = extract_text_from_pdf(PDF_PATH)
cleaned_text = clean_text(text)

In [16]:
# Values for the three prompt placeholders, keyed by placeholder name.
inputs = dict(
    article=cleaned_text,
    example=example,
    format_instructions=format_instructions,
)

In [17]:
# Run the chain on the prepared inputs; the reply text is read from
# `response.content` in a later cell.
response = chain.invoke(inputs)

In [18]:
response

AIMessage(content='```json\n{\n\t"Language": "English",\n\t"Length": "1336",\n\t"Grade_Level": "10th grade",\n\t"Discipline": "Chemistry",\n\t"Blooms_Level": "Remembering"\n}\n```', response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': [{'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'probability': 'NEGLIGIBLE', 'blocked': False}]}, id='run-4ad37c4f-fb5d-4731-afa0-87531156be2c-0')

In [19]:
# Pull the JSON payload out of the model reply.
# str.strip() treats its argument as a *set of characters*, not a prefix or
# suffix, so stripping '```json\n' only worked by coincidence. Match the
# Markdown code fence explicitly instead, and fall back to the raw reply when
# the model returns bare JSON without a fence.
_fence_match = re.search(r"```(?:json)?\s*(.*?)\s*```", response.content, re.DOTALL)
json_string = (_fence_match.group(1) if _fence_match else response.content).strip()

In [20]:
# Decode the extracted JSON text; json.loads raises if the text is not valid JSON.
data = json.loads(json_string)

In [21]:
data

{'Language': 'English',
 'Length': '1336',
 'Grade_Level': '10th grade',
 'Discipline': 'Chemistry',
 'Blooms_Level': 'Remembering'}

In [22]:
# Unpack the five extracted fields; a key missing from `data` yields None.
language, length, grade_level, discipline, blooms_level = (
    data.get(field_key)
    for field_key in ("Language", "Length", "Grade_Level", "Discipline", "Blooms_Level")
)
# Learning_Objectives and Learning_Facets are not read here (previously commented out).

In [23]:
# Print one "Label: value" line per extracted field, in a fixed order.
for field_label, field_value in (
    ("Language", language),
    ("Length", length),
    ("Grade Level", grade_level),
    ("Discipline", discipline),
    ("Blooms Level", blooms_level),
):
    print(f"{field_label}: {field_value}")
# Learning_Objectives and Learning_Facets are not printed (previously commented out).

Language: English
Length: 1336
Grade Level: 10th grade
Discipline: Chemistry
Blooms Level: Remembering
