In [1]:
#!pip install langchain_community

In [2]:
import json
import os
import fitz
import re
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import StructuredOutputParser
from langchain.prompts import ChatPromptTemplate
from langdetect import detect
from langchain_community.llms import Ollama


llm = Ollama(model="llama2")

In [3]:
from langchain.output_parsers import ResponseSchema

In [4]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
            #print(text)
    return text

In [5]:
def clean_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    #print(len(cleaned_text))
    return cleaned_text

In [6]:
response_schemas = [
    ResponseSchema(name="Language", description="Language used in the article"),
    ResponseSchema(name="Length", description="Length of the article in words"),
    ResponseSchema(name="Grade_Level", description="Grade level of the content"),
    ResponseSchema(name="Discipline", description="Discipline or subject of the article"),
    ResponseSchema(name="Blooms_Level", description="Bloom's taxonomy level of the content"),
    ResponseSchema(name="Learning_Objectives", description="Learning objective of the content"),
    ResponseSchema(name="Learning_Facets", description="Learning facets addressed in the content")
]

In [7]:
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

In [8]:
example = '''{
    "Language": "language used",
    "Length": "length of the article in words",
    "Grade_Level": "grade level",
    "Discipline": "discipline",
    "Blooms_Level": "bloom's level",
    "Learning_Objectives":"learning objectives",
    "Learning_Facets":"learning facets"
}'''

In [9]:
template = """\
You are an educational curriculum expert system tasked to find specific information for the article. 
Identify the following items from the review text:
- Language used in the article
- Length of the article in words
- Grade level of the content (e.g., 10th grade, 12th grade, University)
- Discipline or subject of the article
- Bloom's taxonomy level of the content
- Learning objective of the content
- Learning facets of the content
The review text is delimited with triple backticks.

Take your time to analyze the review text and find the required information.
Output the response in the desired format as given in {example} but don't use the same value.

Make your response as short as possible.
Make sure to answer in the correct format.
Review text: ```{article}```

{format_instructions}

If the information isn't present, use 'unknown' as the value, not null or None.
"""

In [10]:
prompt = ChatPromptTemplate.from_template(template=template)

In [11]:
chain = prompt | llm

In [12]:
chain

ChatPromptTemplate(input_variables=['article', 'example', 'format_instructions'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['article', 'example', 'format_instructions'], template="You are an educational curriculum expert system tasked to find specific information for the article. \nIdentify the following items from the review text:\n- Language used in the article\n- Length of the article in words\n- Grade level of the content (e.g., 10th grade, 12th grade, University)\n- Discipline or subject of the article\n- Bloom's taxonomy level of the content\n- Learning objective of the content\n- Learning facets of the content\nThe review text is delimited with triple backticks.\n\nTake your time to analyze the review text and find the required information.\nOutput the response in the desired format as given in {example} but don't use the same value.\n\nMake your response as short as possible.\nMake sure to answer in the correct format.\nReview text: ```{article

In [13]:
pdf_dir = r"C:\Users\ual-laptop\Downloads\resources\Chapter"

In [14]:
all_files = os.listdir(pdf_dir)

In [15]:
pdf_files = [file for file in all_files if file.endswith('.pdf')]

In [None]:
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_dir, pdf_file)
    print(f"Processing file: {pdf_path}")
    text = extract_text_from_pdf(pdf_path)
    cleaned_text = clean_text(text)
    inputs = {
    "article": cleaned_text,
    "example": example,
    "format_instructions": format_instructions
    }
    response = chain.invoke(inputs)
    print(response)

Processing file: C:\Users\ual-laptop\Downloads\resources\Chapter\10-Sc-Acid-Bases-and-Salts-Notes.pdf
{
"Language": "English",
"Length": "1000-2000 words",
"Grade_Level": "High School",
"Discipline": "Chemistry",
"Blooms_Level": "Reasoning",
"Learning_Objectives": "Understand the concept of bleaching powder and its preparation. Identify the importance of bleaching powder in various industries.",
"Learning_Facets": ["Comprehension", "Application"]
}
Processing file: C:\Users\ual-laptop\Downloads\resources\Chapter\100per_science_cl10_nf_ch3.pdf


In [None]:
data_dict = output_parser.parse(response)

In [None]:
data_dict

In [None]:
language = data_dict.get("Language")
length = data_dict.get("Length")
grade_level = data_dict.get("Grade_Level")
discipline = data_dict.get("Discipline")
blooms_level = data_dict.get("Blooms_Level")
Learning_Objectives = data_dict.get("Learning_Objectives")
Learning_Facets = data_dict.get("Learning_Facets")

In [None]:
print(f"Language: {language}")
print(f"Length: {length}")
print(f"Grade Level: {grade_level}")
print(f"Discipline: {discipline}")
print(f"Blooms Level: {blooms_level}")
print(f"Learning_Objectives: {Learning_Objectives}")
print(f"Learning_Facets: {Learning_Facets}")