# **Assignment:**

Create an application to extract summary, language and keywords from a document weblink.

The expected output should be in JSON format:

{'summary': summary of the doc, 'language': language of the doc, 'keywords': extracted keywords}


Procedure (Hint):


Create a loader that loads a doc via a link.

Load the doc.

Check the page content.

Create a pydantic class with description to extract summary, language and keywords.

Create a prompt template

Within the prompt template, describe the template text and its input variables.

Create your chain: prompt | model | parser

The parser should be `JsonOutputParser` (this returns the output in JSON format).

Invoke your chain


In [None]:
!pip install -q groq langchain-groq langchain langchain-community litellm transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/108.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.9/108.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.3/409.3 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m60.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.5/49.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install langchain langchain_openai python-dotenv pydantic beautifulsoup4 requests



In [None]:
import os
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain.document_loaders import WebBaseLoader
from pydantic import BaseModel, Field
from typing import List
from dotenv import load_dotenv

In [None]:
def load_document(url: str) -> str:
    """Load the web page at ``url`` and return the text of the first document.

    Args:
        url: Address of the page handed to ``WebBaseLoader``.

    Returns:
        The ``page_content`` of the first document the loader produced.
    """
    first_document = WebBaseLoader(url).load()[0]
    return first_document.page_content

In [None]:
# Schema of the structured result requested from the model. It is handed to
# JsonOutputParser as ``pydantic_object``, and the parser's format
# instructions are inserted into the prompt.
# NOTE(review): documented with comments rather than a class docstring, since
# a docstring would presumably show up in the generated schema — confirm.
class DocumentAnalysis(BaseModel):
    # Short summary of the document.
    summary: str = Field(description="A concise summary of the document content")
    # Main language the document is written in.
    language: str = Field(description="The primary language used in the document")
    # Key terms or phrases, one list entry each.
    keywords: List[str] = Field(description="Key terms or phrases from the document")

In [None]:
# Prompt text. {document_content} is supplied per call; {format_instructions}
# is pre-filled below from the DocumentAnalysis parser.
template = (
    "Analyze the following document content and provide:\n"
    "1. A concise summary\n"
    "2. The primary language used\n"
    "3. Important keywords or key phrases\n"
    "\n"
    "Document content: {document_content}\n"
    "\n"
    "Provide the output in JSON format with the following structure:\n"
    "{format_instructions}\n"
)

# Only "document_content" stays open as an input variable; the format
# instructions are bound once here as a partial variable.
prompt = PromptTemplate(
    input_variables=["document_content"],
    partial_variables={
        "format_instructions": JsonOutputParser(
            pydantic_object=DocumentAnalysis
        ).get_format_instructions(),
    },
    template=template,
)

In [None]:
def create_chain():
    """Build the analysis pipeline: prompt -> Groq chat model -> JSON parser.

    Returns:
        A runnable chain. Invoke it with ``{"document_content": <text>}``;
        the final step is a ``JsonOutputParser`` built for
        ``DocumentAnalysis``.
    """
    # ChatGroq was used without ever being imported in this file, which made
    # every call fail with NameError. Import it here so the function is
    # self-contained (langchain-groq is installed by the notebook's pip cell).
    from langchain_groq import ChatGroq

    # temperature=0 requests the least random output from the model.
    model = ChatGroq(temperature=0)
    parser = JsonOutputParser(pydantic_object=DocumentAnalysis)
    chain = prompt | model | parser
    return chain

In [None]:
def analyze_document(url: str):
    """Run the analysis chain on the page at ``url``.

    Args:
        url: Address of the document to load and analyze.

    Returns:
        Whatever the chain's final parser step yields on success. If any
        step raises, a dict ``{"error": <message>}`` is returned instead of
        propagating the exception.
    """
    try:
        page_text = load_document(url)
        return create_chain().invoke({"document_content": page_text})
    except Exception as exc:
        # Best-effort boundary: report the failure to the caller as data.
        return {"error": str(exc)}

# Never hard-code the API key in the notebook: anything committed here is
# leaked to every reader of the file. Any key that was previously stored in
# this cell should be revoked and regenerated.
# Read the key from a local .env file or the existing environment instead,
# and fall back to an interactive prompt that does not echo the secret.
load_dotenv()
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass

    os.environ["GROQ_API_KEY"] = getpass("Enter your GROQ_API_KEY: ")

# Test with a URL
url = "https://vit.ac.in/all-courses/pg"
result = analyze_document(url)
print(result)

{'summary': 'This document is about VIT PG admissions and courses offered, including VITEEE 2025 application, MBA 2025 application, Ph.D/Direct Ph.D online application, UG foreign applications, and research guide directory. It also includes information about VIT campuses, scholarships, and research centers.', 'language': 'English', 'keywords': ['VIT', 'PG admissions', 'VITEEE 2025', 'MBA 2025', 'Ph.D/Direct Ph.D', 'UG foreign applications', 'research guide directory', 'VIT campuses', 'scholarships', 'research centers']}
