In [7]:
from typing import List
from dotenv import load_dotenv
import boto3

load_dotenv()


class Post:
    """A single article: its raw text plus the name it is stored under."""

    content: str
    filename: str

    def __init__(self, content: str, filename: str) -> None:
        self.filename = filename
        self.content = content

    def __str__(self) -> str:
        # Render the name first, then the body, each separated by a blank line.
        header = f"File Name: {self.filename}"
        body = f"Content:\n\n{self.content}"
        return f"{header}\n\n{body}"


def get_posts_from_bucket(bucket: str) -> List[Post]:
    """Download every markdown (``.md``) object in an S3 bucket as a ``Post``.

    The listing goes through the client's ``list_objects_v2`` paginator, so
    every page of results is read rather than only the first response.

    Args:
        bucket: Name of the S3 bucket to read from.

    Returns:
        One ``Post`` per key ending in ``.md``, in listing order. ``content``
        is the object body decoded as UTF-8 and ``filename`` is the full key.
    """
    s3_client = boto3.client("s3")
    paginator = s3_client.get_paginator("list_objects_v2")

    markdown_keys = [
        entry["Key"]
        for page in paginator.paginate(Bucket=bucket)
        # A page without any objects carries no "Contents" entry.
        for entry in page.get("Contents", [])
        if entry["Key"].endswith(".md")
    ]

    posts: List[Post] = []
    for key in markdown_keys:
        response = s3_client.get_object(Bucket=bucket, Key=key)
        content = response["Body"].read().decode("utf-8")
        posts.append(Post(content=content, filename=key))

    return posts


def read_markdown_file(file_path: str) -> str:
    """Return the full text of the file at ``file_path``, read as UTF-8."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return handle.read()


# Load every markdown post from the S3 bucket once; later cells reuse `posts`.
posts = get_posts_from_bucket(bucket="be-awesome-dev-posts")


In [64]:
from langchain_core.prompts import ChatPromptTemplate
from typing_extensions import Annotated, TypedDict
from pydantic import Field, BaseModel


# Schema passed to `with_structured_output(...)` so model replies are parsed
# into these two fields.
# NOTE(review): notes are kept as `#` comments on purpose; a class docstring
# may end up in the generated schema description - confirm before adding one.
class Summary(BaseModel):
    # Headline for the article; the description asks for at most 10 words.
    main_topic: str = Field(
        description="The overall topic of the whole article, this should be only 10 words long at maximum",
    )
    # Body of the summary; the description asks for at most 10 sentences.
    summary_content: str = Field(
        description="The content of the summary, this should be very concise and should only be 10 sentences max in length.",
    )


# class Summary(TypedDict):
#     main_topic: str = Annotated[
#         str,
#         ...,
#         "The overall topic of the whole article, this should be only 10 words long at maximum",
#     ]
#     summary_content: str = Annotated[
#         str,
#         ...,
#         "The content of the summary, this should be very concise and should only be 10 sentences max in length.",
#     ]


def init_summary_chain(llm):
    """Pipe the article-summary prompt into ``llm`` and return the chain.

    The prompt expects three inputs when the chain is invoked:
    ``format_instructions``, ``article_name`` and ``content``.
    """
    template_text = """
  You are a helpful assistant that can give summary over markdown documents.
  Given a markdown article, summarise its content with the following requirement:

  * DO NOT write code in your summary.
  * DO NOT include code examples in your summary, you must keep the summary as concise as possible.
  * For each article, briefly describe what it is about overall and mention the main topics without further explanation.
  * Be as concise as possible with your summary.

  {format_instructions}

  Below is the markdown name of the article and its content:

  Article name: 

  {article_name}

  Content:

  {content}
  """

    return ChatPromptTemplate.from_template(template_text) | llm


def generate_summaries(
    posts: List[Post],
    summary_chain,
    output_file_name: str,
    format_instructions: str = "",
):
    """Summarise each post with ``summary_chain`` and write the results to a file.

    Args:
        posts: Posts to summarise; each supplies ``content`` and ``filename``.
        summary_chain: Object whose ``invoke`` is called once per post with the
            keys ``content``, ``article_name`` and ``format_instructions``. Each
            result must expose ``main_topic`` and ``summary_content``.
        output_file_name: Path of the text file that is (over)written with the
            collected summaries.
        format_instructions: Text substituted into the prompt's
            ``format_instructions`` slot; empty by default.

    A chain can hand back ``None`` instead of a summary. Such posts are
    reported and skipped, so one failed article does not raise while writing
    and discard the summaries that were already produced.
    """
    summaries: List[Summary] = []
    for post in posts:
        # Only the last path segment of the key is shown to the model.
        article_name = post.filename.split("/")[-1]
        response = summary_chain.invoke(
            {
                "content": post.content,
                "article_name": article_name,
                "format_instructions": format_instructions,
            }
        )

        print(response)
        if response is None:
            print(f"No summary returned for {article_name}, skipping")
            continue
        summaries.append(response)

    with open(output_file_name, "w", encoding="utf-8") as output_file:
        for s in summaries:
            output_file.write(f"Topic: {s.main_topic}\n")
            output_file.write(f"Summary:\n{s.summary_content}\n\n")
            output_file.write("\n\n===================================\n\n")
            print(f"Written summary content of {s.main_topic} to file")

In [36]:
from langchain_ollama import ChatOllama


# Ollama chat model wrapped so that each reply is parsed into a `Summary`.
ollama_model = ChatOllama(
    model="llama3.1:8b", temperature=0, verbose=True, num_ctx=10000
).with_structured_output(Summary)

# Build the summarisation chain and write the summaries to ollama_summary.md.
ollama_chain = init_summary_chain(llm=ollama_model)
generate_summaries(
    posts=posts, summary_chain=ollama_chain, output_file_name="ollama_summary.md"
)


main_topic='Modern Javascript Syntax' summary_content='This article provides a summary of modern javascript syntax, including the spread operator, destructuring operator, shorthand syntax, optional chaining operator, and arrow function. It also covers their usage and examples.'
main_topic='Array Methods in JavaScript' summary_content='This article introduces popular array methods in JavaScript, including .map(), .forEach(), .filter(), .find(), .findIndex(), .reduce(), .some(), .every(), .slice(), and .splice(). Each method is explained with examples, highlighting their usage and benefits. The article aims to provide a comprehensive guide for developers to efficiently work with arrays in JavaScript.'
main_topic='Understanding Object Methods in JavaScript' summary_content="This article explains object methods in JavaScript. It covers the Object.keys(), Object.values(), and Object.entries() methods for working with objects, as well as the Object.assign() method for copying object content.

In [67]:
from langchain_aws import ChatBedrockConverse, ChatBedrock
from langchain_core.output_parsers import PydanticOutputParser


def init_chat_model(
    region_name: str = "us-west-2",
    role_arn: str = "arn:aws:iam::629872170007:role/bedrock-consumer",
    role_session_name: str = "be-awesome-dev-bedrock-consumer",
    model_id: str = "meta.llama3-1-70b-instruct-v1:0",
):
    """Build a Bedrock chat model that runs under an assumed IAM role.

    Calling it with no arguments uses the defaults listed below.

    Args:
        region_name: AWS region used for the STS session, the Bedrock runtime
            client and the chat model, so the three cannot drift apart.
        role_arn: ARN of the IAM role to assume before calling Bedrock.
        role_session_name: Session name recorded for the assumed role.
        model_id: Bedrock model identifier passed to ``ChatBedrock``.

    Returns:
        A ``ChatBedrock`` instance bound to a ``bedrock-runtime`` client that
        uses the assumed role's temporary credentials.
    """
    boto_session = boto3.Session(region_name=region_name)
    sts_client = boto_session.client("sts")

    assumed_role = sts_client.assume_role(
        RoleArn=role_arn,
        RoleSessionName=role_session_name,
    )

    # Temporary credentials issued by STS; the client below is built once and
    # is not refreshed, so rebuild the model for long-running sessions.
    credentials = assumed_role["Credentials"]

    bedrock_client = boto3.client(
        "bedrock-runtime",
        aws_access_key_id=credentials["AccessKeyId"],
        aws_secret_access_key=credentials["SecretAccessKey"],
        aws_session_token=credentials["SessionToken"],
        region_name=region_name,
    )

    model = ChatBedrock(
        model_id=model_id,
        region_name=region_name,
        client=bedrock_client,
    )

    return model


# Disabled alternative: take format instructions from a PydanticOutputParser.
# output_parser = PydanticOutputParser(pydantic_object=Summary)
# Wrap the Bedrock model for `Summary` output and build the summarisation chain.
bedrock_llm = init_chat_model().with_structured_output(Summary)
bedrock_chain = init_summary_chain(llm=bedrock_llm)
# Disabled: full run over every post.
# generate_summaries(
#     posts=posts,
#     summary_chain=bedrock_chain,
#     output_file_name="bedrock_summary.md",
#     # format_instructions=output_parser.get_format_instructions(),
# )
# Smoke test on the first post only.
# NOTE(review): the recorded output of this cell is `None`, i.e. no parsed
# `Summary` came back - TODO confirm that this model/client combination
# supports structured output (ChatBedrockConverse is imported but unused).
response = bedrock_chain.invoke(
    {
        "content": posts[0].content,
        "article_name": posts[0].filename,
        "format_instructions": "",
    }
)
print(response)


None
