In [17]:
!pip install python-dotenv
!pip install together
!pip install langchain-together
!pip install langchain
!pip install langchain-community
!pip install psycopg2-binary


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgr

In [None]:
!pip freeze > requirements.txt

In [1]:
%load_ext dotenv
%dotenv

In [2]:
import os
from typing import List, Optional
from langchain.chat_models import init_chat_model
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
import psycopg2
# Credentials are read from the process environment (the %dotenv cell above is
# expected to have loaded them); each value is None when its variable is unset.
API_TOKEN: Optional[str] = os.environ.get("LLM_API_TOKEN")
DB_CONN_STRING: Optional[str] = os.environ.get("DB_CONN_STRING")

## Pydantic Classes for SDE

In [3]:
from pydantic import BaseModel, Field
from typing import Deque, List, Optional, Tuple

# Need to somehow cross check category
# Schema for one extracted issue. It reaches the model through IssueList, which is
# passed to `with_structured_output`, so the docstring and Field descriptions below
# presumably act as instructions to the LLM - treat them as prompt text, not comments.
# NOTE(review): the 300 / 50 character limits are only stated in the description
# text; no length constraint is declared on either field.
class Issue(BaseModel):
    """A single issue, problem, or event that can be addressed by political parties, typically for them to form policy around."""
    
    # Long, party-neutral statement of the issue (stored in the "Description" column on insert).
    description: str = Field(description="""
    A long (max 300 characters) description of the issue at hand
    It should not be tied to any political party and simply be a description of the issue discussed.
    No details of a solution should be included, remain very general. Describe the **overarching** issue at hand.
    it should be able to be answered with a binary agree/disagree statement. Required.
    """)
    # Short headline for the issue (stored in the "Summary" column on insert).
    summary: str = Field(description="A short (max 50 characters), concise, summary of the issue. again, it should not be tied to any political party and simply be a description of the issue discussed. Required.")

# Schema for one party position on one issue. Instances are produced by the stance
# extractor (wrapped in StanceList) and are also built by hand for the few-shot examples.
class Stance(BaseModel):
    """A political party's stance on an existing issue"""
    
    # Plain int: nothing in this class checks that the ID exists in the issue list given to the model.
    issue_id: int = Field(description = "The ID of issue that is being discussed")
    # True = agrees, False = disagrees (per the field description).
    stand: bool = Field(description = "A boolean value indicating whether the party disagrees or agrees with the contents of the issue")
    # NOTE(review): the 1000-character limit is stated in the description only, not enforced.
    reason: str = Field(description = "A description on why the party has this stance, maximum 1000 characters")
    # Intended to be a verbatim quote from the source document.
    evidence: str = Field(description = "Lines verbatim from the document that support this stance")

# Container schema handed to `with_structured_output` for stance extraction.
class StanceList(BaseModel):
    """A list of stances for issues that are explicitly commented on in the document. If no issues are discussed, this list should be empty."""
    
    # Annotated Optional, so consumers should be prepared for None as well as an
    # empty list (the few-shot example loop only checks truthiness).
    stances: Optional[List[Stance]] = Field(
        description="The list of stances. Only include items if the stance is clearly and explicitly stated in the document. Do not include entries for issues not discussed."
    )

# Container schema handed to `with_structured_output` for issue extraction;
# one IssueList is returned per text chunk.
class IssueList(BaseModel):
    "A list of issues addressed in the context provided"

    # No Field description is attached here; only the class docstring describes the list.
    issues: List[Issue]

# Schema used to make the model pick one party out of the list read from the "Party" table.
class Party(BaseModel):
    "A political party"

    # Expected to match an ID from the party list embedded in the prompt (not validated here).
    party_id: int = Field(description = "The exact ID of the party in question")
    name: str = Field(description = "The name of the party")
    short_name: str = Field(description = "The short name of the party, usually 2-3 letters long")

## Reading WP manifesto data (from .txt)
We split the text into chunks of roughly 6,000 tokens (`chunk_size=5800` in the code below). We use this approach because these texts are very information-dense, so it is unlikely that splitting the text makes us miss data that needs to be captured. We do risk extracting duplicate items, though.

In [4]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import TokenTextSplitter

# Read the files under ./data/ as plain text.
loader = DirectoryLoader("./data/", loader_cls=TextLoader)
docs = loader.load()

# Fail early with a clear message instead of an opaque IndexError on docs[0] below.
if not docs:
    raise FileNotFoundError(
        "No documents were loaded from ./data/ - add the manifesto text file before running this cell."
    )

text_splitter = TokenTextSplitter(
    # Controls the size of each chunk
    chunk_size=5800,
    # Controls overlap between chunks
    chunk_overlap=20,
)

# Only the first loaded document is chunked; any other files in ./data/ are ignored here.
texts = text_splitter.split_text(docs[0].page_content)

In [5]:
# Chat model handle used by the extractors built in the cells below.
model = init_chat_model(
    "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
    model_provider="together",
    api_key=API_TOKEN,
)

# Obtain the Party in question

In [8]:
# Read every row of the "Party" table into `rows`.
conn = psycopg2.connect(DB_CONN_STRING)
try:
    cur = conn.cursor()
    try:
        cur.execute('SELECT * FROM "Party";')

        # Fetch all rows
        rows = cur.fetchall()
    finally:
        # Clean up the cursor even if the query raised.
        cur.close()
finally:
    # Always release the connection, so a failed query does not leak it.
    conn.close()

In [9]:
rows

[(1,
  'Coalition for Shakira',
  'CFS',
  'https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Fmedia1.tenor.com%2Fm%2FnsIzrgTUb6sAAAAC%2Fmonday-left-me-broken-cat.gif&f=1&nofb=1&ipt=037bc199f0a9705f2ffda49a41302c4a674c8d69748df626cd8e70491f1f379d',
  '#FFD700',
  True),
 (2,
  "Traditionalists' Party",
  'TP',
  'https://external-content.duckduckgo.com/iu/?u=http%3A%2F%2Fpkbnews.in%2Fwp-content%2Fuploads%2F2023%2F09%2FBlue-Smurf-Cat-Meme.jpg&f=1&nofb=1&ipt=075c2e738b6abfc14555b49cfe8fe2d14433f12cdec84ab46b87516cca95278f',
  '#1E90FF',
  True),
 (4, 'Workers Party', 'WP', 'www.com', 'red', True)]

In [10]:
# Render each party row as "(ID: ..., Name: ..., ShortName: ...)" from its first
# three columns and join the entries with ", " for use in the prompt.
party_data = ", ".join(
    f"(ID: {row[0]}, Name: {row[1]}, ShortName: {row[2]})" for row in rows
)

In [13]:
# The party list comes from the database and is concatenated into the template text,
# so literal braces must be doubled; otherwise the prompt template would treat
# "{...}" inside a party name as an input variable and fail at invoke time.
escaped_party_data = party_data.replace("{", "{{").replace("}", "}}")

prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Extract the correct party that the document is referring to from the predefined list of parties below"
        ),
        ("system", "Party List: " + escaped_party_data),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{text}"),
    ]
)
# Constrain the model's reply to the Party schema.
extractor = prompt_template | model.with_structured_output(schema=Party)

In [16]:
# Identify the party from the first text chunk only.
party = extractor.invoke(texts[0])
# NOTE(review): `party` is only printed here; the later issue query uses a hard-coded party_id.
print(party)

party_id=4 name='Workers Party' short_name='WP'


## Create an extractor for issues

In [24]:
# Constrain the model's reply to the IssueList schema.
structured_llm = model.with_structured_output(schema=IssueList)

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            # Adjacent literals are concatenated, so each one must end with a space;
            # the second sentence previously ran straight into the third.
            "You are an expert extraction algorithm. "
            "Extract issues from the text, keeping the issue generalizable. Ensure that character limits are adhered to. "
            "It should make sense to assign agree/disagree values to Issues."
        ),
        ("human", "{text}"),
    ]
)
# NOTE: this rebinds `prompt_template` and `extractor`, replacing the party extractor defined earlier.
extractor = prompt_template | structured_llm

## Extract issues

In [26]:
# Run the issue extractor over every text chunk, with at most two requests in flight at once.
extractions = extractor.batch(
    [dict(text=chunk) for chunk in texts],
    config={"max_concurrency": 2},
)

In [27]:
# Flatten the per-chunk results into one list of Issue objects.
issues = []
for extraction in extractions:
    # Structured-output calls can come back as None when the model's reply contains
    # no parsable result; skip such chunks instead of failing on `.issues`.
    if extraction is None:
        continue
    issues.extend(extraction.issues)

# Insert issues into database
TODO: check before adding

In [29]:
# Open a new connection for the insert below (the one used to read parties was already closed).
conn = psycopg2.connect(DB_CONN_STRING)

In [30]:
# One (description, summary) tuple per extracted issue, matching the INSERT's placeholder order.
data = [(issue.description, issue.summary) for issue in issues]

In [31]:
cur = conn.cursor()
try:
    # Every new issue is stored with "Active" = false.
    cur.executemany('INSERT INTO "Issue" ("Description", "Summary", "Active") VALUES (%s, %s, false);', data)
    # Commit only after the whole batch succeeded.
    conn.commit()
finally:
    # Clean up even when the insert fails, so the cursor and connection are not leaked;
    # closing without a commit leaves nothing half-applied.
    cur.close()
    conn.close()

# GET issues from database with ID

In [32]:
# Placeholder: hard-coded party ID (ID 4 is 'Workers Party' in the table output shown earlier).
# NOTE(review): presumably this should come from the `party` extracted above - confirm.
party_id = 4

In [10]:
# Above is placeholder !!!!

In [33]:
conn = psycopg2.connect(DB_CONN_STRING)
try:
    cur = conn.cursor()
    try:
        # Obtain ONLY the issues that are not answered by this party yet
        # (party_id is bound as a query parameter, not interpolated into the SQL).
        cur.execute("""
SELECT * FROM "Issue" i
WHERE NOT EXISTS (
    SELECT 1
    FROM "Stance" s
    WHERE i."IssueID" = s."IssueID"
    AND s."PartyID" = %s
);
""", (party_id,))
        # NOTE: this rebinds `rows`, which previously held the "Party" table rows.
        rows = cur.fetchall()
    finally:
        # Release the cursor even if the query raised.
        cur.close()
finally:
    # Always release the connection, so a failed query does not leak it.
    conn.close()

## Extract stances

Workflow: Get issues from database and/or from extracted values
Use this to inform stance generation

In [35]:
# Batch the pending issue rows into groups of at most 10; the last group may be shorter.
issue_chunks = [rows[start:start + 10] for start in range(0, len(rows), 10)]

In [34]:
from langchain_core.utils.function_calling import tool_example_to_messages
from langchain_core.messages import SystemMessage, HumanMessage

# Few-shot examples. Each entry is:
#   (document text, expected StanceList, issue list shown alongside the text)
examples = [
    (
        "Singapore should ban cars in order to hit our carbon green targets by 2040",
        StanceList(
            stances=[
                Stance(
                    issue_id=2,
                    stand=True,
                    reason="To hit carbon green targets",
                    evidence="Singapore should ban cars in order to hit our carbon green targets by 2040",
                ),
            ]
        ),
        "(ID: 123, Description: 'Singapore should allow cars to drift in roads')\n(ID: 2, Description: 'Singapore should ban cars.')",
    ),
    (
        "Singapore should deploy more soldiers overseas to assist in foreign aid efforts",
        StanceList(stances=[]),
        "(ID: 124, Description: 'Singapore should allow dogs to fly.')\n(ID: 2, Description: 'Singapore should ban cars.')",
    ),
]


messages = []

for txt, tool_call, issueStr in examples:
    # Closing AI message for the example (this final message is optional for some providers).
    ai_response = (
        "Detected a stance related to one of the provided issues, returning stance list with only 1 stance."
        if tool_call.stances
        else "Detected no stances related to any of the provided Issues, returning empty stance list"
    )
    example = tool_example_to_messages(txt, [tool_call], ai_response=ai_response)
    # Show the example's issue list first, then the worked example for that list.
    messages.append(HumanMessage(content="Example Issue List: " + issueStr))
    messages.extend(example)

In [37]:
# Setup the prompt and extractor
from langchain.globals import set_verbose

# Turn on LangChain's global verbose mode for the calls that follow.
set_verbose(True)
stance_prompt_template = ChatPromptTemplate.from_messages(
        [
            (
                # Earlier, stricter draft of the system prompt, kept for reference:
                #"system",
                #"You are a strict and precise extraction algorithm."
                #"Your job is to extract a stance taken in the document if and only if:"
                #"a) the stance taken directly corresponds to the description of an issue, of which the issue list will be provided\n"
                #"b) the stance is clear and explicit, and not an inference of unstated opinions or tones. **DO NOT** create a stance stating that a party does not agree just because they did not comment on a particular issue. \n"
                #"c) there is no other stance for this issue - that is, there should only be a maximum of 1 stance per issue, and a minimum of 0 stances per issue\n"
                #"Only use the issues provided below (Issue List: [....]). **DO NOT** refer to any new issues.\n"
                #"IMPORTANT: Do not return a stance for any issue that is not explicitly mentioned and discussed in the document. If the issue is not discussed, it must be left out of the result entirely. **Do not** return any stances that say 'no comment' or 'not mentioned'. These are considered invalid."
                "system",
                # Adjacent literals are concatenated, so sentence boundaries need explicit spaces.
                # Fixed here: "an political", the missing space before "Your job", and "if and only the".
                "You are a political extraction algorithm that does not paraphrase, is neutral, and does not assume anything. "
                "Your job is to extract 0 or more stances in a document if and only if the stance directly corresponds to an issue with an existing ID."
                " Only refer to the **latest** issue list provided. Do not make any leap in judgements. The stance should be exactly related to the issue at hand."
            ),
            MessagesPlaceholder("examples"),  # <-- EXAMPLES! (few-shot messages supplied at invoke time)
            ("human", "Issue List: [{issues}]"),
            ("human", "{text}"),
        ]
    )
# Constrain the model's reply to the StanceList schema.
stance_extractor = stance_prompt_template | model.with_structured_output(StanceList)

In [47]:
# Iterate over issue chunks, format them and then extract.
# `extracted` ends up as a flat list of Stance objects across all issue chunks and text chunks.
extracted = []
for issue_list in issue_chunks:
    issueStr = "\n".join(f"(ID: {i[0]}, Description: {i[1]})" for i in issue_list)
    # determines chunk size for input
    # max_tokens - issues length - max return tokens - buffer
    # NOTE: len(issueStr) is a character count used as a rough stand-in for the
    # issue list's token count; the few-shot example messages are not budgeted for.
    remainding_tokens = 8193 - len(issueStr) - 2143 - 500
    if remainding_tokens <= 20:
        # The splitter needs a chunk size larger than its 20-token overlap.
        raise ValueError(
            f"Issue list is too long ({len(issueStr)} characters) to leave room for document text."
        )
    text_splitter = TokenTextSplitter(
        # Controls the size of each chunk
        chunk_size=remainding_tokens,
        # Controls overlap between chunks
        chunk_overlap=20,
    )
    chunked_text = text_splitter.split_text(docs[0].page_content)
    for text in chunked_text:
        extractions = stance_extractor.invoke({"text": text, "issues": issueStr, "examples": messages})
        # `extractions` is a StanceList. Iterating a pydantic model yields
        # (field name, value) pairs, so extending with the model itself would append
        # ("stances", [...]) tuples; collect the Stance objects instead.
        # Also tolerate a missing result or a None stance list.
        if extractions is not None and extractions.stances:
            extracted.extend(extractions.stances)

In [None]:
extracted

In [None]:
# Ad-hoc check: show the row(s) in `rows` whose first column equals 138.
[row for row in rows if row[0] == 138]