In [None]:
import langchain
import ollama
import os
import pypdf
import textract # appears broken for pdf on windows due to shell call
import warnings

In [None]:
# Paths of the two PDF documents to summarise and compare.
# NOTE: these are absolute paths on the author's machine; edit them to point at local copies.
# Document 1 is from: https://www.nature.scot/sites/default/files/2017-06/A2003405%20-%20Scotland%27s%20Wild%20Deer_%20A%20National%20Approach%202015-2020%20Priorities%234.pdf
document_1_location = "C:/Users/justi/Documents/GitHub/SmallProjects/documents/Scotland's Wild Deer_ A National Approach 2015-2020 Priorities.pdf"
# Document 2 is from: https://forestryandland.gov.scot/images/corporate/pdf/deer-management-on-scotlands-national-forest-estate.pdf
document_2_location = "C:/Users/justi/Documents/GitHub/SmallProjects/documents/deer-management-on-scotlands-national-forest-estate.pdf"

In [None]:
def extract_text(document_location: str) -> str:
    """
    Extract the plain text content of a document.

    The reader is chosen from the (case-insensitive) file extension:
    '.txt' files are read directly, '.pdf' files are read with pypdf,
    and every other extension is handed to textract.

    Parameters:
    document_location (str): Path of the document to read.

    Returns:
    str: The extracted text. For PDFs the text of all pages is concatenated
        and line breaks are replaced by spaces.
    """
    extension = os.path.splitext(document_location)[-1].lower()
    if extension == '.txt':
        # no encoding is given, so the platform default encoding is used
        with open(document_location) as document:
            return document.read()
    if extension == '.pdf':
        # https://pypdf.readthedocs.io/en/stable/user/extract-text.html
        reader = pypdf.PdfReader(document_location)
        # extract text page by page, joining once instead of repeated concatenation
        full_text = "".join(page.extract_text() for page in reader.pages)
        # remove linebreaks
        return full_text.replace('\n', ' ')
    # try to use textract - warning may be buggy - certainly fails pdfs
    # https://textract.readthedocs.io/en/stable/
    raw_text = textract.process(document_location)
    # textract.process returns a byte string (utf-8 by default); decode it so
    # callers always receive str, as the return annotation promises
    if isinstance(raw_text, bytes):
        return raw_text.decode('utf-8')
    return raw_text

In [None]:
# extract our documents to plain text
# (both locations end in .pdf, so extract_text takes its pypdf branch)
text_1 = extract_text(document_1_location)
text_2 = extract_text(document_2_location)

In [None]:
def recursive_summariser(
    text: str,
    model: str ='llama3:8b',
    input_chunk_tokens: int = 5000, # characters not tokens
    chunk_overlap: int = 20,
    compression_ratio: float = 5.0,
    max_summary_length: int = 5000, # characters not tokens
) -> str:
    """
    This function takes a text input and recursively summarises it until the summary fits a specified maximum length.
    The function splits the input text into chunks, summarises each chunk with an LLM served via Ollama,
    and then concatenates the summaries.
    If the concatenated summary is still too long, the function calls itself recursively on the summary,
    with the same settings, to compress further.

    The text is returned unchanged when it already fits within max_summary_length, or when it
    splits into a single chunk. Recursion also stops (returning the current summary) if a round
    of summarisation fails to make the text shorter, so the function always terminates.

    Parameters:
    text (str): The text to be summarised.
    model (str, optional): The Ollama model to be used for summarisation.
        Defaults to 'llama3:8b'.
    input_chunk_tokens (int, optional): The maximum number of characters (despite the name, not tokens)
        in each chunk of text to be summarised.
        Defaults to 5000.
    chunk_overlap (int, optional): The number of characters that consecutive chunks overlap.
        Defaults to 20.
    compression_ratio (float, optional): The desired compression ratio for each round of summarisation.
        Defaults to 5.0.
    max_summary_length (int, optional): The maximum length of the final summary in characters.
        Defaults to 5000.

    Returns:
    str: The summary of the input text.

    Raises:
    ValueError: If max_summary_length or compression_ratio is not positive.
    """
    if max_summary_length <= 0 or compression_ratio <= 0:
        raise ValueError("max_summary_length and compression_ratio must both be positive")

    input_length = len(text)
    print("Input_text_length: " + str(input_length))
    # nothing to do when the text already fits; this also keeps the adjusted
    # ratio below from ever dropping under 1 (which would ask for an expansion)
    if input_length <= max_summary_length:
        return text

    # see if we can reach the target length with a less aggressive compression ratio
    required_compression = input_length / max_summary_length
    # keep the caller's ratio untouched so it can be passed on when recursing
    effective_ratio = compression_ratio
    if required_compression < compression_ratio:
        # the 1.3 safety margin should ensure sufficient compression on the last
        # round of compression to reach a size close to optimal
        effective_ratio = required_compression * 1.3
        print(f"changed compressions ratio to : {effective_ratio}")

    # import the submodule explicitly: a bare `import langchain` is not
    # guaranteed to have loaded langchain.text_splitter
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    # split the passage based on the chunk length
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=input_chunk_tokens, # characters not words
        chunk_overlap=chunk_overlap,
        # add full stops and commas to the separators to try to get the most sensible splitting
        separators=["\n\n", "\n", ".", ",", " ", ""],
        length_function=len,
    )
    text_chunks = text_splitter.split_text(text)
    # handle the edge case that we are already down to a chunk smaller than the chunk size
    if len(text_chunks) <= 1:
        return text

    # summarise each chunk in turn
    responses = []
    for chunk in text_chunks:
        # split on any whitespace so repeated spaces are not counted as words
        word_count = len(chunk.split())
        # never ask the model for a zero word summary
        desired_word_count = max(1, int(word_count // effective_ratio))
        print(f"chunk length: {len(chunk)}, word count: {word_count}, desired word count: {desired_word_count}")
        query = f"""
You are a professional document summariser
Please summarise the following text from {word_count} words, down to {desired_word_count} words. 
Be careful to retain as much of the overall meaning of the text as possible in your summary. 
Include nothing but the summary in your reply. 
Do not say how many words it is summarised to. Do not mention that it is a summary.

Text:
{chunk}
"""
        # use the model the caller asked for rather than a hard-coded one
        response = ollama.chat(model=model, messages=[
          {
            'role': 'user',
            'content': query,
          },
        ])
        responses.append(response['message']['content'])

    # concatenate the chunks
    summary = "\n".join(responses)
    print("Summary length in characters: " + str(len(summary)))

    # if summary is short enough return it (measured against max_summary_length,
    # not against the input chunk size)
    if len(summary) <= max_summary_length:
        return summary
    # stop if the model did not shrink the text, otherwise we could recurse forever
    if len(summary) >= input_length:
        print("Warning summary is not shorter than its input, stopping recursion")
        return summary
    # otherwise call recursive_summariser on the summary, keeping the caller's settings
    return recursive_summariser(
        summary,
        model=model,
        input_chunk_tokens=input_chunk_tokens,
        chunk_overlap=chunk_overlap,
        compression_ratio=compression_ratio,
        max_summary_length=max_summary_length,
    )
    
def tidy_text(
    text: str,
    model: str ='llama3:8b',
) -> str:
    """
    This function takes a text string as input and tries to return a professionally edited version of the text.
    It does this using an LLM served via Ollama

    Parameters:
    text (str): The text to be edited.
    model (str, optional): The Ollama model to be used for editing.
        Defaults to 'llama3:8b'.

    Returns:
    str: The edited text, taken from the content of the model's reply.

    Note:
    The prompt asks the model for British English, for roughly the same word count as the
    input, and not to mention that the text was edited; the reply is returned as-is and is
    not checked against those requests.
    """

    query = f"""
You are a professional copy editor
Please edit the following text ensuring that it maintains a consistent grammatical style throughout.
Remove any references to summarisation or word counts.
Ensure you preserve all the factual meaning, but remove any incoherent text.
Break the text into meaningful paragraphs.
Your edited text should be approximately the same number of words as the original.
Your edited text must be written in British rather than American English.
Include nothing but the edited text in your reply. Do not mention that it is edited.

Text:
{text}
            """
    # use the model the caller asked for rather than a hard-coded one
    edited_summary = ollama.chat(model=model, messages=[
      {
        'role': 'user',
        'content': query,
      },
    ])
    return edited_summary['message']['content']

In [None]:
# summarise document 1 with the default settings, then copy-edit the result and show it
raw_summary_1 = recursive_summariser(text_1)
tidy_summary_1 = tidy_text(raw_summary_1)
print(tidy_summary_1)

In [None]:
# summarise document 2 with the default settings, then copy-edit the result and show it
raw_summary_2 = recursive_summariser(text_2)
tidy_summary_2 = tidy_text(raw_summary_2)
print(tidy_summary_2)

In [None]:
def compare_summaries(
    summary_1: str,
    summary_2: str,
    model:str = 'llama3:8b',
    max_length:int = 10000,
):
    """
    Ask an LLM served via Ollama to list how two passages are alike and how they differ.
    The prompt directs the model towards content and emphasis rather than style or phrasing,
    and requests two numbered lists labelled SIMILARITIES and DIFFERENCES.

    Parameters:
    summary_1 (str): The first passage to be compared.
    summary_2 (str): The second passage to be compared.
    model (str, optional): The Ollama model that performs the comparison.
        Defaults to 'llama3:8b'.
    max_length (int, optional): Prompt length in characters above which a warning is printed.
        Defaults to 10000.

    Returns:
    str: The content of the model's reply.

    Note:
    An over-long prompt is only reported with a printed warning; nothing is raised and the
    prompt is still sent.
    """

    prompt = f"""
Please describe the similarities and differences between the following two passages. 
Focus on similarites and differences of content and emphasis, rather than style or phrasing.
Provide your answers as two numbered lists labelled SIMILARITIES and DIFFERENCES.

Passage 1:
{summary_1}

passage 2:
{summary_2}
"""
    prompt_too_long = len(prompt) > max_length
    if prompt_too_long:
        print("Warning query may exceed context window")
    # a single user message carries the whole prompt
    chat_messages = [{'role': 'user', 'content': prompt}]
    reply = ollama.chat(model=model, messages=chat_messages)
    return reply['message']['content']

In [None]:
# compare the two edited summaries on content and emphasis, and show the model's answer
comparison = compare_summaries(tidy_summary_1, tidy_summary_2)
print(comparison)

In [None]:
def compare_strategies(
    summary_1: str,
    summary_2: str,
    model:str = 'llama3:8b',
    max_length:int = 10000,
):
    """
    Ask an LLM served via Ollama to compare the strategies set out in two passages.
    The prompt casts the model as a strategy expert, directs it towards strategy rather than
    style, tone or phrasing, and requests two numbered lists labelled SIMILARITIES and DIFFERENCES.

    Parameters:
    summary_1 (str): The first passage to be compared.
    summary_2 (str): The second passage to be compared.
    model (str, optional): The Ollama model that performs the comparison. Defaults to 'llama3:8b'.
    max_length (int, optional): Prompt length in characters above which a warning is printed. Defaults to 10000.

    Returns:
    str: The content of the model's reply.

    Note:
    An over-long prompt is only reported with a printed warning; nothing is raised and the
    prompt is still sent.
    """
    prompt = f"""
You are an enviromental strategy expert.
Please describe the similarities and differences in the strategies outlined by the following passages. 
Focus on similarites and differences of strategy, rather than style, tone or phrasing.
Provide your answers as two numbered lists labelled SIMILARITIES and DIFFERENCES.

Passage 1:
{summary_1}

passage 2:
{summary_2}
"""
    prompt_too_long = len(prompt) > max_length
    if prompt_too_long:
        print("Warning query may exceed context window")
    # a single user message carries the whole prompt
    chat_messages = [{'role': 'user', 'content': prompt}]
    reply = ollama.chat(model=model, messages=chat_messages)
    return reply['message']['content']

In [None]:
# compare the strategies described in the two edited summaries, and show the model's answer
strategy_comparison = compare_strategies(tidy_summary_1, tidy_summary_2)
print(strategy_comparison)