# test api 

In [1]:
import pandas as pd #all functions need to be pre-pended with 'pd.' e.g. the DataFrame function must be written as 'pd.DataFrame'
import numpy as np
from openai import OpenAI
import tiktoken
import time
import os 


from helper_file import chunk_text_with_overlap, RateLimiter, get_model_response, find_overlap, merge_chunks

# Tokenizer used to count tokens per line when sizing the chunks sent to the model.
enc = tiktoken.encoding_for_model('gpt-3.5-turbo')

# Directory the raw OCR text is read from.
load_path = './data/northern_star_raw'
# Directory the recovered text is written to.
save_path = './data/northern_star_recovered'

In [2]:
# Page to process; joined with load_path when reading and with save_path when writing.
file_name = 'northern_star_1837-12-02_ed_1_1_p_1.txt'

In [3]:
source_file = os.path.join(load_path, file_name)
with open(source_file, 'r') as handle:
    raw_text = handle.read()

# One list entry per line of the raw file (split on '\n' only).
lines = raw_text.split('\n')

In [4]:
def group_by_running_total(df, column, threshold):
    """
    Labels each row with a group number driven by a running sum of one column.

    Rows are visited in order. A row joins the current group while adding its
    value keeps the running sum at or below the threshold; otherwise it opens
    a new group and the running sum restarts from that row's value.

    Parameters:
    - df: pandas DataFrame. It is modified in place (a 'group' column is added).
    - column: Name (string) of the column whose values are summed.
    - threshold: Largest running sum a group may reach before a new one starts.

    Returns:
    - The same DataFrame object, now carrying the 'group' column.
    """
    labels = []  # Group number for each row, in row order
    current_group = 0
    accumulated = 0

    for amount in df[column]:
        candidate = accumulated + amount
        if candidate > threshold:
            # This row does not fit: it becomes the first row of a new group
            current_group += 1
            accumulated = amount
        else:
            accumulated = candidate

        labels.append(current_group)

    df['group'] = labels
    return df


import tiktoken  # Ensure tiktoken library is installed

def split_strings_by_tokens(strings, max_tokens, encoder=None):
    """
    Splits strings in the list that exceed max_tokens into smaller pieces,
    based on token count, and never discards any input text.

    A string that is too long is first split into two parts at a word
    boundary at or after its midpoint (the first boundary where both parts
    fit). If no such two-way split exists, the string is halved and each
    half is processed again, so very long strings end up in as many pieces
    as needed. A single word that is too long is halved by characters.

    Parameters:
    - strings: List of strings to be processed.
    - max_tokens: Maximum number of tokens allowed per string (must be >= 1).
    - encoder: Optional object with an ``encode(text)`` method returning a
      sequence of tokens. Defaults to the tiktoken encoding for
      'gpt-3.5-turbo'.

    Returns:
    - A new list of strings, in the original order. Strings within the limit
      are returned unchanged; pieces split at word boundaries are re-joined
      with single spaces.

    Raises:
    - ValueError: if max_tokens is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be at least 1")

    if encoder is None:
        encoder = tiktoken.encoding_for_model('gpt-3.5-turbo')  # Encoding for token counting

    def count_tokens(text):
        return len(encoder.encode(text))

    def split_piece(text):
        # Already short enough: keep the text exactly as it is.
        if count_tokens(text) <= max_tokens:
            return [text]

        words = text.split()

        # Look for a two-way split, starting at the middle word and moving right.
        for split_index in range(len(words) // 2, len(words)):
            first_half = ' '.join(words[:split_index])
            second_half = ' '.join(words[split_index:])
            if count_tokens(first_half) <= max_tokens and count_tokens(second_half) <= max_tokens:
                return [first_half, second_half]

        # No two-way split fits. Halve the text and process each half again
        # instead of silently dropping it.
        if len(words) > 1:
            middle = len(words) // 2
            halves = [' '.join(words[:middle]), ' '.join(words[middle:])]
        elif len(text) > 1:
            # A single over-long word (or whitespace run): halve by characters.
            middle = len(text) // 2
            halves = [text[:middle], text[middle:]]
        else:
            # A single character cannot be split further; keep it.
            return [text]

        pieces = []
        for half in halves:
            pieces.extend(split_piece(half))
        return pieces

    result = []  # Initialize the result list
    for string in strings:
        result.extend(split_piece(string))

    return result


In [48]:
# Break up any line longer than 3000 tokens so it can be passed to the model
lines = split_strings_by_tokens(lines, 3000)

# Token count of every line, kept in the same order as `lines`
token_counts = [len(enc.encode(line)) for line in lines]
token_data = pd.DataFrame({'tokens': token_counts})

In [49]:
# Assign consecutive lines to groups whose running token total stays within 1500.
# group_by_running_total adds the 'group' column to token_data in place and returns
# the same object, so grouped_df and token_data refer to the same DataFrame.
grouped_df = group_by_running_total(token_data, 'tokens', 1500)

In [50]:
# Largest single-line token count within each group (not the group's total).
grouped_df.groupby('group')['tokens'].max()

group
0      391
1      450
2      526
3     1054
4     1162
5     1123
6      269
7     1246
8     1198
9      876
10    1171
11     945
Name: tokens, dtype: int64

In [42]:
# Index of the last row (i.e. last line) that belongs to each group.
grouped_df.groupby('group').apply(lambda x: x.index[-1])

group
0     16
1     27
2     37
3     43
4     44
5     50
6     59
7     60
8     63
9     66
10    69
11    71
dtype: int64

In [43]:
# First 17 lines joined back into one text block for a single trial request.
# NOTE(review): 17 is hard-coded; it corresponds to group 0 ending at row index 16
# in the groupby output shown in this notebook — re-check if the input changes.
chunk = '\n'.join(lines[0:17])

In [9]:
# User prompt for a single trial chunk: describes the source newspaper and embeds
# the raw OCR text between ':::' markers.
prompt_text = f"""The below text is from "The Northern Star" a newspaper based in Leeds UK. the edition is from 2nd December 1837. 
The text cover may multiple articles and adverts. Each piece within the newspaper page is separated by at least one line, titles are on their own line, 
adverts begin "AD" with a number after with no space after the letters. Please recover the OCR and format the text appropriately':::'
::: {chunk} :::
"""

# NOTE(review): RateLimiter comes from helper_file; 50000 is presumably a
# tokens-per-minute budget — confirm against helper_file.
rate_limiter = RateLimiter(50000)

# Send the prompt with a system message and keep only the text of the first choice.
# NOTE(review): get_model_response is defined in helper_file; its return value is
# used here as a chat-completion-like object (.choices[0].message.content).
response = get_model_response(prompt_text, 'You are an expert in recovery of poor quality OCR.', 
                                rate_limiter, engine="gpt-3.5-turbo").choices[0].message.content



In [21]:


output_file = os.path.join(save_path, file_name)
with open(output_file, 'w') as handle:
    # Save the recovered text under the same file name as the raw input
    handle.write(response)


In [51]:
# Recovered text of every chunk, collected in page order.
response_list = []

rate_limiter = RateLimiter(50000)

# Index of the last line in each token group; each entry closes one chunk.
group_list = grouped_df.groupby('group').apply(lambda x: x.index[-1]).to_list()

system_message = 'You are an expert in recovery of poor quality OCR.'

# First line of the chunk currently being built.
group_start = 0

for chunk_num, last_line in enumerate(group_list):
    start_time = time.time()  # Start timing this chunk

    # Slice end is exclusive, so step one past the group's last line.
    group_end = last_line + 1
    chunk = '\n'.join(lines[group_start:group_end])

    prompt_text = f"""The below text is from "The Northern Star" a newspaper based in Leeds UK. the edition is from 2nd December 1837. 
The text cover may multiple articles and adverts. Each piece within the newspaper page is separated by at least one line, titles are on their own line, 
adverts begin "AD" with a number after with no space after the letters. Please recover the OCR and format the text appropriately':::'
::: {chunk} :::
"""

    completion = get_model_response(prompt_text, system_message,
                                    rate_limiter, engine="gpt-4-0125-preview")
    response = completion.choices[0].message.content
    response_list.append(response)

    chunk_time = time.time() - start_time  # Seconds spent on this chunk

    print(f"text chunk: {chunk_num}/{len(group_list)} complete, time taken {chunk_time:.2f} seconds")

    # The next chunk starts where this one ended.
    group_start = group_end


text chunk: 0/12 complete, time taken 40.17 seconds
text chunk: 1/12 complete, time taken 69.82 seconds
text chunk: 2/12 complete, time taken 24.98 seconds
text chunk: 3/12 complete, time taken 36.21 seconds
text chunk: 4/12 complete, time taken 31.09 seconds
text chunk: 5/12 complete, time taken 61.16 seconds
text chunk: 6/12 complete, time taken 27.21 seconds
text chunk: 7/12 complete, time taken 54.83 seconds
text chunk: 8/12 complete, time taken 37.89 seconds
text chunk: 9/12 complete, time taken 21.54 seconds
text chunk: 10/12 complete, time taken 36.34 seconds
text chunk: 11/12 complete, time taken 29.34 seconds


In [52]:


with open(os.path.join(save_path, 'page_1_gpt4.txt'), 'w') as handle:
    # Chunks are joined with a single space and written out as one document
    combined_text = ' '.join(response_list)
    handle.write(combined_text)