# Assessing the attitudes towards the proposed new ACMA powers to combat misinformation and disinformation


## Generate prompt

This function takes the submission text and the submission author (derived from
the document name) as parameters and returns the formatted prompt to be sent to the LLM.


In [1]:
def prompt_formatted(submission_string: str, submission_author: str) -> str:
    """Build the complete LLM prompt for a single submission.

    The template in ``prompt.txt`` is loaded and its ``|issues|``,
    ``|guidance_note|`` and ``|fact_sheet|`` placeholders are filled with the
    contents of ``prompt_issues.md``, ``prompt_guidance_note.md`` and
    ``prompt_fact_sheet.md``. All four paths are relative, so they are
    resolved against the current working directory. The submission is then
    appended between START/END banner lines, preceded by its author.

    Args:
        submission_string: Full text of the submission.
        submission_author: Author name shown on the "Submission from:" line.

    Returns:
        The filled-in template followed by the delimited submission.

    Raises:
        FileNotFoundError: If any of the four template files is missing.
        UnicodeDecodeError: If a template file is not valid UTF-8.
    """
    def _read_text(path: str) -> str:
        # Decode explicitly as UTF-8 instead of relying on the locale default,
        # which differs between platforms and can fail on non-ASCII text.
        with open(path, 'r', encoding='utf-8') as file:
            return file.read()

    prompt = _read_text('prompt.txt')

    # Placeholders are substituted in this fixed order (issues, guidance note,
    # fact sheet), one str.replace pass each.
    placeholder_sources = (
        ('|issues|', 'prompt_issues.md'),
        ('|guidance_note|', 'prompt_guidance_note.md'),
        ('|fact_sheet|', 'prompt_fact_sheet.md'),
    )
    for placeholder, source_path in placeholder_sources:
        prompt = prompt.replace(placeholder, _read_text(source_path))

    prompt += "\n\n***************************************** SUBMISSION START *****************************************\n\n"

    prompt += f"Submission from: {submission_author}\n\n"

    prompt += submission_string

    prompt += "\n\n***************************************** SUBMISSION END *****************************************\n\n"

    return prompt

## Get AI response

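This cell defines the helpers that call the AI model for each submission, store the response, and move the processed file to the completed directory. It then runs them over the converted submissions.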


In [2]:
from az_client import call_ai, get_vector
from tqdm.notebook import tqdm
from db.docs import DocumentManager
from db.db_instance import DBClient
import json
import os
import shutil

# Module-level setup shared by the helper functions below.
# NOTE(review): `client` is not referenced again in the visible code; it is
# presumably instantiated for its side effects (e.g. starting/connecting to the
# database) — confirm before removing or reordering.
client = DBClient()
# Document store used by process_files_in_directory() via db.new_doc(...).
db = DocumentManager()

# Load the JSON definition that is passed as the second argument to call_ai().
# NOTE(review): presumably a function-calling schema for the model — confirm
# against az_client.call_ai.
with open('function.json', 'r') as f:
    function = json.load(f)

def extract_name_from_filename(filename):
    """Derive the author name from a ``<id>-<name parts>.<ext>`` file name.

    Everything before the first hyphen is discarded, the remaining
    hyphen-separated parts are joined with single spaces, and only the final
    file extension is removed. Dots inside the name itself (for example
    ``"Dr. Jane"``) are therefore preserved rather than truncating the name
    at the first dot.

    Args:
        filename: Base file name such as ``"34751-anonymous.md"``.

    Returns:
        The author name, or an empty string when the file name contains no
        hyphen.
    """
    # Strip the extension first so a dot inside the name is never mistaken
    # for the extension separator.
    stem, _extension = os.path.splitext(filename)
    name_parts = stem.split('-')[1:]
    return ' '.join(name_parts)

def add_to_json(data, filename, path='./data/processed_data.json'):
    """Append one ``{filename: data}`` record to a JSON list stored on disk.

    The log file holds a single JSON array. It is created (together with its
    parent directory) when missing. The updated array is first written to a
    sibling temporary file and then moved over the log with ``os.replace``,
    so a failed or interrupted write leaves the previous log in place instead
    of a truncated file.

    Args:
        data: JSON-serialisable payload to store.
        filename: Key under which ``data`` is stored in the new record.
        path: Location of the JSON log. Defaults to
            ``'./data/processed_data.json'``.

    Raises:
        json.JSONDecodeError: If the existing log is not valid JSON.
        TypeError: If ``data`` cannot be serialised to JSON; the existing log
            is left unchanged in that case.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if os.path.isfile(path):
        with open(path, 'r', encoding='utf-8') as f:
            previous_data = json.load(f)
    else:
        previous_data = []
    previous_data.append({filename: data})

    # Write to a temp file in the same directory, then swap it in, so the
    # accumulated log is never left half-written.
    tmp_path = path + '.tmp'
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(previous_data, f)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a stale temp file behind; re-raise the original error.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

def process_files_in_directory(directory, completed_directory):
    """Run every ``.md`` submission in ``directory`` through the AI pipeline.

    For each markdown file (in sorted name order) the submission text is read,
    a prompt is built with ``prompt_formatted``, the model is called via
    ``call_ai``, the response is tagged with the author and file name, a
    vector is obtained from ``get_vector``, and the result is stored with
    ``db.new_doc`` and appended to the JSON log via ``add_to_json``. Only
    after all of these steps succeed is the file moved to
    ``completed_directory``.

    Any exception raised while handling a file is printed and that file is
    skipped (it stays in ``directory``); processing continues with the next
    file.

    Args:
        directory: Folder containing the ``.md`` submissions to process.
        completed_directory: Folder that successfully processed files are
            moved into; created if it does not exist.
    """
    md_suffix = '.md'

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(completed_directory, exist_ok=True)

    # Sort so the processing order (and the order of the JSON log) is
    # reproducible; os.listdir returns entries in arbitrary order.
    markdown_files = sorted(
        f for f in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, f)) and f.endswith(md_suffix)
    )

    for filename in tqdm(markdown_files, desc='Processing files'):
        filepath = os.path.join(directory, filename)
        try:
            # Decode explicitly as UTF-8 rather than the locale default.
            with open(filepath, 'r', encoding='utf-8') as file:
                submission = file.read()
            sub_author = extract_name_from_filename(filename)
            prompt = prompt_formatted(submission, sub_author)
            response = call_ai(prompt, function)

            response["author"] = sub_author
            # Strip only the trailing suffix; str.replace would also remove
            # any '.md' occurring elsewhere in the name.
            response["file_name"] = filename[:-len(md_suffix)]

            vector = get_vector(submission)

            # NOTE(review): if a later step fails after db.new_doc succeeds,
            # the file is not moved and will be picked up again on the next
            # run — confirm whether new_doc tolerates being called twice for
            # the same document.
            db.new_doc(response, vector, True)
            add_to_json(response, filename.split('-')[0])
            # Move the processed file to the 'completed' directory
            completed_filepath = os.path.join(completed_directory, filename)
            shutil.move(filepath, completed_filepath)
        except Exception as e:
            # Deliberate best-effort: report the failure and carry on with the
            # remaining files instead of aborting the whole batch.
            print(f"Error processing file: {filename}")
            print(e)

# Run the pipeline: process every .md submission found in ./data/files/converted
# and move each successfully processed file into ./data/files/completed.
# Both paths are relative to the notebook's working directory.
directory = './data/files/converted'
completed_directory = './data/files/completed'
process_files_in_directory(directory, completed_directory)


Started /Users/k/.cache/weaviate-embedded: process ID 29309


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-05-08T10:58:33+10:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-05-08T10:58:33+10:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-05-08T10:58:33+10:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50050","time":"2024-05-08T10:58:33+10:00"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-05-08T10:58:33+10:00"}


Error setting up classes
string indices must be integers, not 'str'


Processing files:   0%|          | 0/2015 [00:00<?, ?it/s]

{"level":"info","msg":"Completed loading shard submission_ZLYAOU19n7NA in 28.722084ms","time":"2024-05-08T10:58:34+10:00"}
{"action":"hnsw_vector_cache_prefill","count":13250,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-05-08T10:58:34+10:00","took":16036000}


Error code: 400 - {'error': {'message': "This model's maximum context length is 131072 tokens. However, your messages resulted in 332816 tokens (332049 in the messages, 767 in the functions). Please reduce the length of the messages or functions.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Error processing file: 34751-anonymous.md
Error code: 400 - {'error': {'message': "This model's maximum context length is 131072 tokens. However, your messages resulted in 332816 tokens (332049 in the messages, 767 in the functions). Please reduce the length of the messages or functions.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
client initialized
{'substantive_submission': True, 'responder_category': 'Individual', 'support': 'oppose', 'motivations': ['Freedom of speech', 'Democratic values', 'Government overreach'], 'regulation': "The submission criticizes the bill for creating a discriminatory system t

