# Process list of documents before analysis


## Checking for duplicates

The record of how duplicates are handled is kept in a spreadsheet containing all
submissions. That spreadsheet is downloaded and saved in JSON format in later steps.


### Remove duplicates via comparing doc hashes


In [None]:
import hashlib
import os

def hash_file(filepath):
    """Return the hex-encoded SHA256 digest of the file at ``filepath``."""
    digest = hashlib.sha256()
    with open(filepath, "rb") as handle:
        # Feed the hasher 4096 bytes at a time so the whole file is never
        # held in memory at once.
        while True:
            block = handle.read(4096)
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()

def find_duplicate_pdfs(folder_path):
    """Find byte-identical PDFs in a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of ``(duplicate_filename, first_seen_filename)`` tuples, one
        for every PDF whose SHA256 digest matches a PDF visited earlier.
        Files are visited in sorted filename order, so the first name in
        sort order is always the one reported as the original.
    """
    first_seen = {}
    duplicates = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # duplicate/original roles stable from run to run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.pdf'):
            continue
        filepath = os.path.join(folder_path, filename)
        # Skip directories (or other non-files) whose name ends in ".pdf";
        # they cannot be opened and hashed.
        if not os.path.isfile(filepath):
            continue

        file_hash = hash_file(filepath)
        if file_hash in first_seen:
            duplicates.append((filename, first_seen[file_hash]))
        else:
            first_seen[file_hash] = filename

    return duplicates

# Scan the submissions folder and report every PDF whose bytes match another.
folder_path = './static/submissions'
duplicates = find_duplicate_pdfs(folder_path)

if not duplicates:
    print("No duplicates found.")
else:
    print("Found duplicates:")
    for dup in duplicates:
        # Each entry is (duplicate filename, filename it duplicates).
        print(f"{dup[0]} is a duplicate of {dup[1]}")


### List documents from same author for checking


In [None]:
import os
import shutil

def extract_name_from_filename(filename):
    """Derive a normalised submitter name from a submission filename.

    Everything up to and including the first ``-`` is dropped, the remaining
    dashes become spaces, and anything from the first ``.`` onwards is
    discarded. The result is lower-cased.

    Any name containing "anonymous" in any letter case collapses to exactly
    ``'anonymous'``. The check runs after lower-casing so that "Anonymous 3"
    is treated the same as "anonymous 3".

    Args:
        filename: Submission filename, e.g. ``"123-john-smith.pdf"``.

    Returns:
        The lower-case name, ``'anonymous'``, or ``''`` when the filename
        contains no ``-``.
    """
    _, _, remainder = filename.partition('-')
    name = remainder.replace('-', ' ').split('.')[0].lower()
    if 'anonymous' in name:
        return 'anonymous'
    return name

def find_duplicate_names(folder_path):
    """Group the PDFs in ``folder_path`` by submitter name.

    Anonymous submissions are ignored. Returns a dict mapping each name that
    appears on more than one PDF to the list of those filenames.
    """
    files_by_name = {}
    for entry in os.listdir(folder_path):
        if not entry.endswith('.pdf'):
            continue
        submitter = extract_name_from_filename(entry)
        if submitter == 'anonymous':
            continue
        files_by_name.setdefault(submitter, []).append(entry)

    # Keep only the names shared by two or more files.
    return {
        submitter: entries
        for submitter, entries in files_by_name.items()
        if len(entries) > 1
    }

def copy_files_to_subfolders(folder_path, duplicates):
    """Copy same-author files into ``<folder_path>/duplicates/<name>/``.

    Args:
        folder_path: Directory that holds the source files.
        duplicates: Mapping of submitter name to the list of filenames
            (relative to ``folder_path``) attributed to that name.

    The ``duplicates`` directory is created even when the mapping is empty.
    Source files are copied, not moved.
    """
    duplicates_dir = os.path.join(folder_path, 'duplicates')
    # exist_ok replaces the separate "exists?" check, so re-running the cell
    # is harmless and there is no gap between checking and creating.
    os.makedirs(duplicates_dir, exist_ok=True)

    for name, files in duplicates.items():
        # One subfolder per submitter name.
        subfolder_path = os.path.join(duplicates_dir, name)
        os.makedirs(subfolder_path, exist_ok=True)

        for file in files:
            src_path = os.path.join(folder_path, file)
            dst_path = os.path.join(subfolder_path, file)
            shutil.copy(src_path, dst_path)

# Find submitter names that appear on more than one PDF in the submissions
# folder (anonymous submissions are excluded by find_duplicate_names).
folder_path = './static/submissions'
duplicates = find_duplicate_names(folder_path)

# Copy the files to subfolders (one per name) so they can be checked by hand
copy_files_to_subfolders(folder_path, duplicates)

print(f'Copied files for {len(duplicates)} duplicate names into subfolders.')

### Check for duplicates by comparing chunks within submissions for overlap


In [None]:
import os
import shutil
from collections import defaultdict
import hashlib
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file using PyMuPDF.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The text of every page, concatenated in page order.
    """
    # Open the document as a context manager so it is closed once the text
    # has been read, even if a page raises.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the string for every page.
        return "".join(page.get_text() for page in doc)

def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters."""
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

def hash_chunk(chunk):
    """Return the hex SHA256 digest of ``chunk`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(chunk.encode('utf-8'))
    return hasher.hexdigest()

def find_related_pdfs(folder_path, chunk_size=500, threshold=0.1):
    """Find PDFs that share chunks of text.

    Each PDF's text is cut into fixed-size chunks and every chunk is hashed.
    Two documents are related when the number of distinct chunk hashes they
    have in common, divided by the number of distinct chunk hashes in the
    first document, is greater than ``threshold``.

    Args:
        folder_path: Directory to scan (not recursive).
        chunk_size: Number of characters per chunk.
        threshold: Minimum shared fraction (exclusive) for a pair to count.

    Returns:
        A list of ``(doc, other_doc, shared_count)`` tuples. The ratio is
        taken relative to ``doc``, so a pair may be listed in one direction,
        both, or neither.
    """
    # chunk hash -> set of filenames containing that chunk. A set (not a
    # list) means a chunk repeated inside one document is recorded once, so
    # it cannot inflate another document's shared count.
    chunk_hashes = defaultdict(set)
    doc_chunks = {}

    # Sorted so the result order does not depend on os.listdir() ordering.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.pdf'):
            continue
        filepath = os.path.join(folder_path, filename)
        text = extract_text_from_pdf(filepath)
        hashes = {hash_chunk(chunk) for chunk in chunk_text(text, chunk_size)}
        doc_chunks[filename] = hashes
        for chunk_hash in hashes:
            chunk_hashes[chunk_hash].add(filename)

    # Identify documents with a significant number of shared chunks
    related_docs = []
    for doc, hashes in doc_chunks.items():
        shared = {}
        for hash_val in hashes:
            for other_doc in chunk_hashes[hash_val]:
                if other_doc != doc:
                    shared[other_doc] = shared.get(other_doc, 0) + 1

        # `shared` is only non-empty when `hashes` is non-empty, so the
        # division below cannot hit zero. Sorting keeps the order stable.
        for other_doc, count in sorted(shared.items()):
            if count / len(hashes) > threshold:
                related_docs.append((doc, other_doc, count))

    return related_docs

def merge_groups_with_shared_documents(related_pdfs):
    """Collapse related-document pairs into groups of connected documents.

    Only the first two items of each entry in ``related_pdfs`` are used.
    Returns a list of sets; any two documents linked directly or through
    other documents end up in the same set.
    """
    groups = []
    for rel in related_pdfs:
        pair = {rel[0], rel[1]}
        for existing in groups:
            # Attach the pair to the first group that already holds either
            # of its documents.
            if existing & pair:
                existing |= pair
                break
        else:
            groups.append(pair)

    # The first pass can leave overlapping groups behind; keep folding them
    # together until a full sweep makes no change.
    changed = True
    while changed:
        changed = False
        for left in range(len(groups)):
            for right in range(left + 1, len(groups)):
                if not groups[left].isdisjoint(groups[right]):
                    groups[left] = groups[left] | groups[right]
                    groups[right] = set()
                    changed = True
        groups = [members for members in groups if members]

    return groups

def create_subfolders_for_duplicates(related_pdfs, folder_path, duplicates_folder="01. duplicates_chunked"):
    """Copy each group of related PDFs into its own numbered subfolder.

    Pairs in ``related_pdfs`` are first merged into groups of connected
    documents. Each group is copied to
    ``<folder_path>/<duplicates_folder>/group_<n>/``, numbered from 1. The
    source files are left in place.
    """
    duplicates_path = os.path.join(folder_path, duplicates_folder)
    groups = merge_groups_with_shared_documents(related_pdfs)

    if not os.path.exists(duplicates_path):
        os.mkdir(duplicates_path)

    for i, group in enumerate(groups):
        subfolder_path = os.path.join(duplicates_path, f"group_{i+1}")
        if not os.path.exists(subfolder_path):
            os.mkdir(subfolder_path)

        for file in group:
            # Swap shutil.copy for shutil.move to move the files instead.
            shutil.copy(
                os.path.join(folder_path, file),
                os.path.join(subfolder_path, file),
            )

# Look for submissions that share text chunks and copy each related group
# into its own subfolder for review.
folder_path = './static/submissions'
related_pdfs = find_related_pdfs(folder_path)

if not related_pdfs:
    print("No related PDFs found.")
else:
    print(f"Found {len(related_pdfs)} related PDFs based on shared text chunks:")
    create_subfolders_for_duplicates(related_pdfs, folder_path)
    print("Duplicates have been organized into subfolders.")


## Loading spreadsheet

The spreadsheet includes final decisions on how data is being labelled,
including categories and whether any items are being removed (e.g. because they
are duplicates).

This first step is to prepare the data into the correct OpenAI batch format. The
batch will run (up to 24hrs) and we will review the results for consistency.
Special attention will be paid in this first run to determine whether the AI has
placed any submissions that were not manually categorised into one of our groups
of interest. If so, we will manually review whether that categorisation is
correct.

The second step will then take all those within the special groups and run
additional requests on them.


In [None]:
import json
import pandas as pd
import json

# Load the Excel spreadsheet into a pandas DataFrame
df = pd.read_excel('./data/list.xlsx')

# Convert the DataFrame to a list of dictionaries: one dict per spreadsheet
# row, keyed by column header
data = df.to_dict(orient='records')

def extract_name_from_filename(filename):
    """Derive a normalised submitter name from a submission filename.

    Everything up to and including the first ``-`` is dropped, the remaining
    dashes become spaces, and anything from the first ``.`` onwards is
    discarded. The result is lower-cased.

    Any name containing "anonymous" in any letter case collapses to exactly
    ``'anonymous'``. The check runs after lower-casing so that "Anonymous 3"
    is treated the same as "anonymous 3".

    Args:
        filename: Submission filename or stem, e.g. ``"123-john-smith"``.

    Returns:
        The lower-case name, ``'anonymous'``, or ``''`` when the filename
        contains no ``-``.
    """
    _, _, remainder = filename.partition('-')
    name = remainder.replace('-', ' ').split('.')[0].lower()
    if 'anonymous' in name:
        return 'anonymous'
    return name

formatted_data = []
for row in data:
    # Convert empty cells in 'Group', 'Comments', and 'Removed (Y)' columns to None
    for column in ('Group', 'Comments', 'Removed (Y)'):
        if pd.isnull(row[column]):
            row[column] = None

    # Strip the 'acma2023-' prefix and the '.pdf' extension from the doc name.
    file_name = row['doc'].replace('acma2023-', '').replace('.pdf', '')

    formatted_row = {
        'uniqueId': row['UniqueID'],
        'group': row['Group'],
        'submitter': extract_name_from_filename(file_name),
        'doc': file_name,
        "metadata": {
            # A group filled in on the spreadsheet was assigned by a person;
            # otherwise the AI is expected to assign one.
            "groupDefinedBy": "human" if row['Group'] else "AI",
            "removed": row['Removed (Y)'],
            "comments": row['Comments'],
        },
    }
    formatted_data.append(formatted_row)

# Write the data to a JSON file (any existing file is overwritten)
json_file = './data/list.json'
with open(json_file, 'w') as f:
    json.dump(formatted_data, f)

## We now have JSON of values in the form below; let's create JSONL files for batch processing

```json
{
  "group": "string | null",
  "submitter": "string",
  "doc": "string",
  "uniqueId": "string",
  "metadata": {
    "groupDefinedBy": "human or AI",
    "removed": "string | null",
    "comments": "string | null"
  }
}
```

These are the unique groups:

`defined_group = ['academic', 'civil', '???', 'government', 'industry', 'news', 'platform', 'political']`

This still contains all entries, even ones that are removed. The next step will
ignore removed entries.


### First run of batch request, preliminary function across all submissions

After this, we will review how the AI has grouped each submission that has not
already been manually classified.


In [None]:
import os
import json

# Define the prompt for each individual request
def prompt_formatted(submission_string: str, submission_author: str) -> str:
    """Build the full prompt text for one submission.

    Reads the template ``prompt.txt`` and replaces the ``|issues|``,
    ``|guidance_note|`` and ``|fact_sheet|`` placeholders with the contents
    of ``prompt_issues.md``, ``prompt_guidance_note.md`` and
    ``prompt_fact_sheet.md``. The author line and the submission text are
    then appended between START and END banners.

    All four files are opened relative to the current working directory and
    are re-read on every call.

    Args:
        submission_string: Full text of the submission.
        submission_author: Name shown on the "Submission from:" line.

    Returns:
        The assembled prompt.

    Raises:
        OSError: If any of the four prompt files cannot be opened.
    """
    with open('prompt.txt', 'r') as file:
        prompt = file.read()

    # Placeholder in the template -> file whose contents replace it.
    # Substitutions are applied in this order.
    inserts = {
        '|issues|': 'prompt_issues.md',
        '|guidance_note|': 'prompt_guidance_note.md',
        '|fact_sheet|': 'prompt_fact_sheet.md',
    }
    for placeholder, path in inserts.items():
        with open(path, 'r') as file:
            prompt = prompt.replace(placeholder, file.read())

    prompt += "\n\n***************************************** SUBMISSION START *****************************************\n\n"

    prompt += f"Submission from: {submission_author}\n\n"

    prompt += submission_string

    prompt += "\n\n***************************************** SUBMISSION END *****************************************\n\n"

    return prompt

def get_function():
    """Load and return the parsed contents of ``function.json``."""
    with open('function.json', 'r') as handle:
        return json.load(handle)

# Named `submissions` rather than `list` so the builtin is not shadowed for
# the rest of the session.
with open('./data/list.json', 'r') as f:
    submissions = json.load(f)

# Converted markdown lives directly in this folder as <doc>.md
# (an earlier layout used './data/files/md_files/<doc>/<doc>.md').
md_file_location = './data/files/converted'

# Start a new JSONL file once the current one has reached this size.
max_jsonl_bytes = 85 * 1024 * 1024  # 85 MB

file_counter = 0
jsonl_file = f"./data/all/jsonl_{file_counter}.jsonl"
# Append mode does not create missing directories.
os.makedirs(os.path.dirname(jsonl_file), exist_ok=True)

# The tool definition is the same for every request, so load it once.
function = get_function()

# NOTE(review): files are opened in append mode, so re-running this cell
# without clearing ./data/all adds every request a second time - confirm
# whether the folder should be emptied first.
for entry in submissions:
    # Entries flagged as removed in the spreadsheet are not sent for analysis.
    if entry["metadata"]["removed"] == "Y":
        continue
    try:
        # Single quotes inside the f-string keep this valid before Python 3.12.
        md_file_path = f"{md_file_location}/{entry['doc']}.md"
        with open(md_file_path, 'r') as file:
            submission = file.read()
        sub_author = entry["submitter"]
        prompt = prompt_formatted(submission, sub_author)
        # NOTE(review): custom_id is passed through unchanged; the schema
        # documented in this notebook says uniqueId is a string - confirm.
        ldata = {
            "custom_id": entry["uniqueId"],
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-2024-05-13",
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": 4096,
                "temperature": 0,
                "tools": [function],
                "tool_choice": {
                    "type": "function",
                    "function": {"name": "submission_eval"},
                },
            },
        }
        # Serialise before touching the file so a failure cannot leave a
        # half-written line behind.
        line = json.dumps(ldata)

        if os.path.exists(jsonl_file) and os.path.getsize(jsonl_file) >= max_jsonl_bytes:
            file_counter += 1
            jsonl_file = f"./data/all/jsonl_{file_counter}.jsonl"

        with open(jsonl_file, 'a') as f:
            f.write(line)
            f.write('\n')

    except Exception as e:
        # Best effort: one bad submission must not stop the rest, but say
        # which one was skipped so it can be followed up.
        print(f"Skipped {entry.get('uniqueId')!r} ({entry.get('doc')!r}): {e}")
        continue

#### We should now have a folder with all the prepared batch calls

We will upload each of these files to OpenAI and then call the batch function
for each


In [None]:
from openai import OpenAI
import os

client = OpenAI(api_key=os.getenv('OPENAI_KEY'),max_retries=3)

jsonl_dir = './data/all'

# Every .jsonl file directly inside jsonl_dir (subfolders are not searched).
jsonl_files = [f for f in os.listdir(jsonl_dir) if os.path.isfile(os.path.join(jsonl_dir, f)) and f.endswith('.jsonl')]

file_ids = []

for file in jsonl_files:
    # Open inside `with` so the handle is closed once the upload returns;
    # previously each file was left open.
    with open(os.path.join(jsonl_dir, file), "rb") as upload:
        file_object = client.files.create(
            file=upload,
            purpose="batch"
        )
    file_ids.append(file_object.id)

# We have now uploaded all the files and have their IDs, let's create a batch job for each

batch_ids = []

for file_id in file_ids:
    job = client.batches.create(
            input_file_id=file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h"
          )
    print(job)
    # Keep the batch ID so the job can be looked up later.
    batch_ids.append(job.id)

#### The batch processes should now be running; they can take up to 24hrs

We can run the following cell to check on progress.


In [None]:
from openai import OpenAI
# This cell is meant to be run on its own, possibly in a fresh session hours
# after the batches were created, so it imports everything it uses.
import os

client = OpenAI(api_key=os.getenv('OPENAI_KEY'),max_retries=3)

batch_jobs = client.batches.list()

# NOTE(review): list() responses are paginated and `.data` presumably holds
# only the first page, so the count below may understate the true total -
# confirm against the SDK's pagination behaviour.
print(f'Total number of batch jobs: {len(batch_jobs.data)}')
for batch in batch_jobs.data:
    print(batch.id, batch.status, batch.request_counts)