In [14]:
import os
import re
import shutil

import pandas as pd
from pypdf import PdfReader
from sklearn.model_selection import train_test_split

## Load the data

In [15]:
# Load the metadata table; its 'location' column holds the path of each PDF file.
csv_path = 'metadata.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)

In [16]:
# create directory to save TXT files
# exist_ok=True makes this cell safe to re-run and avoids the race between a
# separate "does it exist?" check and the actual creation.
output_dir = './plaintext'
os.makedirs(output_dir, exist_ok=True)

## Extract text from PDF files

In [17]:
def clean_text(text):
    """Normalise the raw text extracted from a decision PDF.

    The steps run in a fixed order (later steps rely on earlier ones):

    1. drop a leading ``DRN-<digits>`` identification line,
    2. turn the typographic apostrophe into a plain ``'``,
    3. remove every remaining non-ASCII character,
    4. insert a space after a period that is directly followed by a
       non-whitespace character,
    5. collapse every run of whitespace into a single space,
    6. put the section headings on their own lines,
    7. cut the text at the first "My final decision" (inclusive).

    Parameters
    ----------
    text : str
        Raw text of a document.

    Returns
    -------
    str
        The cleaned text.
    """
    # (pattern, replacement) pairs, applied one after the other
    regex_steps = (
        (r'^DRN-\d+\n', ''),        # PDF identification at the very start
        (r'’', "'"),                # typographic apostrophe -> ASCII
        (r'[^\x00-\x7F]+', ''),     # anything that is still non-ASCII
        (r'\.([^\s])', r'. \1'),    # space after a period
        (r'\s+', ' '),              # extra whitespace
    )
    for pattern, replacement in regex_steps:
        text = re.sub(pattern, replacement, text)

    # section headings get surrounding line breaks
    heading_steps = (
        ("The complaint ", 'The complaint\n'),
        ("What happened ", '\nWhat happened\n'),
        ("What I've decided and why ", "\nWhat I've decided and why\n"),
    )
    for heading, heading_with_breaks in heading_steps:
        text = text.replace(heading, heading_with_breaks)

    # DOTALL is required: the headings above re-introduced newlines
    return re.sub(r'My final decision.*', '', text, flags=re.DOTALL)

In [18]:
# iterate over the 'location' column, which holds the path of each PDF file
for pdf_path in df['location']:
    # extract text from the PDF file
    try:
        reader = PdfReader(pdf_path)
        # Join the pages with a newline: nothing guarantees that a page's text
        # ends in whitespace, so plain concatenation can fuse the last word of
        # one page with the first word of the next. clean_text() collapses
        # whitespace runs to a single space, so the separator never shows up
        # as an extra line break.
        text = "\n".join(page.extract_text() for page in reader.pages)

        # clean the text
        text = clean_text(text)

        # create a plaintext file with the same name as the PDF file
        txt_filename = os.path.basename(pdf_path).replace('.pdf', '.txt')
        txt_path = os.path.join(output_dir, txt_filename)

        # write the text to the file
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(text)

    except Exception as e:
        # best-effort: report the failing file and carry on with the rest
        print(f"Error processing {pdf_path}: {e}")

In [19]:
# create a new column in the dataframe to store the path of the plaintext files
def to_plaintext_path(pdf_path):
    """Return the plaintext path that corresponds to a PDF location.

    The file name is the PDF's base name with '.pdf' replaced by '.txt'
    (the same rule used when the TXT files are written). The directory is
    ``output_dir`` normalised with ``os.path.normpath``, which drops a leading
    './' without assuming that the prefix is actually there (slicing with
    ``[2:]`` would mangle any other value of ``output_dir``).
    """
    txt_filename = os.path.basename(pdf_path).replace('.pdf', '.txt')
    return os.path.join(os.path.normpath(output_dir), txt_filename)


df['plaintext_path'] = df['location'].apply(to_plaintext_path)

## Split the data into train and test sets

In [20]:
# stratified 80/20 split on 'decision'; the fixed random_state keeps it reproducible
train_data, test_data = train_test_split(
    df, test_size=0.2, stratify=df['decision'], random_state=42
)
# add a new column 'set' to indicate whether the sample is in the training or testing set
# (`assign` returns new frames instead of writing into the slices handed back
# by train_test_split, which can trigger pandas' SettingWithCopyWarning)
train_data = train_data.assign(set='train')
test_data = test_data.assign(set='test')
# combine the training and testing data back into one dataframe
# (rows keep their original index labels; training rows come first)
df = pd.concat([train_data, test_data])

## Copy the plaintext files to the training and testing directories

In [21]:
# Keep working on the in-memory `df`: it already carries the 'plaintext_path'
# and 'set' columns computed above. Re-reading metadata.csv at this point
# would discard them (the CSV is only rewritten by the final cell), so the
# copy loop below would fail with a KeyError on a fresh run, or silently use
# a stale split left over from a previous run.

# define base directory
base_dir = './data'

# define decision mapping (value of the 'decision' column -> class folder name)
decision_mapping = {
    'Upheld': 'positive',
    'Not upheld': 'negative'
}

# function to create directories if they don't exist
def create_directory(path):
    """Create ``path`` (and any missing parent directories) if needed.

    Calling it for a directory that already exists is a no-op.
    ``exist_ok=True`` replaces the separate existence check, so there is no
    window between "check" and "create" in which the call could fail.

    Raises
    ------
    FileExistsError
        If ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

# walk the metadata rows and copy each plaintext file into
# <base_dir>/<set>/<label>/
for _, record in df.iterrows():
    # decisions missing from the mapping end up in an 'unknown' folder
    label = decision_mapping.get(record['decision'], 'unknown')
    split_name = record['set']
    relative_txt = record['plaintext_path']

    # make sure the destination folder exists
    destination_dir = os.path.join(base_dir, split_name, label)
    create_directory(destination_dir)

    # where the file comes from and where it goes
    origin = os.path.join('./', relative_txt)
    destination = os.path.join(destination_dir, os.path.basename(relative_txt))

    # report missing sources and move on; copy everything else
    if not os.path.exists(origin):
        print(f"source file not found: {origin}")
        continue
    shutil.copy(origin, destination)

print("files have been organized successfully.")

files have been organized successfully.


In [22]:
# Persist the dataframe back to the metadata CSV (the index is not written).
df.to_csv(path_or_buf=csv_path, index=False)