In [1]:
from pypdf import PdfReader
import os
import pandas as pd

# Folder whose entries are listed with os.listdir and opened with PdfReader
# by the loading cell.
# NOTE(review): absolute, machine-specific Windows path — consider deriving it
# from a configurable base directory so the notebook runs elsewhere.
directory = 'C:\\Users\\chris\\tenancy_tribunal\\data\\tribunal_orders'

In [2]:
# Shared buffer of text fragments accepted by visitor_body
parts = []


def visitor_body(text, cm, tm, font_dict, font_size):
    """Keep a text fragment only when its vertical offset is strictly between 100 and 800.

    Written to be passed as ``visitor_text`` to ``page.extract_text``. The
    offset is read from the sixth element of ``cm``; accepted fragments are
    appended to the module-level ``parts`` list, all others are ignored
    (presumably page headers and footers — confirm against the source PDFs).
    ``tm``, ``font_dict`` and ``font_size`` are accepted but unused.
    """
    vertical_offset = cm[5]

    if not (100 < vertical_offset < 800):
        return

    parts.append(text)

In [3]:
# Load tribunal orders: one extracted text string per PDF in `directory`
tribunal_orders = []

# os.listdir returns entries in arbitrary order, so sort them to get a
# reproducible document order, and skip anything that is not a PDF (e.g. OS
# metadata files), which PdfReader could not parse anyway.
pdf_files = sorted(
    name for name in os.listdir(directory) if name.lower().endswith('.pdf')
)

for file in pdf_files:
    # Empty the shared buffer *before* extracting (in place, so visitor_body
    # keeps appending to the same list). Resetting up front keeps this cell
    # safe to re-run after a failure part-way through a previous file.
    parts.clear()

    reader = PdfReader(os.path.join(directory, file))

    # The last two pages of every order are deliberately not extracted
    for page in reader.pages[:-2]:
        page.extract_text(visitor_text=visitor_body)

    text_body = "".join(parts)

    tribunal_orders.append(text_body)

# Leave the buffer empty once loading has finished
parts.clear()

Data pre-processing functions

In [4]:
import re
import spacy
import json
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook asks for (wordnet first, then punkt)
for nltk_resource in ('wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Shared spaCy pipeline used by filter_entities
nlp = spacy.load("en_core_web_sm")

def apply_stemming(documents):
    """Return a Porter-stemmed version of every document.

    Each document is split with ``nltk.word_tokenize``, every token is
    stemmed, and the stems are re-joined with single spaces.

    Args:
        documents: Iterable of text strings.

    Returns:
        list: One stemmed string per input document, in input order.
    """
    porter = PorterStemmer()

    return [
        ' '.join(porter.stem(word) for word in nltk.word_tokenize(text))
        for text in documents
    ]


def apply_lemmatisation(documents):
    """Return a WordNet-lemmatised version of every document.

    Each document is split with ``nltk.word_tokenize``, every token is passed
    through ``WordNetLemmatizer.lemmatize`` with its default arguments, and
    the lemmas are re-joined with single spaces.

    Args:
        documents: Iterable of text strings.

    Returns:
        list: One lemmatised string per input document, in input order.
    """
    wordnet_lemmatiser = WordNetLemmatizer()
    lemmatised_documents = []

    for text in documents:
        lemmas = map(wordnet_lemmatiser.lemmatize, nltk.word_tokenize(text))
        lemmatised_documents.append(' '.join(lemmas))

    return lemmatised_documents


def filter_entities(document, entities_to_filter):
    """Remove from a single document every token whose entity type is filtered.

    The text is run through the module-level spaCy pipeline ``nlp``; a token
    is kept only when its ``ent_type_`` is not in ``entities_to_filter``.

    Args:
        document: Text of one document.
        entities_to_filter: Container of entity labels to drop.

    Returns:
        str: Texts of the surviving tokens joined by single spaces (the
        original spacing between tokens is not preserved).
    """
    surviving_tokens = (
        token.text
        for token in nlp(document)
        if token.ent_type_ not in entities_to_filter
    )

    return ' '.join(surviving_tokens)

def write_jsonl(file_path, documents):
    """Write documents to ``file_path`` in JSON Lines form (prepared for doccano).

    Every document becomes one line holding a JSON object whose only key is
    ``"text"``. The file is opened in write mode, so existing content at
    ``file_path`` is replaced.

    Args:
        file_path: Destination path of the ``.jsonl`` file.
        documents: Iterable of JSON-serialisable values, one per output line.
    """
    with open(file_path, 'w') as out_file:
        out_file.writelines(
            json.dumps({"text": document}) + '\n' for document in documents
        )


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def preprocess_tfidf(documents):
    """Clean raw documents for TF-IDF vectorisation.

    For each document, in this order:
      1. remove tokens tagged with the listed spaCy entity types
         (via ``filter_entities``);
      2. remove capitalised words, as a rough filter for leftover names;
      3. lower-case the text;
      4. remove digits;
      5. remove everything that is not an ASCII letter or whitespace.
    The cleaned documents are finally lemmatised with ``apply_lemmatisation``.

    Args:
        documents: Iterable of raw document strings.

    Returns:
        list: One cleaned, lemmatised string per input document, in input order.
    """
    entities_to_filter = ['PERSON', 'ORG', 'CARDINAL', 'MONEY', 'DATE', 'FAC', 'GPE', 'LOC']
    result = []

    for document in documents:

        # Filter out entities
        document = filter_entities(document, entities_to_filter=entities_to_filter)

        # Remove names. This has to run while the text still has its original
        # casing: once lower-cased, the [A-Z] in the pattern can never match
        # and the step silently does nothing.
        # NOTE(review): the heuristic removes every capitalised word, including
        # sentence-initial ones — confirm that is acceptable for the features.
        document = re.sub(r'\b[A-Z][a-z]+\b', '', document)

        # Push to lower text
        document = document.lower()

        # Remove numbers
        document = re.sub(r'\d+', '', document)

        # Remove punctuation and special characters
        document = re.sub(r'[^a-zA-Z\s]', '', document)

        result.append(document)

    # Lemmatisation (not stemming) is the normalisation applied to the cleaned text
    return apply_lemmatisation(result)

In [6]:
# Create the per-task datasets: only the TF-IDF variant is pre-processed; the
# ner, cat and sum variants all refer to the same raw-text list (no copies made)
tribunal_orders_tfidf = preprocess_tfidf(tribunal_orders)
tribunal_orders_ner = tribunal_orders_cat = tribunal_orders_sum = tribunal_orders

In [7]:
# Save data: gather the per-task datasets into one table, one row per order
columns = ['tfidf', 'ner', 'cat', 'sum']

# Build the frame in a single constructor call instead of passing a dict to
# `columns=` (which expects column labels, not data) and then filling an empty
# frame column by column. `columns=columns` pins the column order.
# Note: this rebinds `tribunal_orders` from the list of raw texts to a
# DataFrame, so the earlier cells that expect the list must be re-run from the
# loading cell onwards rather than on their own.
tribunal_orders = pd.DataFrame(
    {
        'tfidf': tribunal_orders_tfidf,
        'ner': tribunal_orders_ner,
        'cat': tribunal_orders_cat,
        'sum': tribunal_orders_sum,
    },
    columns=columns,
)

In [8]:
# Write the combined table to data/tribunal_orders.csv (relative to the working
# directory) without the index column. os.path.join picks the platform's path
# separator, so this no longer depends on the Windows-only backslash.
tribunal_orders.to_csv(os.path.join('data', 'tribunal_orders.csv'), index=False)