In [10]:
from pypdf import PdfReader
import os

directory = 'C:\\Users\\chris\\tenancy_tribunal\\PDFs'

In [11]:
parts = []

def visitor_body(text, cm, tm, font_dict, font_size):
    y = cm[5]

    if y > 100 and y < 800:
        parts.append(text)

In [12]:
# Load tribunal orders
tribunal_orders = []

for file in os.listdir(directory):
    reader = PdfReader(os.path.join(directory,file))

    for page in reader.pages[:-2]:
        page.extract_text(visitor_text=visitor_body)
    
    text_body = "".join(parts)

    tribunal_orders.append(text_body)

    parts = []

Data pre-processing functions

In [13]:
import re
import spacy
import json
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet') 
nltk.download('punkt')
nlp = spacy.load("en_core_web_sm")

# Apply stemming to a set of documents
def apply_stemming(documents):
    result = []
    stemmer = PorterStemmer()

    for document in documents:
        tokens = nltk.word_tokenize(document)
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        stemmed_document = ' '.join(stemmed_tokens)

        result.append(stemmed_document)

    return(result)


# Apply lemmtisation to a set of document
def apply_lemmatisation(documents):
    result = []
    lemmatiser = WordNetLemmatizer()

    for document in documents:
        tokens = nltk.word_tokenize(document)
        lemmatised_tokens = [lemmatiser.lemmatize(token) for token in tokens]
        lemmatised_document =  ' '.join(lemmatised_tokens)

        result.append(lemmatised_document)
    
    return(result)


# Apply entity filtering to a single document
def filter_entities(document, entities_to_filter):

    doc = nlp(document)
    cleaned_text = []

    for token in doc:
        if token.ent_type_ not in entities_to_filter:
            cleaned_text.append(token.text)
        
    filtered_document =  ' '.join(cleaned_text)

    
    return (filtered_document)

# Prepare data for doccano
def write_jsonl(file_path, documents):

    with open(file_path, 'w') as file:
        for doc in documents:
            document_dict = {"text": doc}  # Each string is treated as the 'text' field in a JSON object
            file.write(json.dumps(document_dict) + '\n')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
#pre-process text for tfidf
def preprocess_tfidf(documents):

    result = []
    entities_to_filter = ['PERSON', 'ORG', 'CARDINAL', 'MONEY', 'DATE', 'FAC', 'GPE', 'LOC']

    for document in documents:

        #Filter out entities
        document = filter_entities(document, entities_to_filter=entities_to_filter)

        # Push to lower text
        document = document.lower()

        # Remove numbers
        document = re.sub(r'\d+', '', document)

        # Remove names
        document = re.sub(r'\b[A-Z][a-z]+\b', '', document)

        # Remove punctuation and special characters
        document = re.sub(r'[^a-zA-Z\s]', '', document)

        result.append(document)

    #result = apply_stemming(result)
    result = apply_lemmatisation(result)
        
    return result

In [15]:
# Create pre-processed datasets for specific tasks
tribunal_orders_tfidf = preprocess_tfidf(tribunal_orders)
tribunal_orders_ner = tribunal_orders
tribunal_orders_cat = tribunal_orders
tribunal_orders_sum = tribunal_orders

# Create Processed Tribunal Order

Load, process and extract feeatures from tribunal orders. To do this we will compute the below for each document:
- Key terms (via TF-IDF)
- Named entities (via spaCy)
- Classify documents (spaCy, need training data for this)

TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter

nltk.download('stopwords')

documents = tribunal_orders_tfidf

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Remove stop words
stop_words = list(stopwords.words('english'))

stop_words.append('tenant')
stop_words.append('landlord')
stop_words.append('tenancy')
stop_words.append('suppressed')

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)

# Fit and transform the documents
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Get feature names (terms)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create a dictionary to store key terms for each document
key_terms_per_document = defaultdict(list)

# Extract top N key terms per document
num_key_terms = 5  # Number of key terms to extract per document

for i, document in enumerate(documents):
    feature_index = tfidf_matrix[i, :].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])

    # Sort terms by their TF-IDF score
    sorted_items = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)

    # Extract top N key terms for each document
    for term_index, score in sorted_items[:num_key_terms]:
        key_terms_per_document[i].append(feature_names[term_index])

# Print key terms for each document
#for doc_index, terms in key_terms_per_document.items():
#    print(f"Key terms for Document {doc_index + 1}: {terms}")

Key terms for Document 1: ['cat', 'door', 'pane', 'glass', 'vacate']
Key terms for Document 2: ['behind', 'property', 'premise', 'left', 'helped']
Key terms for Document 3: ['damage', 'carpet', 'crack', 'vanity', 'ha']
Key terms for Document 4: ['rehearing', 'wa', 'application', 'allegation', 'matter']
Key terms for Document 5: ['basement', 'notice', 'quiet', 'retaliatory', 'wa']
Key terms for Document 6: ['damage', 'rent', 'stay', 'premise', 'window']
Key terms for Document 7: ['damage', 'doctor', 'insurance', 'see', 'rubbish']
Key terms for Document 8: ['bond', 'party', 'end', 'rent', 'filing']
Key terms for Document 9: ['bond', 'party', 'end', 'rent', 'filing']
Key terms for Document 10: ['party', 'irrigation', 'system', 'damage', 'harassed']
Key terms for Document 11: ['arrears', 'rate', 'nicola', 'water', 'bond']
Key terms for Document 12: ['owner', 'bond', 'wa', 'insulation', 'standard']
Key terms for Document 13: ['damage', 'rent', 'owned', 'property', 'arrears']
Key terms for D

In [17]:
# Save TF-IDF key terms for each tribunal order
tribunal_tfidf = key_terms_per_document
tribunal_tfidf

defaultdict(list,
            {0: ['cat', 'door', 'pane', 'glass', 'vacate'],
             1: ['behind', 'property', 'premise', 'left', 'helped'],
             2: ['damage', 'carpet', 'crack', 'vanity', 'ha'],
             3: ['rehearing', 'wa', 'application', 'allegation', 'matter'],
             4: ['basement', 'notice', 'quiet', 'retaliatory', 'wa'],
             5: ['damage', 'rent', 'stay', 'premise', 'window'],
             6: ['damage', 'doctor', 'insurance', 'see', 'rubbish'],
             7: ['bond', 'party', 'end', 'rent', 'filing'],
             8: ['bond', 'party', 'end', 'rent', 'filing'],
             9: ['party', 'irrigation', 'system', 'damage', 'harassed'],
             10: ['arrears', 'rate', 'nicola', 'water', 'bond'],
             11: ['owner', 'bond', 'wa', 'insulation', 'standard'],
             12: ['damage', 'rent', 'owned', 'property', 'arrears'],
             13: ['damage', 'rent', 'owned', 'property', 'arrears'],
             14: ['breach', 'papaiti', 'termin

Named entity Recognition

In [9]:
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Get the documents 
documents = tribunal_orders_ner

# Process the text using SpaCy
result = []

for document in documents:
    doc = nlp(document)
    result.append(doc)

In [10]:
# Save named entities to processed tribunal order
tribunal_named_entities = defaultdict(list)

for idx, doc in enumerate(result):
    tribunal_named_entities[idx] = doc.ents

Text classification

In [16]:
import spacy

# Load the saved model
path_to_saved_model = 'document_classification\\text_cat_model'

# Load the spaCy model
nlp = spacy.load(path_to_saved_model)

# Now, you can use the loaded model for various tasks
text = "The landlord entered the property."
doc = nlp(text)

In [21]:
from collections import defaultdict

result = []

for order in tribunal_orders_cat:
    doc = nlp(order)
    result.append(doc)

# Save named entities to processed tribunal order
tribunal_categories = defaultdict(list)

for idx, doc in enumerate(result):
    tribunal_categories[idx] = doc.cats

Summarise

In [27]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

In [37]:
result = []

for order in tribunal_orders_sum:

    # Sample text for summarization
    input_text = order

    # Tokenize input text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=1024, truncation=True)

    # Generate summary
    summary_ids = model.generate(input_ids, max_length=500, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    result.append(summary)

In [38]:
# Save named entities to processed tribunal order
tribunal_summaries = defaultdict(list)

for idx, sum in enumerate(result):
    tribunal_summaries[idx] = sum

Combine

# Generate an answer

In [43]:
import tensorflow as tf
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [44]:
# Example data
questions = ["What is the capital of France?", "Who invented the internet?"]
answers = ["The capital of France is Paris.", "The internet was invented by Tim Berners-Lee."]
context = ["Paris is known for its beautiful architecture and rich history.", "Tim Berners-Lee, a computer scientist, is credited with the invention of the World Wide Web."]

# Combine questions, answers, and context
input_sequences = [f"question: {q} context: {c} answer: {a}" for q, c, a in zip(questions, context, answers)]

# Tokenization
tokenizer = T5Tokenizer.from_pretrained("t5-small")
tokenized_input = tokenizer(input_sequences, return_tensors="tf", padding=True, truncation=True)

# Labels (target) - treat it as a text-to-text problem
tokenized_labels = tokenizer(answers, return_tensors="tf", padding=True, truncation=True)

# Model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Fine-tuning
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy")

# Example: Fine-tuning for a few epochs (you might need more depending on your dataset)
model.fit(tokenized_input, tokenized_labels["input_ids"], epochs=3, batch_size=2)

# Save the fine-tuned model for later use
model.save_pretrained("fine_tuned_t5_model")

ImportError: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.
