# Create Processed Tribunal Order

Load, process and extract features from tribunal orders. To do this, we compute the following for each document:
- Key terms (via TF-IDF)
- Named entities (via spaCy)
- Classify documents (spaCy, need training data for this)

Load data

In [116]:
import pandas as pd

# Load data

# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# a backslash is only treated as a path separator on Windows.
tribunal_orders = pd.read_csv('data/tribunal_orders.csv')

# One list per CSV column, each consumed by a different processing step
# (TF-IDF, named entities / noun phrases, classification, summarisation).
tribunal_orders_tfidf = list(tribunal_orders['tfidf'])
tribunal_orders_ner = list(tribunal_orders['ner'])
tribunal_orders_cat = list(tribunal_orders['cat'])
tribunal_orders_sum = list(tribunal_orders['sum'])

TF-IDF

In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
import pickle
import numpy as np

# Corpus for the TF-IDF step: the 'tfidf' column values loaded earlier.
documents = tribunal_orders_tfidf

# Fetch the NLTK stop-word corpus (reported as up-to-date if already present).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [91]:
# Stop words: NLTK's English list plus domain terms that are presumably too
# common across tribunal orders to be useful as key terms.
stop_words = list(stopwords.words('english')) + [
    'tenant',
    'landlord',
    'tenancy',
    'suppressed',
]

# Fit the vectorizer and build the (documents x terms) TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Persist both artefacts with pickle. Forward slashes keep the relative paths
# valid on every OS (backslashes are only separators on Windows).
with open("models/tfidf/tfidf_matrix.pkl", "wb") as matrix_file:
    pickle.dump(tfidf_matrix, matrix_file)

with open("models/tfidf/tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

# Column index -> term lookup
feature_names = tfidf_vectorizer.get_feature_names_out()

# Document position -> list of its highest-scoring terms
key_terms_per_document = defaultdict(list)

# Number of key terms to extract per document
num_key_terms = 5

# Read each row's stored (column, score) pairs straight from the CSR arrays.
# This avoids one sparse lookup per term (tfidf_matrix[i, x]), which is slow.
csr = tfidf_matrix.tocsr()
for i in range(csr.shape[0]):
    start, end = csr.indptr[i], csr.indptr[i + 1]
    scored_terms = zip(csr.indices[start:end], csr.data[start:end])

    # Highest TF-IDF score first; sorted() is stable, so ties keep column order
    ranked = sorted(scored_terms, key=lambda pair: pair[1], reverse=True)

    # Assign for every document so that one with no scored terms still gets
    # an (empty) entry instead of being absent from the mapping.
    key_terms_per_document[i] = [
        feature_names[term_index] for term_index, _ in ranked[:num_key_terms]
    ]

In [92]:
# Keep the per-document TF-IDF key terms (document position -> list of terms)
# under the name used later when the processed orders are combined.
tribunal_tfidf = key_terms_per_document

Named entity Recognition

In [93]:
import spacy

# spaCy's small English pipeline, used here for entity extraction
nlp = spacy.load("en_core_web_sm")

# Texts taken from the 'ner' column
documents = tribunal_orders_ner

# One list of entity strings per document, in document order
named_entities_per_document = [
    [entity.text for entity in nlp(text).ents]
    for text in documents
]

In [94]:
# CountVectorizer is not imported in the cells above, so import it here;
# without this the cell raises NameError on a fresh kernel.
from sklearn.feature_extraction.text import CountVectorizer

# Join each document's entities into one space-separated string so that the
# vectorizer can tokenise it like ordinary text.
named_entities_documents = [' '.join(entities) for entities in named_entities_per_document]

# Bag-of-words model over the named entities
vectorizer = CountVectorizer()

# Learn the vocabulary and build the document-term count matrix
X = vectorizer.fit_transform(named_entities_documents)

In [95]:
# Persist the named-entity bag-of-words artefacts. Forward slashes keep the
# relative paths valid on every OS (backslashes only work on Windows).

# Fitted vectorizer
with open("models/ner_bow/ner_bow_model.pkl", "wb") as model_file:
    pickle.dump(vectorizer, model_file)

# Document-term count matrix
with open("models/ner_bow/ner_bow_matrix.pkl", "wb") as matrix_file:
    pickle.dump(X, matrix_file)

In [101]:
# Map each document's position to its list of named entities, ready to be
# attached to the processed tribunal orders.
tribunal_named_entities = defaultdict(list, enumerate(named_entities_per_document))

Noun Phrases

In [103]:
# Same input texts as the NER step
documents = tribunal_orders_ner

# One list of noun-chunk strings per document, in document order
noun_chunks_per_document = [
    [chunk.text for chunk in nlp(text).noun_chunks]
    for text in documents
]

In [104]:
# CountVectorizer is not imported in the notebook's import cell, so import it
# here to keep this cell runnable on its own.
from sklearn.feature_extraction.text import CountVectorizer

# Join each document's noun chunks into one space-separated string so that the
# vectorizer can tokenise it like ordinary text.
noun_chunks_documents = [' '.join(chunks) for chunks in noun_chunks_per_document]

# Bag-of-words model over the noun chunks
vectorizer = CountVectorizer()

# Learn the vocabulary and build the document-term count matrix
X = vectorizer.fit_transform(noun_chunks_documents)

In [105]:
# Persist the noun-phrase bag-of-words artefacts. Forward slashes keep the
# relative paths valid on every OS (backslashes only work on Windows).

# Fitted vectorizer
with open("models/nph_bow/nph_bow_model.pkl", "wb") as model_file:
    pickle.dump(vectorizer, model_file)

# Document-term count matrix
with open("models/nph_bow/nph_bow_matrix.pkl", "wb") as matrix_file:
    pickle.dump(X, matrix_file)

In [106]:
# Map each document's position to its list of noun chunks, ready to be
# attached to the processed tribunal orders.
tribunal_noun_chunks = defaultdict(list, enumerate(noun_chunks_per_document))

Text classification

In [107]:
import spacy

# Location of the saved text-classification pipeline. Forward slashes keep
# the relative path valid on every OS (backslashes only work on Windows).
path_to_saved_model = 'models/tribunal_order_classification_model/text_cat_model'

# NOTE: this rebinds `nlp`, replacing the "en_core_web_sm" pipeline loaded for
# the named-entity and noun-phrase steps.
nlp = spacy.load(path_to_saved_model)

# Quick smoke test: run the loaded pipeline on a sample sentence
text = "The landlord entered the property."
doc = nlp(text)

In [108]:
from collections import defaultdict

# Run the classification pipeline over every order, keeping the parsed docs
result = [nlp(order) for order in tribunal_orders_cat]

# Map each document's position to its category scores (doc.cats), ready to be
# attached to the processed tribunal orders.
tribunal_categories = defaultdict(
    list,
    ((position, parsed.cats) for position, parsed in enumerate(result)),
)

Summarise

In [109]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Checkpoint shared by the tokenizer and the summarisation model
model_name = "facebook/bart-large-cnn"

# Load the tokenizer first, then the model weights, from the same checkpoint
tokenizer, model = (
    BartTokenizer.from_pretrained(model_name),
    BartForConditionalGeneration.from_pretrained(model_name),
)

In [110]:
# Generation settings applied to every order
generation_kwargs = dict(
    max_length=500,
    min_length=50,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)

# One generated summary per order, in document order
result = []

for order in tribunal_orders_sum:
    # Encode the order as a PyTorch tensor, truncating at 1024 tokens
    encoded = tokenizer.encode(order, return_tensors="pt", max_length=1024, truncation=True)

    # Generate the summary token ids, then decode the first returned sequence
    generated = model.generate(encoded, **generation_kwargs)
    result.append(tokenizer.decode(generated[0], skip_special_tokens=True))

In [111]:
# Map each document's position to its generated summary, ready to be attached
# to the processed tribunal orders.
# No loop variable named `sum` is used here: at notebook top level it would
# rebind the built-in sum() for the rest of the kernel session.
tribunal_summaries = defaultdict(list, enumerate(result))

Combine

In [112]:
import pandas as pd

# Final column order of the processed-orders table
columns = ['text', 'summary', 'key_terms', 'named_entities', 'categories', 'noun_chunks']

# Per-document features, each a mapping of document position -> value
feature_columns = {
    'summary': tribunal_summaries,
    'key_terms': tribunal_tfidf,
    'named_entities': tribunal_named_entities,
    'categories': tribunal_categories,
    'noun_chunks': tribunal_noun_chunks,
}

# Start from the original order text so the frame gets the source row index
processed_orders = pd.DataFrame({'text': tribunal_orders['sum']})

# Convert each mapping to a Series explicitly so it aligns with the row index
# by key; assigning a raw dict to a column depends on the pandas version.
# dtype=object keeps list/dict values intact as single cells.
for column_name, per_document in feature_columns.items():
    processed_orders[column_name] = pd.Series(per_document, dtype=object)

# Enforce the documented column order
processed_orders = processed_orders[columns]

In [113]:
# Write the combined table to a CSV file in the working directory, without
# the index column.
# NOTE(review): list/dict cells are presumably written as their string
# representation — confirm this is the intended storage format.
processed_orders.to_csv('processed_orders.csv', index=False)

Thoughts for improvement:
- For summarising remove the superfluous sentences so that they don't dominate the summary. I am most interested in the 'nature' of the case as opposed to the outcomes because I want people to be able to get tribunal orders that match their circumstances. 
- Remove numbers for sentences (doesn't help with understanding but can be used to identify the key sentences)
- Tidy up format in which data in processed order is stored
- Remove dates
- Remove frequent acronyms