In [7]:
import sys
import nltk
import numpy as np
import pandas as pd
from copy import deepcopy
import gensim
from gensim.models import Word2Vec
from gensim.models import Phrases
import re
from nltk.corpus import stopwords
from scipy.special import expit as sigmoid 


In [8]:

# 1 Fail
# 2 Success
# 8 Commons
# 9 Lords
#18 failed commons


nltk.download('stopwords')

 

def cleanFilesInFolder(startSession, endSession, house_and_status=[18, 19, 28, 29]):
    for code in house_and_status:
        file_name = f"CSVFilesMLP/{code}_{startSession} to {endSession}.csv"
        try:
            df = pd.read_csv(file_name)
            long_titles = df['Long Title'].astype(str)  
            output_file_name = f"cleanedText/{code}_{startSession} to {endSession}.txt"
            
            with open(output_file_name, "w", encoding="utf-8") as file:
                for title in long_titles:
                    modified_title = title.replace('A Bill to', '').replace('; and for connected purposes', '') 
                    modified_title = modified_title.lower()    
                    file.write(modified_title)
        except UnicodeEncodeError as e:
            print(f"Encoding error in file {file_name}: {e}")
        except Exception as e:
            print(f"An error occurred while processing {file_name}: {e}")



def splitSentences(text):
    
    sentences = re.split(r'\.\s+', text)
    stop_words = set(stopwords.words('english'))
    
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    filtered_sentences = []
    for sentence in sentences:
        words = sentence.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        filtered_sentences.append(filtered_words)
        
    return filtered_sentences

def splitDocument(startSession, endSession, house_and_status = [18,19,28,29]):
    for code in house_and_status: 
        file_name = f"cleanedText/{code}_{startSession} to {endSession}.txt"
        try:
            with open(file_name, 'r', encoding="utf-8") as file:
                for line in file:
                    yield splitSentences(line)
            print(f"Read File{file_name}")
        except UnicodeDecodeError as e:
            print(f"Unicode decode error")
# returns a list of lists . List each sentnece, and inside there is a list of words for each sentence

def trainW2V(key, T=50):
    sentences = categorizedBillSentences[key]
    for epoch in range(T):
        print(f"{epoch}", end="")
        np.random.shuffle(sentences)
        models[key].train(sentences, total_examples=len(sentences), epochs=1)
        models[key].alpha *= 0.9
        models[key].min_alpha = models[key].alpha
    print(".")
    


def nearby(word, g):
    print(word)
    print(f"{g}:", end=" ")
    try:
        if word in models[g].wv.key_to_index:  
            for (w, v) in models[g].wv.most_similar([word]):
                print(w, end=" ")
        else:
            print("Word not in vocabulary!", end=" ")
    except Exception as e:
        print(f"Error: {e}")
    print("\n")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ander\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:





# input (sentence: list of words, model: gensim model, window: window= windowSize of word2vec, 
#debug: print intermediate calculations for debugging)

def score_sentence(sentence, model, window=5, debug=False):
    log_prob = 0.0 # total log prob for the sentence
    sentence_length = len(sentence)
    word_pair_probs = []  

    # Code for equation 1 
    for index, center_word in enumerate(sentence):
        if center_word not in model.wv:
            if debug:
                print(f"Center word '{center_word}' not in vocabulary.")
            continue
        center_vector = model.wv[center_word]

        start = max(0, index - window)
        end = min(sentence_length, index + window + 1)

        for j in range(start, end):
            if j == index:
                continue
            context_word = sentence[j]
            if context_word not in model.wv:
                if debug:
                    print(f"Context word '{context_word}' not in vocabulary.")
                continue
            context_vector = model.wv[context_word]

            dot_product = np.dot(center_vector, context_vector)
            prob = sigmoid(dot_product)

            word_pair_probs.append((center_word, context_word, prob))

            log_prob += np.log(prob + 1e-10)

    if debug:
        print("\n--- Word Pair Probabilities ---")
        for center, context, prob in word_pair_probs:
            print(f"p({context} | {center}) = {prob:.6f}")

    return log_prob




# Score an entire document (S sentences) under all models (Equation 2)
# input (sentencces:  a list of sentences ,models: the dictionary of models, window: the window size for score sentences)
# outpur: a sentences x categories (failed , succesful ....) with eahc sentence score according to score_sentence

def score_document(sentences, models, window=5):
    """
    Compute the score x category matrix of sentence scores for a document.
    
    sentences: list of sentences, each sentence is a list of words
    models: dict of {category: Word2Vec model}
    """
    S = len(sentences)
    C = len(models)
    
    sentence_scores = np.zeros((S, C))
    
    for s_idx, sentence in enumerate(sentences):
        for c_idx, (category, model) in enumerate(models.items()):
            sentence_scores[s_idx, c_idx] = score_sentence(sentence, model, window)
    
    return sentence_scores



# calculate document probabilities (Equation 5)

# input: the sxc array
# output: a 1x cateories array with the average score for all sentences in document 
def document_probabilities(sentence_scores):

    return sentence_scores.mean(axis=0)



# compute class probabilities ( Equation 3)

# input:  the array from document_probabilities
#ouput: normalized probabilities after bayes rule is applied #todo: change the priors to correspond to each class 
def class_probabilities(doc_probs):
    """
    Compute class probabilities using Bayes rule.
    Assuming uniform priors.
    """
    priors = np.ones(len(doc_probs)) / len(doc_probs)
    # bayes rule
    probs = (doc_probs * priors) / np.sum(doc_probs * priors)
    return probs



# classify the document (Equation 6)
# checks which of the numbers in the 1d array from document probabilities (the average across the classes ) is biggest and returns the index and array (for debuging) 
 
def classify_document(sentence_scores):
    doc_probs = document_probabilities(sentence_scores)
    predicted_class_idx = np.argmax(doc_probs)
    return predicted_class_idx, doc_probs



In [10]:

# Go from CSV files to 4 trained word2Vec models , 1 per category

startSession = 17
endSession= 39

# Convert from csv file to text file with just the long titles of the bills for all 4 files of te start-end session range
cleanFilesInFolder(startSession, endSession)

#combine sentences from all 4 files into 1 list of lists. Each sentence a list of words 
allSentences = [sentence for listOfSentneces in splitDocument(startSession, endSession) for sentence in listOfSentneces]

print("Sentences retreived")
print(len(allSentences))


# the catgeories are also used in scoring, the index for categories relates to this  
houseDictionary = {'FailedCommons': [18], 'FailedLords': [19], "SuccesCommons": [28], "SuccessLords":[29]}
[g for g in houseDictionary]



# populate a dictionary: key: category of bill, value: list of lists with sentences from bills of that category(FailedCommons, etc)
categorizedBillSentences = {key: [sentence for listOfSentneces in splitDocument(startSession, endSession, houseDictionary[key]) for sentence in listOfSentneces] for key in houseDictionary}

# creates series with how many sentnces for each catgory(debugging)
numberBills = pd.Series({key: len(categorizedBillSentences[key]) for key in houseDictionary}, dtype="float64" )

print("Sentences per category")
print(f"Sentences in FailedCommons {numberBills[0]}")
print(f"Sentences in FailedCommons {numberBills[1]}")
print(f"Sentences in FailedCommons {numberBills[2]}")
print(f"Sentences in FailedCommons {numberBills[3]}")


# populate houseDictionary with bills of appropaite category as lists of lists
for key in houseDictionary:
    for i in range(len(categorizedBillSentences[key])):
        categorizedBillSentences[key][i] = [word for word in categorizedBillSentences[key][i]]


#creates a dictionary of word2vec models initialized with the vocabulary of all sentences         


models = { }


for key in houseDictionary:
    models[key] = Word2Vec(allSentences, workers=4, hs=1, negative=0)
    models[key].build_vocab(allSentences) 
    
    

# trains each of the initialized models only with text from each specific category. 4 models trained with different bill text
for key in houseDictionary:
    print(key, end=":")
    trainW2V(key)


print(models.keys())



Read FilecleanedText/18_17 to 39.txt
Read FilecleanedText/19_17 to 39.txt
Read FilecleanedText/28_17 to 39.txt
Read FilecleanedText/29_17 to 39.txt
Sentences retreived
2830
Read FilecleanedText/18_17 to 39.txt
Read FilecleanedText/19_17 to 39.txt
Read FilecleanedText/28_17 to 39.txt
Read FilecleanedText/29_17 to 39.txt
Sentences per category
Sentences in FailedCommons 2019.0
Sentences in FailedCommons 282.0
Sentences in FailedCommons 448.0
Sentences in FailedCommons 81.0
FailedCommons:012345678910111213141516171819202122232425262728293031323334353637383940414243444546474849.
FailedLords:012345678910111213141516171819202122232425262728293031323334353637383940414243444546474849.
SuccesCommons:012345678910111213141516171819202122232425262728293031323334353637383940414243444546474849.
SuccessLords:012345678910111213141516171819202122232425262728293031323334353637383940414243444546474849.
dict_keys(['FailedCommons', 'FailedLords', 'SuccesCommons', 'SuccessLords'])


In [11]:
#Score sentences from the next three sessions(compared to the sessions the models were trained with) and evaluate how accurate the models are 

sentences = [
    "A Bill to require the Secretary of State to promote and secure youth services and provision of a requisite standard; to impose a duty on local authorities to provide youth services and establish local youth service partnerships with youth participation; and for connected purposes.",
    
    "A Bill to Authorise the use of resources for the year ending with 31 March 2020; to authorise both the issue of sums out of the Consolidated Fund and the application of income for that year; and to appropriate the supply authorised for that year by this Act and by the Supply and Appropriation (Anticipation and Adjustments) Act 2019.",
    
    "A Bill to make provision for unaccompanied asylum seeking children to receive legal advice and for extending the deadline for an unaccompanied asylum seeking child to appeal an asylum decision",
    
    "To confer powers upon New Southgate Cemetery and Crematorium Limited and the National Spiritual Assembly of the BahÃ¡'is of the United Kingdom to extinguish rights of burial and disturb human remains in respect of New Southgate Cemetery for the purpose of increasing the space for interments; and for connected purposes."
]



categories = list(models.keys())

for i, sentence_text in enumerate(sentences, 1):
    document = splitSentences(sentence_text)  
    sentence_scores = score_document(document, models, window=5)
    doc_probs = document_probabilities(sentence_scores)
    probs = class_probabilities(doc_probs)
    predicted_idx, doc_probs = classify_document(sentence_scores)

    print(f"\nSentence {i}:")
    print(f"Predicted class: {categories[predicted_idx]}")
    print(f"Document probabilities: {doc_probs}")
    print(f"Class probabilities: {probs}")


Sentence 1:
Predicted class: SuccesCommons
Document probabilities: [-82.41483574 -80.63811194 -71.21155376 -78.84841476]
Class probabilities: [0.26321123 0.25753684 0.2274309  0.25182102]

Sentence 2:
Predicted class: SuccesCommons
Document probabilities: [-98.81467435 -62.1529499  -12.55124981 -64.851167  ]
Class probabilities: [0.41454318 0.26074145 0.05265448 0.2720609 ]

Sentence 3:
Predicted class: FailedLords
Document probabilities: [-165.92863017   -2.96225362  -90.3188394   -83.54459483]
Class probabilities: [0.48410369 0.0086425  0.26350898 0.24374484]

Sentence 4:
Predicted class: SuccessLords
Document probabilities: [-38.38523628 -32.39724256 -38.09664073  -0.27078963]
Class probabilities: [0.35167447 0.2968142  0.34903044 0.0024809 ]


In [12]:
#Prepare data to test models. Split csv files into list of long titles per category


#Split a file into complete sentences. Files for success commons 28-30

def scoreSentences(listOfSentences, modelsList, predictedCategory):
    success= 0
    fail =0
    
    categories = list(modelsList.keys())
    
    for i, sentence_text in enumerate(listOfSentences, 1):
        document = splitSentences(sentence_text)
        sentence_scores = score_document(document, modelsList, window=5)
        doc_probs = document_probabilities(sentence_scores)
        probs = class_probabilities(doc_probs)
        predicted_idx, doc_probs = classify_document(sentence_scores)
        
       # print(f" Predicting Sentence {document}")
        #print(f"\nSentence {i}:")
        #print(f"Predicted class: {categories[predicted_idx]}")
       # print(f"Document probabilities: {doc_probs}")
        #print(f"Class probabilities: {probs}")
        
        if (categories[predicted_idx] == predictedCategory):
            success = success + 1
        else:
            fail = fail + 1
    total = fail + success
    if total > 0:
        accuracy = success / total
        print(f"Correct Prediction: {accuracy}")
    else:
        print("Error in predicition.")


startSessionTest = 35
endSessionTest = 37


def splitSentencesForTest(text):
    
    sentences = re.split(r'\.\s+', text)    
    return sentences


def splitDocumentTest(startSession, endSession, house_and_status):
    for code in house_and_status: 
        file_name = f"cleanedText/{code}_{startSession} to {endSession}.txt"
        try:
            with open(file_name, 'r', encoding="utf-8") as file:
                for line in file:
                    sentences = splitSentencesForTest(line.strip())
                    for sentence in sentences:
                        yield sentence 
            print(f"Read File{file_name}")
        except UnicodeDecodeError as e:
            print(f"Unicode decode error")



codes= [18,19,28,19]


testDictionary = {'FailedCommons': [18], 'FailedLords': [19], "SuccesCommons": [28], "SuccessLords":[29]}


for key, codes in testDictionary.items():
    testDictionary[key] = [sentence for sentence in splitDocumentTest(startSessionTest, endSessionTest, house_and_status=codes)]

#for key, sentences in testDictionary.items():
 #   print(f"{key}: {sentences[:5]}")
    
      


for key, code in testDictionary.items():
    testSentences = testDictionary[key]
    
    print (f" Number of sentences for {key} is {len(testSentences)}")
    print(f"The accuracy for {key} is ") 
    scoreSentences(testSentences, models, key)
    

# save the models 
from gensim.models import Word2Vec


for key, model in models.items():
    filename = f"{key}_word2vec_titles.model"
    model.save(filename)
    print(f"Model for {key} saved as {filename}")

    
# dict_keys(['FailedCommons', 'FailedLords', 'SuccesCommons', 'SuccessLords'])


Read FilecleanedText/18_35 to 37.txt
Read FilecleanedText/19_35 to 37.txt
Read FilecleanedText/28_35 to 37.txt
Read FilecleanedText/29_35 to 37.txt
 Number of sentences for FailedCommons is 617
The accuracy for FailedCommons is 


  ret, rcount, out=ret, casting='unsafe', subok=False)


Correct Prediction: 0.5186385737439222
 Number of sentences for FailedLords is 28
The accuracy for FailedLords is 
Correct Prediction: 0.8928571428571429
 Number of sentences for SuccesCommons is 201
The accuracy for SuccesCommons is 
Correct Prediction: 0.5124378109452736
 Number of sentences for SuccessLords is 8
The accuracy for SuccessLords is 
Correct Prediction: 0.75
Model for FailedCommons saved as FailedCommons_word2vec_titles.model
Model for FailedLords saved as FailedLords_word2vec_titles.model
Model for SuccesCommons saved as SuccesCommons_word2vec_titles.model
Model for SuccessLords saved as SuccessLords_word2vec_titles.model
