In [24]:
import sys
import nltk
import numpy as np
import pandas as pd
from copy import deepcopy
import gensim
from gensim.models import Word2Vec
from gensim.models import Phrases
import re
from nltk.corpus import stopwords
from scipy.special import expit as sigmoid 


In [25]:

# 1 Fail
# 2 Success
# 8 Commons
# 9 Lords
#18 failed commons


nltk.download('stopwords')

 

def cleanFilesInFolder(startSession, endSession, house_and_status=[18, 19, 28, 29]):
    for code in house_and_status:
        file_name = f"CSVFilesMLP/{code}_{startSession} to {endSession}.csv"
        try:
            df = pd.read_csv(file_name)
            long_titles = df['Long Title'].astype(str)  
            output_file_name = f"cleanedText/{code}_{startSession} to {endSession}.txt"
            
            with open(output_file_name, "w", encoding="utf-8") as file:
                for title in long_titles:
                    modified_title = title.replace('A Bill to', '').replace('; and for connected purposes', '') 
                    modified_title = modified_title.lower()    
                    file.write(modified_title)
        except UnicodeEncodeError as e:
            print(f"Encoding error in file {file_name}: {e}")
        except Exception as e:
            print(f"An error occurred while processing {file_name}: {e}")



def splitSentences(text):
    
    sentences = re.split(r'\.\s+', text)
    stop_words = set(stopwords.words('english'))
    
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    filtered_sentences = []
    for sentence in sentences:
        words = sentence.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        filtered_sentences.append(filtered_words)
        
    return filtered_sentences


def splitDocument(house_and_status = [18,19,28,29]):
    for code in house_and_status: 
        file_name = f"cleanedText/{code}/Training/training_text{code}.txt"
        try:
            with open(file_name, 'r', encoding="utf-8") as file:
                for line in file:
                    yield splitSentences(line)
            print(f"Read File{file_name}")
        except UnicodeDecodeError as e:
            print(f"Unicode decode error")

def splitDocumentAll(startSession, endSession, house_and_status = [18,19,28,29]):
    for code in house_and_status: 
        file_name = f"cleanedText/{code}_{startSession} to {endSession}.txt"
        try:
            with open(file_name, 'r', encoding="utf-8") as file:
                for line in file:
                    yield splitSentences(line)
            print(f"Read File{file_name}")
        except UnicodeDecodeError as e:
            print(f"Unicode decode error")
# returns a list of lists . List each sentnece, and inside there is a list of words for each sentence

def trainW2V(key, T=50):
    sentences = categorizedBillSentences[key]
    for epoch in range(T):
        print(f"{epoch}", end="")
        np.random.shuffle(sentences)
        models[key].train(sentences, total_examples=len(sentences), epochs=1)
        models[key].alpha *= 0.9
        models[key].min_alpha = models[key].alpha
    print(".")
    


def nearby(word, g):
    print(word)
    print(f"{g}:", end=" ")
    try:
        if word in models[g].wv.key_to_index:  
            for (w, v) in models[g].wv.most_similar([word]):
                print(w, end=" ")
        else:
            print("Word not in vocabulary!", end=" ")
    except Exception as e:
        print(f"Error: {e}")
    print("\n")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ander\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:





# input (sentence: list of words, model: gensim model, window: window= windowSize of word2vec, 
#debug: print intermediate calculations for debugging)

def score_sentence(sentence, model, window=5, debug=False):
    log_prob = 0.0 # total log prob for the sentence
    sentence_length = len(sentence)
    word_pair_probs = []  

    # Code for equation 1 
    for index, center_word in enumerate(sentence):
        if center_word not in model.wv:
            if debug:
                print(f"Center word '{center_word}' not in vocabulary.")
            continue
        center_vector = model.wv[center_word]

        start = max(0, index - window)
        end = min(sentence_length, index + window + 1)

        for j in range(start, end):
            if j == index:
                continue
            context_word = sentence[j]
            if context_word not in model.wv:
                if debug:
                    print(f"Context word '{context_word}' not in vocabulary.")
                continue
            context_vector = model.wv[context_word]

            dot_product = np.dot(center_vector, context_vector)
            prob = sigmoid(dot_product)

            word_pair_probs.append((center_word, context_word, prob))

            log_prob += np.log(prob + 1e-10)

    if debug:
        print("\n--- Word Pair Probabilities ---")
        for center, context, prob in word_pair_probs:
            print(f"p({context} | {center}) = {prob:.6f}")

    return log_prob




# Score an entire document (S sentences) under all models (Equation 2)
# input (sentencces:  a list of sentences ,models: the dictionary of models, window: the window size for score sentences)
# outpur: a sentences x categories (failed , succesful ....) with eahc sentence score according to score_sentence

def score_document(sentences, models, window=5):
    """
    Compute the score x category matrix of sentence scores for a document.
    
    sentences: list of sentences, each sentence is a list of words
    models: dict of {category: Word2Vec model}
    """
    S = len(sentences)
    C = len(models)
    
    sentence_scores = np.zeros((S, C))
    
    for s_idx, sentence in enumerate(sentences):
        for c_idx, (category, model) in enumerate(models.items()):
            sentence_scores[s_idx, c_idx] = score_sentence(sentence, model, window)
    
    return sentence_scores



# calculate document probabilities (Equation 5)

# input: the sxc array
# output: a 1x cateories array with the average score for all sentences in document 
def document_probabilities(sentence_scores):

    return sentence_scores.mean(axis=0)



# compute class probabilities ( Equation 3)

# input:  the array from document_probabilities
#ouput: normalized probabilities after bayes rule is applied #todo: change the priors to correspond to each class 
def class_probabilities(log_doc_probs):

    num_classes = len(log_doc_probs)
    doc_probs = np.exp(log_doc_probs - np.max(log_doc_probs)) 
    
    priors = np.ones(num_classes) / num_classes

    numerator = doc_probs * priors
    denominator = np.sum(numerator)
    probs = numerator / denominator
    
    return probs



# classify the document (Equation 6)
# checks which of the numbers in the 1d array from document probabilities (the average across the classes ) is biggest and returns the index and array (for debuging) 
 
def classify_document(sentence_scores):
    doc_probs = document_probabilities(sentence_scores)
    class_probs = class_probabilities(doc_probs)
    predicted_class_idx = np.argmax(class_probs)
    return predicted_class_idx, class_probs


In [None]:
#Split Into validation and 
def splitIntoSets(code):
    csv_path = f"C:\\Users\\ander\\Downloads\\MLP\\CSVFilesWithTitleScores\\{code}_17 to 39_withTitleScores.csv"

    df = pd.read_csv(csv_path)

    total_rows = len(df)
    testingSetSize = round(0.1 * total_rows)
    remainingSize = total_rows - testingSetSize


    testingSet = df.iloc[:testingSetSize]['Bill Id'].tolist()


    remaining_df = df.iloc[testingSetSize:].sample(frac=1, random_state=42) 
    
    validationSet = remaining_df.iloc[:testingSetSize]['Bill Id'].tolist()


    trainingSet = remaining_df.iloc[testingSetSize:]['Bill Id'].tolist()
    
    return trainingSet, testingSet, validationSet

remaining = [28, 29, 18, 19]

for code in remaining :
    trainingIndexes, testingIndexes, validationIndexes = splitIntoSets(code)
    df = pd.read_csv(f"C:\\Users\\ander\\Downloads\\MLP\\CSVFilesWithTitleScores\\{code}_17 to 39_withTitleScores.csv")


    df_training = df[df['Bill Id'].isin(trainingIndexes)]
    df_testing = df[df['Bill Id'].isin(testingIndexes)]
    df_validation = df[df['Bill Id'].isin(validationIndexes)]
    df_training.to_csv(f"C:\\Users\\ander\\Downloads\\MLP\\CSVFilesWithTitleScores\\{code}\\Training\\{code}_training.csv", index=False)
    df_testing.to_csv(f"C:\\Users\\ander\\Downloads\\MLP\\CSVFilesWithTitleScores\\{code}\\Testing\\{code}_testing.csv", index=False)
    df_validation.to_csv(f"C:\\Users\\ander\\Downloads\\MLP\\CSVFilesWithTitleScores\\{code}\\Validation\\{code}_validation.csv", index=False)
    
    

In [27]:

# Go from CSV files to 4 trained word2Vec models , 1 per category

startSession = 17
endSession= 39

# Convert from csv file to text file with just the long titles of the bills for all 4 files of te start-end session range
cleanFilesInFolder(startSession, endSession)

#combine sentences from all 4 files into 1 list of lists. Each sentence a list of words 
allSentences = [sentence for listOfSentneces in splitDocumentAll(startSession, endSession) for sentence in listOfSentneces]

print("Sentences retreived")
print(len(allSentences))


# the catgeories are also used in scoring, the index for categories relates to this  
houseDictionary = {'FailedCommons': [18], 'FailedLords': [19], "SuccesCommons": [28], "SuccessLords":[29]}
[g for g in houseDictionary]



# populate a dictionary: key: category of bill, value: list of lists with sentences from bills of that category(FailedCommons, etc)
categorizedBillSentences = {key: [sentence for listOfSentneces in splitDocument(startSession, endSession, houseDictionary[key]) for sentence in listOfSentneces] for key in houseDictionary}

# creates series with how many sentnces for each catgory(debugging)
numberBills = pd.Series({key: len(categorizedBillSentences[key]) for key in houseDictionary}, dtype="float64" )

print("Sentences per category")
print(f"Sentences in FailedCommons {numberBills[0]}")
print(f"Sentences in FailedCommons {numberBills[1]}")
print(f"Sentences in FailedCommons {numberBills[2]}")
print(f"Sentences in FailedCommons {numberBills[3]}")


# populate houseDictionary with bills of appropaite category as lists of lists
for key in houseDictionary:
    for i in range(len(categorizedBillSentences[key])):
        categorizedBillSentences[key][i] = [word for word in categorizedBillSentences[key][i]]


#creates a dictionary of word2vec models initialized with the vocabulary of all sentences         


models = { }


for key in houseDictionary:
    models[key] = Word2Vec(allSentences, workers=4, hs=1, negative=0)
    models[key].build_vocab(allSentences) 
    
    

# trains each of the initialized models only with text from each specific category. 4 models trained with different bill text
for key in houseDictionary:
    print(key, end=":")
    trainW2V(key)


print(models.keys())



Read FilecleanedText/18_17 to 39.txt
Read FilecleanedText/19_17 to 39.txt
Read FilecleanedText/28_17 to 39.txt
Read FilecleanedText/29_17 to 39.txt
Sentences retreived
2830
Read FilecleanedText/18_17 to 39.txt
Read FilecleanedText/19_17 to 39.txt
Read FilecleanedText/28_17 to 39.txt
Read FilecleanedText/29_17 to 39.txt
Sentences per category
Sentences in FailedCommons 2019.0
Sentences in FailedCommons 282.0
Sentences in FailedCommons 448.0
Sentences in FailedCommons 81.0
FailedCommons:012345678910111213141516171819202122232425262728293031323334353637383940414243444546474849.
FailedLords:012345678910111213141516171819202122232425262728293031323334353637383940414243444546474849.
SuccesCommons:012345678910111213141516171819202122232425262728293031323334353637383940414243444546474849.
SuccessLords:012345678910111213141516171819202122232425262728293031323334353637383940414243444546474849.
dict_keys(['FailedCommons', 'FailedLords', 'SuccesCommons', 'SuccessLords'])


In [28]:
#Score sentences from the next three sessions(compared to the sessions the models were trained with) and evaluate how accurate the models are 

sentences = [
    "A Bill to require the Secretary of State to promote and secure youth services and provision of a requisite standard; to impose a duty on local authorities to provide youth services and establish local youth service partnerships with youth participation; and for connected purposes.",
    
    "A Bill to Authorise the use of resources for the year ending with 31 March 2020; to authorise both the issue of sums out of the Consolidated Fund and the application of income for that year; and to appropriate the supply authorised for that year by this Act and by the Supply and Appropriation (Anticipation and Adjustments) Act 2019.",
    
    "A Bill to make provision for unaccompanied asylum seeking children to receive legal advice and for extending the deadline for an unaccompanied asylum seeking child to appeal an asylum decision",
    
    "To confer powers upon New Southgate Cemetery and Crematorium Limited and the National Spiritual Assembly of the BahÃ¡'is of the United Kingdom to extinguish rights of burial and disturb human remains in respect of New Southgate Cemetery for the purpose of increasing the space for interments; and for connected purposes."
]



categories = list(models.keys())

for i, sentence_text in enumerate(sentences, 1):
    document = splitSentences(sentence_text)  
    sentence_scores = score_document(document, models, window=5)
    doc_probs = document_probabilities(sentence_scores)
    probs = class_probabilities(doc_probs)
    predicted_idx, doc_probs = classify_document(sentence_scores)

    print(f"\nSentence {i}:")
    print(f"Predicted class: {categories[predicted_idx]}")
    print(f"Document probabilities: {doc_probs}")
    print(f"Class probabilities: {probs}")


Sentence 1:
Predicted class: FailedCommons
Document probabilities: [9.26957648e-01 3.66640828e-23 7.30423519e-02 4.65472102e-13]
Class probabilities: [9.26957648e-01 3.66640828e-23 7.30423519e-02 4.65472102e-13]

Sentence 2:
Predicted class: SuccesCommons
Document probabilities: [3.62247042e-39 4.47187653e-22 1.00000000e+00 1.33229280e-22]
Class probabilities: [3.62247042e-39 4.47187653e-22 1.00000000e+00 1.33229280e-22]

Sentence 3:
Predicted class: FailedLords
Document probabilities: [2.41413224e-86 1.00000000e+00 6.64246535e-37 8.61886923e-37]
Class probabilities: [2.41413224e-86 1.00000000e+00 6.64246535e-37 8.61886923e-37]

Sentence 4:
Predicted class: SuccessLords
Document probabilities: [3.35402341e-19 2.83840187e-15 5.79228968e-18 1.00000000e+00]
Class probabilities: [3.35402341e-19 2.83840187e-15 5.79228968e-18 1.00000000e+00]


In [29]:
#Prepare data to test models. Split csv files into list of long titles per category


#Split a file into complete sentences. Files for success commons 28-30

def scoreSentences(listOfSentences, modelsList, predictedCategory):
    success= 0
    fail =0
    
    categories = list(modelsList.keys())
    
    for i, sentence_text in enumerate(listOfSentences, 1):
        document = splitSentences(sentence_text)
        sentence_scores = score_document(document, modelsList, window=5)
        doc_probs = document_probabilities(sentence_scores)
        probs = class_probabilities(doc_probs)
        predicted_idx, doc_probs = classify_document(sentence_scores)
        
        print(f" Predicting Sentence {document}")
        print(f"\nSentence {i}:")
        print(f"Predicted class: {categories[predicted_idx]}")
        print(f"Document probabilities: {doc_probs}")
        print(f"Class probabilities: {probs}")
        
        if (categories[predicted_idx] == predictedCategory):
            success = success + 1
        else:
            fail = fail + 1
    total = fail + success
    if total > 0:
        accuracy = success / total
        print(f"Correct Prediction: {accuracy}")
    else:
        print("Error in predicition.")


startSessionTest = 35
endSessionTest = 37


def splitSentencesForTest(text):
    
    sentences = re.split(r'\.\s+', text)    
    return sentences


def splitDocumentTest(startSession, endSession, house_and_status):
    for code in house_and_status: 
        file_name = f"cleanedText/{code}_{startSession} to {endSession}.txt"
        try:
            with open(file_name, 'r', encoding="utf-8") as file:
                for line in file:
                    sentences = splitSentencesForTest(line.strip())
                    for sentence in sentences:
                        yield sentence 
            print(f"Read File{file_name}")
        except UnicodeDecodeError as e:
            print(f"Unicode decode error")



codes= [18,19,28,19]


testDictionary = {'FailedCommons': [18], 'FailedLords': [19], "SuccesCommons": [28], "SuccessLords":[29]}


for key, codes in testDictionary.items():
    testDictionary[key] = [sentence for sentence in splitDocumentTest(startSessionTest, endSessionTest, house_and_status=codes)]

#for key, sentences in testDictionary.items():
 #   print(f"{key}: {sentences[:5]}")
    
      


for key, code in testDictionary.items():
    testSentences = testDictionary[key]
    
    print (f" Number of sentences for {key} is {len(testSentences)}")
    print(f"The accuracy for {key} is ") 
    scoreSentences(testSentences, models, key)
    

# save the models 
from gensim.models import Word2Vec


for key, model in models.items():
    filename = f"{key}_word2vec_titles.model"
    model.save(filename)
    print(f"Model for {key} saved as {filename}")

    
# dict_keys(['FailedCommons', 'FailedLords', 'SuccesCommons', 'SuccessLords'])


Read FilecleanedText/18_35 to 37.txt
Read FilecleanedText/19_35 to 37.txt
Read FilecleanedText/28_35 to 37.txt
Read FilecleanedText/29_35 to 37.txt
 Number of sentences for FailedCommons is 617
The accuracy for FailedCommons is 
 Predicting Sentence [['amend', 'crown', 'estate', 'act', '1961', 'increase', 'maximum', 'term', 'lease', 'may', 'granted', 'zoological', 'society', 'london', 'respect', 'land', 'regent’s', 'park']]

Sentence 1:
Predicted class: FailedLords
Document probabilities: [1.45036321e-04 9.98661080e-01 1.04588453e-03 1.47999352e-04]
Class probabilities: [1.45036321e-04 9.98661080e-01 1.04588453e-03 1.47999352e-04]
 Predicting Sentence [['amend', 'working', 'time', 'regulations', '1998', 'reduce', 'maximum', 'working', 'week', '48', 'hours', 'per', 'week', '32', 'hours', 'per', 'week', 'provide', 'overtime', 'pay']]

Sentence 2:
Predicted class: FailedCommons
Document probabilities: [9.99999359e-01 1.95687949e-08 6.20936403e-07 2.28532739e-10]
Class probabilities: [9.99

 Predicting Sentence [['require', 'majesty’s', 'government', 'recognise', 'formally', 'republic', 'somaliland;', 'make', 'provision', 'connection', 'establishing', 'diplomatic', 'relations', 'republic', 'somaliland']]

Sentence 35:
Predicted class: SuccessLords
Document probabilities: [8.40588276e-04 5.74717179e-02 4.01823348e-02 9.01505359e-01]
Class probabilities: [8.40588276e-04 5.74717179e-02 4.01823348e-02 9.01505359e-01]
 Predicting Sentence [['make', 'provision', 'changing', 'law', 'rented', 'homes,', 'including', 'provision', 'abolishing', 'fixed', 'term', 'assured', 'tenancies', 'assured', 'shorthold', 'tenancies;', 'imposing', 'obligations', 'landlords', 'others', 'relation', 'rented', 'homes', 'temporary', 'supported', 'accommodation']]

Sentence 36:
Predicted class: FailedCommons
Document probabilities: [1.00000000e+00 1.47639461e-30 2.54778591e-34 3.49571381e-33]
Class probabilities: [1.00000000e+00 1.47639461e-30 2.54778591e-34 3.49571381e-33]
 Predicting Sentence [['redu

 Predicting Sentence [['introduce', 'presumption', 'planning', 'decision-making', 'approving', 'quarry', 'development', 'close', 'proximity', 'settlements;', 'require', 'risks', 'proposed', 'quarrying', 'sites', 'environment', 'public', 'health', 'assessed', 'part', 'planning', 'process;', 'provide', 'decision', 'planning', 'application', 'quarry', 'development', 'may', 'made', 'secretary', 'state']]

Sentence 60:
Predicted class: FailedLords
Document probabilities: [8.37841214e-23 9.99999688e-01 1.00179499e-24 3.12030317e-07]
Class probabilities: [8.37841214e-23 9.99999688e-01 1.00179499e-24 3.12030317e-07]
 Predicting Sentence [['prevent', 'punish', 'theft', 'dogs', 'deter', 'unlawful', 'importation', 'certain', 'animals', 'great', 'britain']]

Sentence 61:
Predicted class: FailedCommons
Document probabilities: [9.97393851e-01 2.25022115e-03 1.09182705e-05 3.45009135e-04]
Class probabilities: [9.97393851e-01 2.25022115e-03 1.09182705e-05 3.45009135e-04]
 Predicting Sentence [['make',

 Predicting Sentence [['transfer', 'responsibility', 'marine', 'licensing', 'marine', 'management', 'organisation', 'local', 'authorities']]

Sentence 95:
Predicted class: SuccessLords
Document probabilities: [2.39689316e-15 2.05401191e-05 1.04846186e-09 9.99979459e-01]
Class probabilities: [2.39689316e-15 2.05401191e-05 1.04846186e-09 9.99979459e-01]
 Predicting Sentence [['require', 'secretary', 'state', 'establish', 'export', 'guarantee', 'scheme', 'small', 'generators', 'low', 'carbon', 'electricity;', 'set', 'tariff,', 'based', 'market', 'rates,', 'sale', 'electricity', 'export', 'guarantee', 'scheme;', 'make', 'provision', 'enable', 'small', 'generators', 'low', 'carbon', 'electricity', 'sell', 'electricity', 'directly', 'local', 'people;', 'place', 'certain', 'duties', 'gas', 'electricity', 'markets', 'authority']]

Sentence 96:
Predicted class: FailedCommons
Document probabilities: [1.00000000e+00 6.16032621e-25 1.02412062e-26 2.07951777e-19]
Class probabilities: [1.00000000e+0

 Predicting Sentence [['provide', 'mechanism', 'early', 'general', 'election', 'held', 'certain', 'circumstances,', 'public', 'demonstrated', 'support', 'election']]

Sentence 130:
Predicted class: SuccesCommons
Document probabilities: [4.25134719e-04 8.00550195e-06 9.99566860e-01 2.61989159e-12]
Class probabilities: [4.25134719e-04 8.00550195e-06 9.99566860e-01 2.61989159e-12]
 Predicting Sentence [['set', 'commission', 'make', 'arrangements', 'debates', 'leaders', 'political', 'parties', 'general', 'election']]

Sentence 131:
Predicted class: FailedLords
Document probabilities: [1.09680594e-01 8.77341126e-01 1.23299889e-02 6.48291468e-04]
Class probabilities: [1.09680594e-01 8.77341126e-01 1.23299889e-02 6.48291468e-04]
 Predicting Sentence [['amend', 'dissolution', 'calling', 'parliament', 'act', '2022', 'provide', 'general', 'election', 'held', 'later', '1', 'december', '2022']]

Sentence 132:
Predicted class: FailedLords
Document probabilities: [1.37473332e-25 9.99998945e-01 4.519

 Predicting Sentence [['remove', 'requirement', 'voters', 'show', 'identity', 'document', 'order', 'vote']]

Sentence 155:
Predicted class: FailedCommons
Document probabilities: [9.99882661e-01 5.53250148e-05 1.35008234e-05 4.85134961e-05]
Class probabilities: [9.99882661e-01 5.53250148e-05 1.35008234e-05 4.85134961e-05]
 Predicting Sentence [['introduce', 'system', 'proportional', 'representation', 'parliamentary', 'elections,', 'elections', 'directly-elected', 'mayors', 'england,', 'local', 'authority', 'elections', 'england', 'police', 'crime', 'commissioner', 'elections', 'england', 'wales']]

Sentence 156:
Predicted class: SuccessLords
Document probabilities: [4.04032805e-11 4.07007303e-02 2.81858328e-08 9.59299241e-01]
Class probabilities: [4.04032805e-11 4.07007303e-02 2.81858328e-08 9.59299241e-01]
 Predicting Sentence [['introduce', 'system', 'proportional', 'representation', 'local', 'authority', 'elections', 'england', 'parliamentary', 'general', 'elections;', 'alter', 'meth

 Predicting Sentence [['establish', 'right', 'breathe', 'clean', 'air;', 'make', 'provision', 'purpose', 'reducing', 'indoor', 'outdoor', 'air', 'pollution,', 'including', 'greenhouse', 'gases;', 'set', 'minimum', 'standards', 'air', 'quality', 'workplaces,', 'homes', 'public', 'spaces;', 'require', 'monitoring', 'air', 'quality;', 'require', 'secretary', 'state', 'publish', 'strategy', 'reducing', 'air', 'pollution,', 'including', 'setting', 'targets', 'measures', 'air', 'quality,', 'report', 'parliament', 'annually', 'implementation', 'strategy;', 'give', 'powers', 'office', 'environmental', 'protection', 'enforce', 'legislation', 'relating', 'air', 'quality', 'reduction', 'greenhouse', 'gas', 'emissions;', 'make', 'provision', 'purpose', 'reducing', 'pollution', 'vehicles;', 'place', 'duty', 'secretary', 'state', 'encourage', 'facilitate', 'forms', 'active', 'travel', 'publish', 'strategy', 'reducing', 'emissions', 'transport;', 'require', 'secretary', 'state', 'promote', 'public', 



 Predicting Sentence [['set', 'target', 'number', 'glioblastoma', 'patients', 'take', 'part', 'clinical', 'trials', 'year;', 'require', 'training', 'medical', 'oncologists', 'include', 'training', 'relating', 'brain', 'cancers;', 'provide', 'drug', 'licensed', 'use', 'tumours', 'must', 'trialled', 'people', 'brain', 'tumours;', 'make', 'provision', 'relation', 'neuro-oncology', 'multidisciplinary', 'teams', 'nhs,', 'including', 'requirement', 'team', 'must', 'include', 'medical', 'oncologist;', 'require', 'manufacturers', 'drugs', 'licensed', 'treat', 'tumours', 'make', 'drugs', 'available', 'specified', 'circumstances', 'clinical', 'trials', 'relating', 'brain', 'tumours']]

Sentence 207:
Predicted class: FailedCommons
Document probabilities: [1.00000000e+00 2.10784059e-39 5.82343475e-46 2.18134887e-43]
Class probabilities: [1.00000000e+00 2.10784059e-39 5.82343475e-46 2.18134887e-43]
 Predicting Sentence [['require', 'government', 'publish', 'assessment', 'incidences', 'bowel', 'cond

 Predicting Sentence [['provide', 'parliamentary', 'approval', 'trade', 'agreements;', 'place', 'duty', 'secretary', 'state', 'consider', 'uk', 'agricultural,', 'environmental', 'animal', 'welfare', 'standards', 'negotiating', 'trade', 'agreements;', 'require', 'trade', 'agriculture', 'commission', 'assess', 'effects', 'potential', 'trade', 'agreements', 'farming,', 'rural', 'environment', 'animal', 'welfare', 'produce', 'associated', 'reports;', 'require', 'secretary', 'state', 'lay', 'reports', 'parliament']]

Sentence 251:
Predicted class: SuccessLords
Document probabilities: [1.19534817e-51 1.23997656e-16 1.91173968e-17 1.00000000e+00]
Class probabilities: [1.19534817e-51 1.23997656e-16 1.91173968e-17 1.00000000e+00]
 Predicting Sentence [['prohibit', 'employers', 'retaining', 'tips', 'gratuities', 'intended', 'staff;', 'make', 'provision', 'division', 'tips', 'gratuities', 'staff']]

Sentence 252:
Predicted class: FailedCommons
Document probabilities: [0.93596308 0.0009648  0.0049

 Predicting Sentence [['amend', 'public', 'health', '(control', 'disease)', 'act', '1984', 'make', 'provision', 'parliamentary', 'scrutiny', 'regulations', 'made', 'act']]

Sentence 277:
Predicted class: FailedLords
Document probabilities: [7.83235927e-42 9.99999311e-01 7.34043332e-11 6.89384717e-07]
Class probabilities: [7.83235927e-42 9.99999311e-01 7.34043332e-11 6.89384717e-07]
 Predicting Sentence [['require', 'government', 'regard', 'desirability', 'boards', 'public', 'bodies', 'including', 'least', 'one', 'person', 'relevant', 'experience', 'least', 'one', 'scotland,', 'wales', 'northern', 'ireland']]

Sentence 278:
Predicted class: FailedCommons
Document probabilities: [9.99988429e-01 4.77611459e-08 1.15143384e-05 9.11565917e-09]
Class probabilities: [9.99988429e-01 4.77611459e-08 1.15143384e-05 9.11565917e-09]
 Predicting Sentence [['establish', 'public', 'advocate', 'provide', 'advice', 'to,', 'act', 'data', 'controller', 'for,', 'representatives', 'deceased', 'major', 'incid

 Predicting Sentence [['establish', 'national', 'register', 'green', 'belt', 'land', 'england;', 'restrict', 'ability', 'local', 'authorities', 'de-designate', 'green', 'belt', 'land;', 'make', 'provision', 'future', 'development', 'de-designated', 'green', 'belt', 'land']]

Sentence 352:
Predicted class: FailedCommons
Document probabilities: [1.00000000e+00 6.39536281e-59 2.88287407e-45 2.78789639e-24]
Class probabilities: [1.00000000e+00 6.39536281e-59 2.88287407e-45 2.78789639e-24]
 Predicting Sentence [['regulate', 'charges', 'for,', 'advertising', 'of,', 'goods', 'delivery', 'services;', 'make', 'provision', 'transport', 'infrastructure', 'remote', 'areas', 'promote', 'use', 'goods', 'delivery', 'services']]

Sentence 353:
Predicted class: FailedCommons
Document probabilities: [1.00000000e+00 2.14443339e-32 5.22171682e-22 3.79489242e-28]
Class probabilities: [1.00000000e+00 2.14443339e-32 5.22171682e-22 3.79489242e-28]
 Predicting Sentence [['set', 'commission', 'make', 'arrangeme

 Predicting Sentence [['require', 'government', 'ofgem', 'conduct', 'act', 'review', 'electricity', 'transmission', 'grid', 'associated', 'charges,', 'include', 'consideration', 'abolishing', 'charge', 'differentials', 'based', 'geographic', 'location,', 'incentivising', 'renewable', 'energy', 'generation', 'maximise', 'energy', 'output,', 'minimising', 'passing', 'charge', 'fluctuation', 'risk', 'consumers', 'form', 'higher', 'prices']]

Sentence 370:
Predicted class: SuccesCommons
Document probabilities: [6.73297575e-16 2.01972514e-05 9.79900702e-01 2.00791007e-02]
Class probabilities: [6.73297575e-16 2.01972514e-05 9.79900702e-01 2.00791007e-02]
 Predicting Sentence [['make', 'provision', 'electric', 'vehicle', 'charging', 'points', 'new', 'buildings;', 'conected', 'purposes']]

Sentence 371:
Predicted class: FailedCommons
Document probabilities: [9.99997969e-01 1.52965549e-11 4.31489310e-14 2.03135393e-06]
Class probabilities: [9.99997969e-01 1.52965549e-11 4.31489310e-14 2.0313539

 Predicting Sentence [['require', 'chancellor', 'exchequer', 'report', 'parliament', 'proposals', 'replace', 'barnett', 'formula', 'used', 'calculate', 'adjustments', 'public', 'expenditure', 'allocated', 'scotland,', 'wales', 'northern', 'ireland', 'statutory', 'scheme', 'allocation', 'resources', 'based', 'assessment', 'relative', 'needs']]

Sentence 426:
Predicted class: FailedCommons
Document probabilities: [1.00000000e+00 6.42908833e-26 1.61884321e-51 6.33280305e-29]
Class probabilities: [1.00000000e+00 6.42908833e-26 1.61884321e-51 6.33280305e-29]
 Predicting Sentence [['require', 'installation', 'automated', 'external', 'defibrillators', 'public', 'buildings,', 'sporting', 'facilities,', 'schools,', 'higher', 'education', 'education', 'skills', 'facilities,', 'facilities', 'provide', 'care', 'vulnerable', 'people;', 'make', 'associated', 'provision', 'training', 'signage']]

Sentence 427:
Predicted class: FailedCommons
Document probabilities: [1.00000000e+00 1.06005933e-32 4.200

 Predicting Sentence [['require', 'minister', 'move', 'motion', 'house', 'commons', 'seeking', 'establish', 'select', 'committee', 'monitor', 'overseas', 'development', 'assistance', 'expenditure', 'government', 'departments']]

Sentence 497:
Predicted class: SuccessLords
Document probabilities: [2.62103597e-45 1.14383185e-39 1.60940003e-03 9.98390600e-01]
Class probabilities: [2.62103597e-45 1.14383185e-39 1.60940003e-03 9.98390600e-01]
 Predicting Sentence [['make', 'provision', 'restructuring', 'urgent', 'care', 'facilities', 'north', 'northamptonshire']]

Sentence 498:
Predicted class: SuccesCommons
Document probabilities: [0.00829849 0.09737913 0.74291264 0.15140974]
Class probabilities: [0.00829849 0.09737913 0.74291264 0.15140974]
 Predicting Sentence [['require', 'secretary', 'state', 'make', 'non-gender-specific', 'passports', 'available', 'non-gendered,', 'non-binary', 'people', 'identify', 'as,', 'exclusively', 'as,', 'male', 'female']]

Sentence 499:
Predicted class: Succes

 Predicting Sentence [['make', 'provision', 'require', 'cost-benefit', 'analysis', 'independent', 'audit', 'payments', 'made', 'government', 'foreign', 'country', 'international', 'organisation']]

Sentence 529:
Predicted class: SuccessLords
Document probabilities: [2.72628948e-20 3.50152650e-03 3.73308107e-03 9.92765392e-01]
Class probabilities: [2.72628948e-20 3.50152650e-03 3.73308107e-03 9.92765392e-01]
 Predicting Sentence [['make', 'provision', 'definition', 'international', 'development', 'assistance']]

Sentence 530:
Predicted class: SuccesCommons
Document probabilities: [2.27682232e-08 6.99916601e-03 8.30177749e-01 1.62823062e-01]
Class probabilities: [2.27682232e-08 6.99916601e-03 8.30177749e-01 1.62823062e-01]
 Predicting Sentence [['require', 'secretary', 'state', 'report', 'use', 'official', 'development', 'assistance', 'increase', 'availability', 'women’s', 'sanitary', 'products']]

Sentence 531:
Predicted class: SuccesCommons
Document probabilities: [2.53343616e-02 2.056

 Predicting Sentence [['require', 'advertisers,', 'broadcasters', 'publishers', 'display', 'logo', 'cases', 'image', 'human', 'body', 'body', 'part', 'digitally', 'altered', 'proportions']]

Sentence 579:
Predicted class: FailedCommons
Document probabilities: [9.99754458e-01 2.09515285e-04 5.33595899e-06 3.06903442e-05]
Class probabilities: [9.99754458e-01 2.09515285e-04 5.33595899e-06 3.06903442e-05]
 Predicting Sentence [['create', 'offence', 'desecrating', 'war', 'memorial']]

Sentence 580:
Predicted class: SuccesCommons
Document probabilities: [0.0173654  0.31803822 0.45871458 0.20588179]
Class probabilities: [0.0173654  0.31803822 0.45871458 0.20588179]
 Predicting Sentence [['make', 'provision', 'reduction', 'burdens', 'resulting', 'legislation', 'businesses', 'organisations', 'individuals;', 'make', 'provision', 'repeal', 'amendment', 'regulations;', 'make', 'provision', 'exercise', 'regulatory', 'powers', 'functions']]

Sentence 581:
Predicted class: SuccesCommons
Document prob

 Predicting Sentence [['make', 'provision', 'improving', 'air', 'quality']]

Sentence 615:
Predicted class: FailedCommons
Document probabilities: [9.25752796e-01 1.98876879e-02 3.66946192e-04 5.39925697e-02]
Class probabilities: [9.25752796e-01 1.98876879e-02 3.66946192e-04 5.39925697e-02]
 Predicting Sentence [['amend', 'abortion', 'act', '1967', 'exclude', 'cleft', 'lip,', 'cleft', 'palate', 'clubfoot', 'qualifying', 'physical', 'abnormalities', 'purposes', 'medical', 'termination', 'pregnancy', 'section', '1(1)(d)']]

Sentence 616:
Predicted class: FailedCommons
Document probabilities: [9.95366868e-01 3.37895052e-13 1.79996753e-06 4.63133157e-03]
Class probabilities: [9.95366868e-01 3.37895052e-13 1.79996753e-06 4.63133157e-03]
 Predicting Sentence [['abolish', 'business', 'rates.']]

Sentence 617:
Predicted class: FailedCommons
Document probabilities: [0.41736327 0.36580737 0.03951937 0.17731   ]
Class probabilities: [0.41736327 0.36580737 0.03951937 0.17731   ]
Correct Prediction:

 Predicting Sentence [['amend', 'child', 'benefit', '(rates)', 'regulations', '2006', 'make', 'provision', 'vary', 'rate', 'child', 'benefit', 'course', 'childhood', 'enable', 'eligible', 'parents', 'receive', 'higher', 'rate', 'child’s', 'early', 'years', 'correspondingly', 'reduced', 'rate', 'child', 'older', 'amend', 'law', 'relating', 'workplace', 'information', 'consultation,', 'employment', 'protection', 'trade', 'union', 'rights', 'provide', 'safeguards', 'workers', 'dismissal', 're-engagement', 'inferior', 'terms', 'conditions', 'make', 'provision', 'include', 'non-religious', 'philosophical', 'convictions', 'within', 'school', 'curriculum;', 'require', 'persons', 'hold', 'non-religious', 'philosophical', 'convictions', 'must', 'represented', 'standing', 'advisory', 'councils', 'religious', 'education', 'agreed', 'syllabus', 'conferences', 'require', 'secretary', 'state', 'achieve', 'nature', 'target', 'united', 'kingdom', 'require', 'electrical', 'safety', 'certificate', 'prov

 Predicting Sentence [['provide', 'certain', 'protections', 'persons', 'live', 'together', 'lived', 'together', 'couple;', 'make', 'provision', 'property', 'deceased', 'persons', 'survived', 'cohabitant', 'establish', 'right', 'breathe', 'clean', 'air;', 'require', 'secretary', 'state', 'achieve', 'maintain', 'clean', 'air', 'england', 'wales;', 'involve', 'public', 'health', 'england', 'setting', 'reviewing', 'pollutants', 'limits;', 'enhance', 'powers,', 'duties', 'functions', 'environment', 'agency,', 'committee', 'climate', 'change,', 'local', 'authorities', '(including', 'port', 'authorities),', 'civil', 'aviation', 'authority,', 'highways', 'england,', 'historic', 'england', 'natural', 'england', 'relation', 'air', 'pollution;', 'establish', 'citizens’', 'commission', 'clean', 'air', 'powers', 'institute', 'intervene', 'legal', 'proceedings;', 'require', 'secretary', 'state', 'relevant', 'national', 'authorities', 'apply', 'environmental', 'principles', 'carrying', 'duties', 'act

 Predicting Sentence []

Sentence 68:
Predicted class: FailedCommons
Document probabilities: [nan nan nan nan]
Class probabilities: [nan nan nan nan]
 Predicting Sentence [['year;', 'appropriate', 'supply', 'authorised', 'year', 'act']]

Sentence 69:
Predicted class: SuccesCommons
Document probabilities: [4.01116654e-14 1.92973785e-04 9.99787164e-01 1.98624583e-05]
Class probabilities: [4.01116654e-14 1.92973785e-04 9.99787164e-01 1.98624583e-05]
 Predicting Sentence []

Sentence 70:
Predicted class: FailedCommons
Document probabilities: [nan nan nan nan]
Class probabilities: [nan nan nan nan]
 Predicting Sentence [['supply', 'appropriation', '(anticipation', 'adjustments)', 'act', '2021.a', 'bill', 'authorise', 'use', 'resources', 'years', 'ending', '31', 'march', '2021,', '31', 'march', '2022', '31', 'march', '2023;', 'authorise', 'issue', 'sums', 'consolidated', 'fund', 'years;', 'appropriate', 'supply', 'authorised', 'act', 'years', 'ending', '31', 'march', '2021', '31', 'march', '

 Predicting Sentence [['make', 'provision', 'appointment', 'forensic', 'science', 'regulator;', 'make', 'provision', 'regulator', 'regulation', 'forensic', 'science']]

Sentence 173:
Predicted class: SuccesCommons
Document probabilities: [2.71476574e-09 2.98228474e-22 9.99999997e-01 6.96239014e-14]
Class probabilities: [2.71476574e-09 2.98228474e-22 9.99999997e-01 6.96239014e-14]
 Predicting Sentence [['make', 'provision', 'application', 'regulatory', 'reform', '(fire', 'safety)', 'order', '2005', 'building', 'contains', 'two', 'sets', 'domestic', 'premises;', 'confer', 'power', 'amend', 'order', 'future', 'purposes', 'changing', 'premises', 'applies']]

Sentence 174:
Predicted class: SuccesCommons
Document probabilities: [3.09691190e-47 5.89195673e-20 9.99994977e-01 5.02322185e-06]
Class probabilities: [3.09691190e-47 5.89195673e-20 9.99994977e-01 5.02322185e-06]
 Predicting Sentence [['make', 'provision', 'financial', 'services', 'markets;', 'make', 'provision', 'debt', 'respite', 's

Model for SuccesCommons saved as SuccesCommons_word2vec_titles.model
Model for SuccessLords saved as SuccessLords_word2vec_titles.model


In [None]:
def splitIntoSets(code):
    csv_path = f"C:\\Users\\ander\\Downloads\\MLP\\CSVFilesWithTitleScores\\{code}_17 to 39_withTitleScores.csv"

    df = pd.read_csv(csv_path)

    total_rows = len(df)
    testingSetSize = round(0.1 * total_rows)
    remainingSize = total_rows - testingSetSize


    testingSet = df.iloc[:testingSetSize]['Bill Id'].tolist()


    remaining_df = df.iloc[testingSetSize:].sample(frac=1, random_state=42) 
    
    validationSet = remaining_df.iloc[:testingSetSize]['Bill Id'].tolist()


    trainingSet = remaining_df.iloc[testingSetSize:]['Bill Id'].tolist()
    
    return trainingSet, testingSet, validationSet

remaining = [28, 29, 18, 19]

for code in remaining :
    trainingIndexes, testingIndexes, validationIndexes = splitIntoSets(code)
    df = pd.read_csv(f"C:\\Users\\ander\\Downloads\\MLP\\CSVFilesWithTitleScores\\{code}_17 to 39_withTitleScores.csv")


    df_training = df[df['Bill Id'].isin(trainingIndexes)]
    df_testing = df[df['Bill Id'].isin(testingIndexes)]
    df_validation = df[df['Bill Id'].isin(validationIndexes)]
    df_training.to_csv(f"C:\\Users\\ander\\Downloads\\MLP\\CSVFilesWithTitleScores\\{code}\\Training\\{code}_training.csv", index=False)
    df_testing.to_csv(f"C:\\Users\\ander\\Downloads\\MLP\\CSVFilesWithTitleScores\\{code}\\Testing\\{code}_testing.csv", index=False)
    df_validation.to_csv(f"C:\\Users\\ander\\Downloads\\MLP\\CSVFilesWithTitleScores\\{code}\\Validation\\{code}_validation.csv", index=False)
    
    