In [None]:
import sys
import nltk
import numpy as np
import pandas as pd
from copy import deepcopy
import gensim
from gensim.models import Word2Vec
from gensim.models import Phrases
import re
from nltk.corpus import stopwords
from scipy.special import expit as sigmoid 
import os
import re
import fitz  


In [5]:
# Function definitions

# Two-digit category codes: first digit = bill outcome, second digit = house.
#   1 = Fail,    2 = Success
#   8 = Commons, 9 = Lords
# e.g. 18 = failed Commons


# Fetch the NLTK stop-word corpus that splitSentences relies on for stop-word filtering.
nltk.download('stopwords')




def splitSentences(text):
    
    sentences = re.split(r'\.\s+', text)
    stop_words = set(stopwords.words('english'))
    
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    filtered_sentences = []
    for sentence in sentences:
        words = sentence.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        filtered_sentences.append(filtered_words)
        
    return filtered_sentences

def _iter_split_lines(file_name):
    """Yield splitSentences(line) for every line of one UTF-8 text file.

    A UnicodeDecodeError is reported together with the file name and ends the
    iteration for that file only; results for lines decoded before the error
    have already been yielded. Other errors (e.g. a missing file) propagate.
    """
    try:
        with open(file_name, 'r', encoding="utf-8") as file:
            for line in file:
                yield splitSentences(line)
        print(f"Read File{file_name}")
    except UnicodeDecodeError as e:
        # Name the file: callers loop over several files and would otherwise
        # not know which one was cut short.
        print(f"Unicode decode error in {file_name}: {e}")


def splitDocument(house_and_status=(18, 19, 28, 29)):
    """Stream the tokenised training text of each category code.

    house_and_status: iterable of category codes; for each code the file
        cleanedTextFull/<code>/Training/training_text<code>.txt is read.

    Yields, once per line of each file, the splitSentences result for that
    line: a list of sentences, each sentence being a list of words.
    """
    for code in house_and_status:
        yield from _iter_split_lines(
            f"cleanedTextFull/{code}/Training/training_text{code}.txt"
        )


def splitDocumentALL(startSession, endSession, house_and_status=(18, 19, 28, 29)):
    """Stream the tokenised full text of each category code for a session range.

    startSession, endSession: values interpolated into the file name
        cleanedTextFull/<code>_<startSession> to <endSession> fullText.txt.
    house_and_status: iterable of category codes to read.

    Yields, once per line of each file, the splitSentences result for that
    line: a list of sentences, each sentence being a list of words.
    """
    for code in house_and_status:
        yield from _iter_split_lines(
            f"cleanedTextFull/{code}_{startSession} to {endSession} fullText.txt"
        )


    



# Input: sentence (list of words), model (gensim Word2Vec model), window (context window size, as in word2vec),
# debug (print intermediate calculations for debugging).

def score_sentence(sentence, model, window=7, debug=False):
    """Sum log(sigmoid(center . context)) over all in-vocabulary word pairs (Equation 1).

    sentence: sequence of words (must support len() and indexing).
    model: object exposing ``model.wv`` with ``word in model.wv`` and
        ``model.wv[word]`` (e.g. a gensim Word2Vec model).
    window: number of positions considered on each side of the centre word.
    debug: when True, print out-of-vocabulary words and every pair probability.

    Every ordered (centre, context) pair inside the window is scored, so two
    nearby words contribute once in each direction. Words missing from the
    vocabulary are skipped both as centre and as context.

    Returns the accumulated log score; 0.0 when no pair could be scored.

    NOTE(review): the pair probability here is the sigmoid of the dot product
    of two ``model.wv`` vectors. gensim also provides ``Word2Vec.score`` for
    hs=1 / negative=0 models -- confirm which quantity is intended.
    """
    log_prob = 0.0  # total log prob for the sentence
    sentence_length = len(sentence)
    if sentence_length == 0:
        return log_prob

    keyed_vectors = model.wv
    # Look each token up once instead of once per (centre, context) pair;
    # None marks an out-of-vocabulary word.
    vectors = [keyed_vectors[word] if word in keyed_vectors else None for word in sentence]
    # Pair details are only needed for the debug printout.
    word_pair_probs = [] if debug else None

    for index, center_vector in enumerate(vectors):
        if center_vector is None:
            if debug:
                print(f"Center word '{sentence[index]}' not in vocabulary.")
            continue

        start = max(0, index - window)
        end = min(sentence_length, index + window + 1)

        for j in range(start, end):
            if j == index:
                continue
            context_vector = vectors[j]
            if context_vector is None:
                if debug:
                    print(f"Context word '{sentence[j]}' not in vocabulary.")
                continue

            prob = sigmoid(np.dot(center_vector, context_vector))
            if debug:
                word_pair_probs.append((sentence[index], sentence[j], prob))

            # The small epsilon keeps log() finite if the sigmoid underflows to 0.
            log_prob += np.log(prob + 1e-10)

    if debug:
        print("\n--- Word Pair Probabilities ---")
        for center, context, prob in word_pair_probs:
            print(f"p({context} | {center}) = {prob:.6f}")

    return log_prob




# Score an entire document (S sentences) under all models (Equation 2)
# Input: sentences (a list of sentences), models (the dictionary of models), window (the window size passed to score_sentence).
# Output: a sentences x categories array (failed, successful, ...) with each sentence's score according to score_sentence.

def score_document(sentences, models, window=5):
    """
    Build the sentences x categories matrix of sentence scores.

    sentences: list of sentences, each sentence is a list of words
    models: dict of {category: Word2Vec model}; column order follows the
        dict's iteration order
    window: window size forwarded to score_sentence

    Returns a float array of shape (len(sentences), len(models)) whose
    [s, c] entry is score_sentence(sentences[s], c-th model, window).
    """
    sentence_scores = np.zeros((len(sentences), len(models)))
    category_models = list(models.values())

    # Each `row` is a view into sentence_scores, so writing to it fills the matrix.
    for row, sentence in zip(sentence_scores, sentences):
        row[:] = [
            score_sentence(sentence, category_model, window)
            for category_model in category_models
        ]

    return sentence_scores



# calculate document probabilities (Equation 5)

# Input: the S x C (sentences x categories) array of sentence scores.
# Output: a 1-D array with one entry per category: the average score over all sentences in the document.
def document_probabilities(sentence_scores):
    """Average the sentence scores of each category (column-wise mean)."""
    return np.mean(sentence_scores, axis=0)



# compute class probabilities ( Equation 3)

# Input: the array from document_probabilities.
# Output: normalized probabilities after Bayes' rule is applied. TODO: change the priors to correspond to each class.
def class_probabilities(doc_probs):
    """
    Compute class probabilities using Bayes rule.
    Assuming uniform priors.

    doc_probs: 1-D array-like of per-category document scores on a log scale
        (document_probabilities returns averages of the log values summed by
        score_sentence). Higher, i.e. less negative, means more likely.

    Returns an array of the same length with non-negative entries summing to 1.
    An empty input returns an empty array.

    Because the inputs are logs, Bayes' rule is applied in log space:
    posterior is proportional to exp(score + log prior). Dividing the raw
    log-scores by their sum would hand the largest share to the most
    negative (least likely) category.
    """
    scores = np.asarray(doc_probs, dtype=float)
    if scores.size == 0:
        return scores

    priors = np.ones(len(scores)) / len(scores)
    log_joint = scores + np.log(priors)
    # Shift by the maximum before exponentiating so that very negative
    # log-scores do not all underflow to zero.
    joint = np.exp(log_joint - np.max(log_joint))
    # bayes rule
    probs = joint / np.sum(joint)
    return probs



# Classify the document (Equation 6).
# Finds the largest value in the 1-D array from document_probabilities (the per-class average over sentences) and returns its index together with the array (for debugging).
 
def classify_document(sentence_scores):
    """Pick the category with the highest averaged document score.

    sentence_scores: sentences x categories score array.

    Returns (index of the largest entry, the per-category array produced by
    document_probabilities).
    """
    per_class_average = document_probabilities(sentence_scores)
    return np.argmax(per_class_average), per_class_average



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ander\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
## Evaluate the trained models 

def scoreSentences(listOfSentences, modelsList, predictedCategory):
    """Classify each text, print the details, and print the share of correct calls.

    listOfSentences: iterable of raw text strings; each one is tokenised with
        splitSentences and scored as a document of its own (window=5).
    modelsList: dict of {category name: model}; the key order defines which
        class index maps to which name.
    predictedCategory: the category name each text is expected to receive.

    A prediction counts as correct when it equals predictedCategory, or when
    both it and predictedCategory are one of "FailedCommons" / "FailedLords".
    Results are only printed; the function returns None.
    """
    failed_labels = ("FailedCommons", "FailedLords")
    labels = list(modelsList.keys())
    hits = 0
    misses = 0

    for position, raw_text in enumerate(listOfSentences, start=1):
        tokenised = splitSentences(raw_text)
        scores = score_document(tokenised, modelsList, window=5)
        posterior = class_probabilities(document_probabilities(scores))
        winner_idx, doc_probs = classify_document(scores)

        print(f" Predicting Sentence {tokenised}")
        print(f"\nSentence {position}:")
        winner = labels[winner_idx]
        print(f"Predicted class: {winner}")
        print(f"Document probabilities: {doc_probs}")
        print(f"Class probabilities: {posterior}")

        exact_match = winner == predictedCategory
        both_failed = winner in failed_labels and predictedCategory in failed_labels
        if exact_match or both_failed:
            hits += 1
        else:
            misses += 1

    total = misses + hits
    if total <= 0:
        print("Error in predicition.")
        return

    accuracy = hits / total
    print(f"Correct Prediction: {accuracy}")






In [7]:
# Load models and populate the models dictionary

# One saved Word2Vec model per category code (18/19 = failed, 28/29 = success).
model_18 = Word2Vec.load(r"C:\Users\ander\Downloads\MLP\FailedCommons_word2vec_workers=4, hs=1, sg=1, negative=0, min_count=10, vector_size =300,window = 7.model")
model_19 = Word2Vec.load(r"C:\Users\ander\Downloads\MLP\FailedLords_word2vec_workers=4, hs=1, sg=1, negative=0, min_count=10, vector_size =300,window = 7.model")
model_29 = Word2Vec.load(r"C:\Users\ander\Downloads\MLP\SuccessLords_word2vec_workers=4, hs=1, sg=1, negative=0, min_count=10, vector_size =300,window = 7.model")
# NOTE(review): this file name differs from the other three ("re_trained_vs_100");
# confirm it was trained with comparable settings before comparing scores across models.
model_28 = Word2Vec.load(r"C:\Users\ander\Downloads\MLP\Model_28_re_trained_vs_100")



# Category name -> list of category codes it covers.
houseDictionary = {'FailedCommons': [18], 'FailedLords': [19], "SuccesCommons": [28], "SuccessLords":[29]}

# Category code -> loaded model; codes without an entry are ignored below.
_model_for_code = {18: model_18, 19: model_19, 28: model_28, 29: model_29}

# Category name -> model. If a category lists several codes, the last known code wins.
models = {}

for key, indices in houseDictionary.items():
    for index in indices:
        if index in _model_for_code:
            models[key] = _model_for_code[index]



In [None]:
# Functions that score the text in each individual cell of the CSV and store the whole score array,
# as well as the predicted class, in the DataFrame.
from tqdm import tqdm
def scoreDocumentCSV(text, models, window=7):
    """Tokenise one CSV cell of text, score it under every model, and classify it.

    text: the cell value. Missing values (None / NaN, which is how pandas
        represents an empty CSV cell) are treated as an empty document; any
        other non-string value is converted with str().
    models: dict of {category: Word2Vec model}.
    window: window size forwarded to score_document (defaults to 7, the value
        this function previously hard-coded).

    Returns (index, doc_probs) exactly as produced by classify_document.
    An empty document has no sentences to average, so its doc_probs are NaN
    and the returned index is 0.
    """
    if not isinstance(text, str):
        # splitSentences runs re.split on its argument, which raises TypeError
        # for a float NaN; normalise before tokenising.
        text = "" if pd.isna(text) else str(text)

    filtered_sentences = splitSentences(text)
    sentence_scores = score_document(filtered_sentences, models, window=window)
    index, doc_probs = classify_document(sentence_scores)

    return index, doc_probs
        

def appendScoresToFile(df, models, path):
    """Score every row's 'extracted_text' and save the annotated frame as CSV.

    df: DataFrame with an 'extracted_text' column. It is modified in place:
        'predicted_class' and 'doc_probs' columns are (re)created and filled.
    models: dict of {category name: model}; the key order maps the class index
        returned by scoreDocumentCSV to a category name.
    path: destination CSV path; written without the index column.
    """
    class_names = list(models.keys())
    for column in ('predicted_class', 'doc_probs'):
        df[column] = None

    progress = tqdm(df.iterrows(), total=df.shape[0], desc="Processing rows")
    for row_label, record in progress:
        class_idx, class_scores = scoreDocumentCSV(record['extracted_text'], models)

        df.at[row_label, 'predicted_class'] = class_names[class_idx]
        df.at[row_label, 'doc_probs'] = class_scores

    df.to_csv(path, index=False)


In [30]:
## Score the training
# Category codes still to be scored (only 28 in this run).
remaining = [28]
basePath = r"C:\Users\ander\Downloads\MLP\cleanedTexWithID" 

for i in remaining:
    # Per-category training CSV: <basePath>\<code>\Training\training_text<code>.csv
    filePath = f"{i}\\Training\\training_text{i}.csv"
    path = os.path.join(basePath, filePath)
    # NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0
    # (superseded by on_bad_lines="skip"), so this call needs an older pandas.
    df = pd.read_csv(path, error_bad_lines=False)  
    print(f"Df size {len(df)}")
    # Adds the predicted_class / doc_probs columns to df and writes it back to
    # the same path, overwriting the CSV that was just read.
    appendScoresToFile(df, models, path)
    


Df size 402
DF size inside append scores 402


Processing rows: 100%|███████████████████████████████████████████████████████████████| 402/402 [47:32<00:00,  7.10s/it]


In [10]:
#### Score the validation and evaluate
# Category codes to evaluate, and the category name each code should be predicted as.
remaining = [28, 29, 18, 19]
accessKeys = { 18: 'FailedCommons', 19: 'FailedLords', 28: "SuccesCommons", 29: "SuccessLords"}
basePath = r"C:\Users\ander\Downloads\MLP\cleanedTexWithID" 


for i in remaining:
    # Per-category validation CSV: <basePath>\<code>\Validation\validation_text<code>.csv
    filePath = f"{i}\\Validation\\validation_text{i}.csv"
    path = os.path.join(basePath, filePath)
    # NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0
    # (superseded by on_bad_lines="skip"), so these calls need an older pandas.
    df = pd.read_csv(path, error_bad_lines=False) 
    print(f"Df {i}size {len(df)}")
    # Scores every row and overwrites the CSV at `path` with the added columns.
    appendScoresToFile(df, models, path)
    # Evaluate what was actually persisted, using the same frame for both the
    # hit count and the total.
    dfEvaluate = pd.read_csv(path, error_bad_lines=False) 

    total_bills = len(dfEvaluate)
    if total_bills == 0:
        # An empty validation file would otherwise raise ZeroDivisionError.
        print(f"No rows in {path}; skipping accuracy for {accessKeys[i]}")
        continue

    # A bill is correct only when predicted as this file's own category.
    correct_bills = int((dfEvaluate['predicted_class'] == accessKeys[i]).sum())
    accuracy = correct_bills / total_bills
    
    print(F" Accuracy of {accessKeys[i]} is {accuracy}")
    
## For SuccesCommons: counting both SuccessLords and SuccesCommons predictions as correct gives accuracy 0.8; counting only SuccesCommons gives 0.26.

Df 28size 45


Processing rows: 100%|█████████████████████████████████████████████████████████████████| 45/45 [09:10<00:00, 12.24s/it]


 Accuracy of SuccesCommons is 0.26666666666666666
Df 29size 22


Processing rows: 100%|█████████████████████████████████████████████████████████████████| 22/22 [00:27<00:00,  1.27s/it]


 Accuracy of SuccessLords is 0.8181818181818182
Df 18size 108


Processing rows: 100%|███████████████████████████████████████████████████████████████| 108/108 [01:36<00:00,  1.12it/s]


 Accuracy of FailedCommons is 0.6111111111111112
Df 19size 57


Processing rows: 100%|█████████████████████████████████████████████████████████████████| 57/57 [01:17<00:00,  1.36s/it]

 Accuracy of FailedLords is 0.6666666666666666





FailedCommons
FailedLords
SuccesCommons
SuccessLords
