In [47]:
import sys
import nltk
import numpy as np
import pandas as pd
from copy import deepcopy
import gensim
from gensim.models import Word2Vec
from gensim.models import Phrases
import re
from nltk.corpus import stopwords
from scipy.special import expit as sigmoid 
import os
import re
import fitz  


In [48]:
#Function Definitions 

# 1 Fail
# 2 Success
# 8 Commons
# 9 Lords
#18 failed commons


nltk.download('stopwords')




def splitSentences(text):
    
    sentences = re.split(r'\.\s+', text)
    stop_words = set(stopwords.words('english'))
    
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    filtered_sentences = []
    for sentence in sentences:
        words = sentence.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        filtered_sentences.append(filtered_words)
        
    return filtered_sentences

def splitDocument(house_and_status = [18,19,28,29]):
    for code in house_and_status: 
        file_name = f"cleanedTextFull/{code}/Training/training_text{code}.txt"
        try:
            with open(file_name, 'r', encoding="utf-8") as file:
                for line in file:
                    yield splitSentences(line)
            print(f"Read File{file_name}")
        except UnicodeDecodeError as e:
            print(f"Unicode decode error")
# returns a list of lists . List each sentnece, and inside there is a list of words for each sentence


def splitDocumentALL(startSession, endSession, house_and_status = [18,19,28,29]):
    for code in house_and_status: 
        file_name = f"cleanedTextFull/{code}_{startSession} to {endSession} fullText.txt"
        try:
            with open(file_name, 'r', encoding="utf-8") as file:
                for line in file:
                    yield splitSentences(line)
            print(f"Read File{file_name}")
        except UnicodeDecodeError as e:
            print(f"Unicode decode error")


    



# input (sentence: list of words, model: gensim model, window: window= windowSize of word2vec, 
#debug: print intermediate calculations for debugging)

def score_sentence(sentence, model, window=7, debug=False):
    log_prob = 0.0 # total log prob for the sentence
    sentence_length = len(sentence)
    word_pair_probs = []  

    # Code for equation 1 
    for index, center_word in enumerate(sentence):
        if center_word not in model.wv:
            if debug:
                print(f"Center word '{center_word}' not in vocabulary.")
            continue
        center_vector = model.wv[center_word]

        start = max(0, index - window)
        end = min(sentence_length, index + window + 1)

        for j in range(start, end):
            if j == index:
                continue
            context_word = sentence[j]
            if context_word not in model.wv:
                if debug:
                    print(f"Context word '{context_word}' not in vocabulary.")
                continue
            context_vector = model.wv[context_word]

            dot_product = np.dot(center_vector, context_vector)
            prob = sigmoid(dot_product)

            word_pair_probs.append((center_word, context_word, prob))

            log_prob += np.log(prob + 1e-10)

    if debug:
        print("\n--- Word Pair Probabilities ---")
        for center, context, prob in word_pair_probs:
            print(f"p({context} | {center}) = {prob:.6f}")

    return log_prob




# Score an entire document (S sentences) under all models (Equation 2)
# input (sentencces:  a list of sentences ,models: the dictionary of models, window: the window size for score sentences)
# outpur: a sentences x categories (failed , succesful ....) with eahc sentence score according to score_sentence

def score_document(sentences, models, window=5):
    """
    Compute the score x category matrix of sentence scores for a document.
    
    sentences: list of sentences, each sentence is a list of words
    models: dict of {category: Word2Vec model}
    """
    S = len(sentences)
    C = len(models)
    
    sentence_scores = np.zeros((S, C))
    
    for s_idx, sentence in enumerate(sentences):
        for c_idx, (category, model) in enumerate(models.items()):
            sentence_scores[s_idx, c_idx] = score_sentence(sentence, model, window)
    
    return sentence_scores



# calculate document probabilities (Equation 5)

# input: the sxc array
# output: a 1x cateories array with the average score for all sentences in document 
def document_probabilities(sentence_scores):

    return sentence_scores.mean(axis=0)



# compute class probabilities ( Equation 3)

# input:  the array from document_probabilities
#ouput: normalized probabilities after bayes rule is applied #todo: change the priors to correspond to each class 
def class_probabilities(log_doc_probs):

    num_classes = len(log_doc_probs)
    doc_probs = np.exp(log_doc_probs - np.max(log_doc_probs))  
    priors = np.ones(num_classes) / num_classes
    numerator = doc_probs * priors
    denominator = np.sum(numerator)
    probs = numerator / denominator
    
    return probs


# classify the document (Equation 6)
# checks which of the numbers in the 1d array from document probabilities (the average across the classes ) is biggest and returns the index and array (for debuging) 
 
def classify_document(sentence_scores):
    doc_probs = document_probabilities(sentence_scores)
    class_probs = class_probabilities(doc_probs)
    predicted_class_idx = np.argmax(class_probs)
    return predicted_class_idx, class_probs



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ander\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
## Evaluate the trained models 

def scoreSentences(listOfSentences, modelsList, predictedCategory):
    success= 0
    fail =0
    
    categories = list(modelsList.keys())
    
    for i, sentence_text in enumerate(listOfSentences, 1):
        document = splitSentences(sentence_text)
        sentence_scores = score_document(document, modelsList, window=5)
        doc_probs = document_probabilities(sentence_scores)
        probs = class_probabilities(doc_probs)
        predicted_idx, doc_probs = classify_document(sentence_scores)
        
        print(f" Predicting Sentence {document}")
        print(f"\nSentence {i}:")
        print(f"Predicted class: {categories[predicted_idx]}")
        print(f"Document probabilities: {doc_probs}")
        print(f"Class probabilities: {probs}")
        
        predicted_class = categories[predicted_idx]
        
        if (predicted_class == predictedCategory or
            (predicted_class in ["FailedCommons", "FailedLords"] and 
             predictedCategory in ["FailedCommons", "FailedLords"])):
            success += 1
        else:
            fail += 1
    total = fail + success
    
    if total > 0:
        accuracy = success / total
        print(f"Correct Prediction: {accuracy}")
    else:
        print("Error in predicition.")






In [50]:
#Load models and populate models dictionar

model_18 = Word2Vec.load(r"C:\Users\ander\Downloads\MLP\TitlesModels\FailedCommons_word2vec_titles.model")
model_19 = Word2Vec.load(r"C:\Users\ander\Downloads\MLP\TitlesModels\FailedLords_word2vec_titles.model")
model_29 = Word2Vec.load(r"C:\Users\ander\Downloads\MLP\TitlesModels\SuccessLords_word2vec_titles.model")
model_28 = Word2Vec.load(r"C:\Users\ander\Downloads\MLP\TitlesModels\SuccesCommons_word2vec_titles.model")



houseDictionary = {'FailedCommons': [18], 'FailedLords': [19], "SuccesCommons": [28], "SuccessLords":[29]}

models ={}

for key, indices in houseDictionary.items():
    for index in indices:
        if index == 18:
            models[key] = model_18
        elif index == 19:
            models[key] = model_19
        elif index == 28:
            models[key] = model_28
        elif index == 29:
            models[key] = model_29



In [51]:
# Methods to compute the scores for the sentences per individual cell and add the whole array 
#as well as the code for the actual predicted score 
from tqdm import tqdm
def scoreDocumentCSV(text, models):
    filtered_sentences = splitSentences(text)
    sentence_scores = score_document(filtered_sentences, models, window=7)
    index, doc_probs = classify_document(sentence_scores)
    
    return index, doc_probs
        

    
def appendScoresToFile(df, models, path):
    df['predicted_class'] = None
    df['doc_probs'] = None
    categories = list(models.keys())

    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing rows"):
        result_index, probs_array = scoreDocumentCSV(row['Long Title'], models)
        
        df.at[index, 'predicted_class'] = categories[result_index]
        df.at[index, 'doc_probs'] = probs_array
    df.to_csv(path, index=False)


In [36]:
#Split Into validation and 


In [52]:
import pandas as pd
import os
from tqdm import tqdm


remaining = [28, 29, 18, 19]
basePath = r"C:\Users\ander\Downloads\MLP\CSVFilesWithTitleScores"

for i in remaining:
    filePath = f"{i}\\Testing\\{i}_testing.csv"
    path = os.path.join(basePath, filePath)
    df = pd.read_csv(path, on_bad_lines='skip') 
    df['Long Title'] = df['Long Title'].astype(str) 
    print(f"Df size {len(df)}")
    appendScoresToFile(df, models, path)


Df size 58


Processing rows: 100%|████████████████████████████████████████████████████████████████| 58/58 [00:00<00:00, 231.98it/s]


Df size 18


Processing rows: 100%|████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 572.54it/s]


Df size 250


Processing rows: 100%|██████████████████████████████████████████████████████████████| 250/250 [00:00<00:00, 308.69it/s]


Df size 70


Processing rows: 100%|████████████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 344.96it/s]


In [46]:
#### Score the validation and evaluate
remaining = [28, 29, 18, 19]
accessKeys = { 18: 'FailedCommons', 19: 'FailedLords', 28: "SuccesCommons", 29: "SuccessLords"}
basePath = r"C:\Users\ander\Downloads\MLP\CSVFilesWithTitleScores"


models 

for i in remaining:
    filePath = f"{i}\\Validation\\{i}_validation.csv"
    path = os.path.join(basePath, filePath)
    df = pd.read_csv(path, error_bad_lines=False) 
    print(f"Df {i}size {len(df)}")
    #appendScoresToFile(df, models, path)
    dfEvaluate = pd.read_csv(path, error_bad_lines=False) 
    
    dfEvaluate['overall_accuracy'] = None
    correct_bills = 0
    for index, row in df.iterrows():
        if row['predicted_class'] == accessKeys[i]:
            correct_bills += 1
        if i == 18:
            if row['predicted_class'] == "FailedLords":
                correct_bills += 1
    total_bills = len(dfEvaluate)
    accuracy = correct_bills / total_bills
    
    print(F" Accuracy of {accessKeys[i]} is {accuracy}")
    
    

Df 28size 78
 Accuracy of SuccesCommons is 0.7564102564102564
Df 29size 29
 Accuracy of SuccessLords is 0.9655172413793104
Df 18size 243
 Accuracy of FailedCommons is 0.5925925925925926
Df 19size 73
 Accuracy of FailedLords is 0.6164383561643836


In [53]:

for i in remaining:
    filePath = f"{i}\\Training\\{i}_training.csv"
    path = os.path.join(basePath, filePath)
    df = pd.read_csv(path, error_bad_lines=False) 
    print(f"Df {i}size {len(df)}")
    #appendScoresToFile(df, models, path)
    dfEvaluate = pd.read_csv(path, error_bad_lines=False) 
    
    dfEvaluate['overall_accuracy'] = None
    correct_bills = 0
    for index, row in df.iterrows():
        if row['predicted_class'] == accessKeys[i]:
            correct_bills += 1
        if i == 18:
            if row['predicted_class'] == "FailedLords":
                correct_bills += 1
    total_bills = len(dfEvaluate)
    accuracy = correct_bills / total_bills
    
    print(F" Accuracy of {accessKeys[i]} is {accuracy}")
    
    



  exec(code_obj, self.user_global_ns, self.user_ns)


Df 28size 465
 Accuracy of SuccesCommons is 0.621505376344086
Df 29size 149
 Accuracy of SuccessLords is 0.7449664429530202
Df 18size 1938
 Accuracy of FailedCommons is 0.567079463364293
Df 19size 550
 Accuracy of FailedLords is 0.5872727272727273


In [59]:
import pandas as pd
import os


remaining = [28, 29, 18, 19]
accessKeys = {18: 'FailedCommons', 19: 'FailedLords', 28: "SuccesCommons", 29: "SuccessLords"}
basePath = r"C:\Users\ander\Downloads\MLP\CSVFilesWithTitleScores"


class_metrics = {key: {'TP': 0, 'FP': 0, 'FN': 0} for key in accessKeys}

for i in remaining:
    filePath = f"{i}\\Training\\{i}_training.csv"
    path = os.path.join(basePath, filePath)
    df = pd.read_csv(path, error_bad_lines=False)
    print(f"Df {i}size {len(df)}")


    correct_bills = 0
    for index, row in df.iterrows():
        predicted_class = row['predicted_class']
        actual_class = accessKeys[i]

  
        if  predicted_class == actual_class:
            correct_bills += 1
            class_metrics[i]['TP'] += 1
        if i == 18:
            if predicted_class == "FailedLords":
                correct_bills += 1
                class_metrics[i]['TP'] += 1
        else:
            class_metrics[i]['FN'] += 1
            if predicted_class in accessKeys.values():  # Check if the prediction is one of the known classes
                predicted_key = [k for k, v in accessKeys.items() if v == predicted_class][0]
                class_metrics[predicted_key]['FP'] += 1


    total_bills = len(df)
    accuracy = correct_bills / total_bills
    print(F"Accuracy of {accessKeys[i]} is {accuracy}")

for key, metrics in class_metrics.items():
    TP = metrics['TP']
    FP = metrics['FP']
    FN = metrics['FN']
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Metrics for {accessKeys[key]}: Precision: {precision}, Recall: {recall}, F1 Score: {f1_score}")


Df 28size 465
Accuracy of SuccesCommons is 0.621505376344086
Df 29size 149
Accuracy of SuccessLords is 0.7449664429530202
Df 18size 1938
Accuracy of FailedCommons is 0.567079463364293
Df 19size 550
Accuracy of FailedLords is 0.5872727272727273
Metrics for FailedCommons: Precision: 0.9105219552609777, Recall: 1.0, F1 Score: 0.95316565481353
Metrics for FailedLords: Precision: 0.4581560283687943, Recall: 0.3699885452462772, F1 Score: 0.4093789607097592
Metrics for SuccesCommons: Precision: 0.43655589123867067, Recall: 0.383289124668435, F1 Score: 0.4081920903954802
Metrics for SuccessLords: Precision: 0.26941747572815533, Recall: 0.4269230769230769, F1 Score: 0.3303571428571428


In [60]:
for i in remaining:
    filePath = f"{i}\\Validation\\{i}_validation.csv"
    path = os.path.join(basePath, filePath)
    df = pd.read_csv(path, error_bad_lines=False)
    print(f"Df {i}size {len(df)}")


    correct_bills = 0
    for index, row in df.iterrows():
        predicted_class = row['predicted_class']
        actual_class = accessKeys[i]

  
        if  predicted_class == actual_class:
            correct_bills += 1
            class_metrics[i]['TP'] += 1
        if i == 18:
            if predicted_class == "FailedLords":
                correct_bills += 1
                class_metrics[i]['TP'] += 1
        else:
            class_metrics[i]['FN'] += 1
            if predicted_class in accessKeys.values():  # Check if the prediction is one of the known classes
                predicted_key = [k for k, v in accessKeys.items() if v == predicted_class][0]
                class_metrics[predicted_key]['FP'] += 1


    total_bills = len(df)
    accuracy = correct_bills / total_bills
    print(F"Accuracy of {accessKeys[i]} is {accuracy}")

for key, metrics in class_metrics.items():
    TP = metrics['TP']
    FP = metrics['FP']
    FN = metrics['FN']
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Metrics for {accessKeys[key]}: Precision: {precision}, Recall: {recall}, F1 Score: {f1_score}")

Df 28size 78
Accuracy of SuccesCommons is 0.7564102564102564
Df 29size 29
Accuracy of SuccessLords is 0.9655172413793104
Df 18size 243
Accuracy of FailedCommons is 0.5925925925925926
Df 19size 73
Accuracy of FailedLords is 0.6164383561643836
Metrics for FailedCommons: Precision: 0.9139705882352941, Recall: 1.0, F1 Score: 0.9550518632347291
Metrics for FailedLords: Precision: 0.45714285714285713, Recall: 0.3713420787083754, F1 Score: 0.40979955456570155
Metrics for SuccesCommons: Precision: 0.4393939393939394, Recall: 0.39057239057239057, F1 Score: 0.4135472370766488
Metrics for SuccessLords: Precision: 0.2865979381443299, Recall: 0.4384858044164038, F1 Score: 0.3466334164588528
