In [None]:
# To look at trying to take intermediate layer, not simply the last layer of BERT embeddings

In [1]:
# Installing packages
import torch
from transformers import BertTokenizer, BertModel
import torch.nn.functional as F
from torch.nn.functional import cosine_similarity
import random
#!pip install spacy

In [2]:
# Code to load the BERT tokenizer + model - sanity check to check models can be loaded
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [17]:
# Function to chunk the text into smaller parts to meet limit of BERT model of 512 tokens + testing with Wealth of Nations by Adam Smith
def chunk_text(text, max_length=500):
    # Tokenize without truncation so we keep all tokens
    tokens = bert_tokenizer.tokenize(text)

    # Break into chunks
    chunks = []
    for i in range(0, len(tokens), max_length):
        chunk_tokens = tokens[i:i + max_length]
        chunk_text = bert_tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_text)
    return chunks

# Read the text file of Wealth of Nations
with open('A_wealth_of_Nations_Cleaned.txt', 'r') as file:
    text = file.read()

# Get the chunks
chunks = chunk_text(text)

print(chunks[0])

adam smith an inquiry into the nature and causes of the wealth of nations introduction and plan of the work . the annual labour of every nation is the fund which originally supplies it with all the necessaries and conveniencies of life which it annually consumes , and which consist always either in the immediate produce of that labour , or in what is purchased with that produce from other nations . according , therefore , as this produce , or what is purchased with it , bears a greater or smaller proportion to the number of those who are to consume it , the nation will be better or worse supplied with all the necessaries and conveniencies for which it has occasion . but this proportion must in every nation be regulated by two different circumstances : first , by the skill , dexterity , and judgment with which its labour is generally applied ; and , secondly , by the proportion between the number of those who are employed in useful labour , and that of those who are not so employed . wh

In [18]:
print(chunks[50]) # sanity check

##ter , partly by the general circumstances of the society or neighbourhood in which the land is situated , and partly by the natural or improved fertility of the land . these ordinary or average rates may be called the natural rates of wages , profit and rent , at the time and place in which they commonly prevail . when the price of any commodity is neither more nor less than what is sufficient to pay the rent of the land , the wages of the labour , and the profits of the stock employed in raising , preparing , and bringing it to market , according to their natural rates , the commodity is then sold for what may be called its natural price . the commodity is then sold precisely for what it is worth , or for what it really costs the person who brings it to market ; for though , in common language , what is called the prime cost of any commodity does not comprehend the profit of the person who is to sell it again , yet , if he sells it at a price which does not allow him the ordinary ra

In [None]:
# ------------------------------------------------------------------------------------------------------------------------------------------------------
#
#
#
#                                                       Descriptive Analysis: TF-IDF
#
#
#
# ------------------------------------------------------------------------------------------------------------------------------------------------------

In [8]:
# Installing spacy english model
#!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------ --------------------------------- 2.1/12.8 MB 13.1 MB/s eta 0:00:01
     ---------------- ----------------------- 5.2/12.8 MB 13.3 MB/s eta 0:00:01
     ------------------------- -------------- 8.1/12.8 MB 13.2 MB/s eta 0:00:01
     -------------------------------- ------ 10.7/12.8 MB 13.2 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 13.1 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 12.0 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [9]:
# Performing TF-IDF on the entire text of Wealth of Nations by Adam Smith and Communist Manifesto by Karl Marx
#   Aim: to extract intersect of economics terms (among the top 50 TF-IDF of each text)
import re
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 5000000 

# Reading texts separately
with open('A_wealth_of_Nations_Cleaned.txt', 'r', encoding='utf-8') as file:
    text1 = file.read()

with open('Communist Manifesto.txt', 'r', encoding='utf-8') as file:
    text2 = file.read()

# Text cleaning + lemmatization function
def preprocess(text):
    text = text.lower()                                # Convert all characters to lowercase
    text = re.sub(r'\d+', '', text)                    # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)                # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()           # Normalize whitespace
    
    doc = nlp(text)  # Processing the cleaned text using a spaCy NLP pipeline
    lemmas = [token.lemma_ for token in doc 
              if not token.is_stop and not token.is_punct and token.lemma_ != '-PRON-']
    return ' '.join(lemmas)

# Preprocessing each document
cleaned_texts = [preprocess(text1), preprocess(text2)]

# Performing TF-IDF vectorization across the two documents
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000) # removes stopwords
X = vectorizer.fit_transform(cleaned_texts)
terms = vectorizer.get_feature_names_out()

# Getting TF-IDF scores for each doc
tfidf_scores_doc1 = X.toarray()[0]
tfidf_scores_doc2 = X.toarray()[1]

# Getting top terms per document
def get_top_terms(tfidf_scores, terms, top_n=50):
    top_indices = np.argsort(tfidf_scores)[::-1]
    return [(terms[i], tfidf_scores[i]) for i in top_indices[:top_n]]

top_terms_doc1 = get_top_terms(tfidf_scores_doc1, terms)
top_terms_doc2 = get_top_terms(tfidf_scores_doc2, terms)

# Displaying results
print("\nTop TF-IDF Terms in The Wealth of Nations:\n")
for word, score in top_terms_doc1:
    print(f"{word:<15} {score:.4f}")

print("\n Top TF-IDF Terms in The Communist Manifesto:\n")
for word, score in top_terms_doc2:
    print(f"{word:<15} {score:.4f}")



Top TF-IDF Terms in The Wealth of Nations:

great           0.4056
country         0.2599
price           0.1915
produce         0.1585
trade           0.1579
labour          0.1496
time            0.1420
revenue         0.1418
good            0.1414
land            0.1273
quantity        0.1249
different       0.1218
value           0.1200
pay             0.1177
stock           0.1171
people          0.1126
capital         0.1105
money           0.1093
profit          0.1060
employ          0.1022
year            0.0997
silver          0.0950
expense         0.0891
market          0.0889
corn            0.0878
increase        0.0846
tax             0.0842
proportion      0.0751
particular      0.0737
gold            0.0730
rent            0.0716
order           0.0705
foreign         0.0697
state           0.0685
manufacture     0.0682
present         0.0675
commodity       0.0674
man             0.0665
occasion        0.0655
annual          0.0646
suppose         0.0644
colony      

In [6]:
# Performing TF-IDF on the entire text of Wealth of Nations by Adam Smith and Communist Manifesto by Karl Marx
#   Aim: to extract intersect of economics terms (among the top 200 TF-IDF of each text)
import re
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 5000000 

# Reading texts separately
with open('A_wealth_of_Nations_Cleaned.txt', 'r', encoding='utf-8') as file:
    text1 = file.read()

with open('Communist Manifesto.txt', 'r', encoding='utf-8') as file:
    text2 = file.read()

# Text cleaning + lemmatization function
def preprocess(text):
    text = text.lower()                                # Convert all characters to lowercase
    text = re.sub(r'\d+', '', text)                    # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)                # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()           # Normalize whitespace
    
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc 
              if not token.is_stop and not token.is_punct and token.lemma_ != '-PRON-']
    return ' '.join(lemmas)

# Preprocessing each document
cleaned_texts = [preprocess(text1), preprocess(text2)]

# TF-IDF vectorization across the two documents
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000) # removes stopwords
X = vectorizer.fit_transform(cleaned_texts)
terms = vectorizer.get_feature_names_out()

# Getting TF-IDF scores for each doc
tfidf_scores_doc1 = X.toarray()[0]
tfidf_scores_doc2 = X.toarray()[1]

# Getting top terms per document
def get_top_terms(tfidf_scores, terms, top_n=200):
    top_indices = np.argsort(tfidf_scores)[::-1]
    return [(terms[i], tfidf_scores[i]) for i in top_indices[:top_n]]

top_terms_doc1 = get_top_terms(tfidf_scores_doc1, terms)
top_terms_doc2 = get_top_terms(tfidf_scores_doc2, terms)

# Displaying results
print("\nTop TF-IDF Terms in The Wealth of Nations:\n")
for word, score in top_terms_doc1:
    print(f"{word:<15} {score:.4f}")

print("\nTop TF-IDF Terms in The Communist Manifesto:\n")
for word, score in top_terms_doc2:
    print(f"{word:<15} {score:.4f}")



Top TF-IDF Terms in The Wealth of Nations:

great           0.4056
country         0.2599
price           0.1915
produce         0.1585
trade           0.1579
labour          0.1496
time            0.1420
revenue         0.1418
good            0.1414
land            0.1273
quantity        0.1249
different       0.1218
value           0.1200
pay             0.1177
stock           0.1171
people          0.1126
capital         0.1105
money           0.1093
profit          0.1060
employ          0.1022
year            0.0997
silver          0.0950
expense         0.0891
market          0.0889
corn            0.0878
increase        0.0846
tax             0.0842
proportion      0.0751
particular      0.0737
gold            0.0730
rent            0.0716
order           0.0705
foreign         0.0697
state           0.0685
manufacture     0.0682
present         0.0675
commodity       0.0674
man             0.0665
occasion        0.0655
annual          0.0646
suppose         0.0644
colony      

In [None]:
# Words extracted that are the intersection of the TF-IDF for top 200 words which are solely and unambiguously economic terms:

# - Produce
# - Trade
# - Labour
# - Labourer (excluded below as bert uses subword tokenization so this will never be detected and will instead be considered as labour)
# - Land
# - Capital
# - Market
# - Manufacture
# - Industry
# - Work
# - Government
# - Private
# - Wage
# - Demand
# - Exchange
# - Subsistence


In [None]:
# ------------------------------------------------------------------------------------------------------------------------------------------------------
#
#
#
#                                   Model: Getting contextual embeddings from BERT for economic terms extracted using TF-IDF
#
#
#
# ------------------------------------------------------------------------------------------------------------------------------------------------------

In [3]:
# Pairwise in-text cosine similarity of BERT contextual embeddings between all instances of each economic term in 
#   Wealth of Nations
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
import json  # Optional: for exporting results

# Load model and tokenizer
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

# Load text corpus
with open('A_wealth_of_Nations_Cleaned.txt', 'r') as file:
    corpus = file.read()

chunks = corpus.split("\n\n")  # Split by paragraphs
context_window = 5 # length of context window
max_len = 512 # Bert's max token input length
stride = 10  # Overlap between windows when chunking

target_words = [
    "produce", "trade", "labour", "land", "capital", "market", "manufacture", "industry", "work",
    "government", "private", "wage", "demand", "exchange", "subsistence"
]

# Dictionary to store similarities per word
similarities_dict_WoN = {}

# Function to process a sub-chunk of text and extract embeddings of all instances of the target word
def process_text_window(text, chunk_id, subchunk_id, target_word, occurrences):
    text = text.lower()  # Normalize to lowercase
    inputs = bert_tokenizer(text, return_tensors="pt", truncation=True, max_length=max_len)
    tokens = bert_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Disabling gradient calc. 
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[0]

    # Finding index of where token matches target word
    word_indices = [i for i, token in enumerate(tokens) if token.lower() == target_word.lower()]

    # For each instance, storing the context and embedding
    for idx in word_indices:
        emb = embeddings[idx]
        start = max(idx - context_window, 0)
        end = min(idx + context_window + 1, len(tokens))
        context_tokens = tokens[start:end]
        context_text = bert_tokenizer.convert_tokens_to_string(context_tokens)
        occurrences.append({
            "chunk_id": chunk_id,
            "subchunk_id": subchunk_id,
            "token_idx": idx,
            "embedding": emb,
            "context": context_text
        })

# Function to process one target word + generate output
def process_target_word(target_word):
    occurrences = []
    similarities = []

    # Go through each paragraph chunk
    for chunk_id, chunk in enumerate(chunks):
        tokens = bert_tokenizer.tokenize(chunk)
        if len(tokens) <= max_len:
            text = bert_tokenizer.convert_tokens_to_string(tokens)
            process_text_window(text, chunk_id, subchunk_id=0, target_word=target_word, occurrences=occurrences)
        else:
            for i in range(0, len(tokens), max_len - stride):
                sub_tokens = tokens[i:i+max_len]
                text = bert_tokenizer.convert_tokens_to_string(sub_tokens)
                process_text_window(text, chunk_id, subchunk_id=i // (max_len - stride), target_word=target_word, occurrences=occurrences)

    if len(occurrences) < 2:
        print(f"Less than two total occurrences of '{target_word}' found.")
        similarities_dict_WoN[target_word] = []
        return

    output_lines = []
    total_similarity = 0.0
    count = 0

    output_lines.append(f"Total '{target_word}' occurrences found: {len(occurrences)}\n")

    # Compute similarities
    for i in range(len(occurrences)):
        for j in range(i + 1, len(occurrences)):
            emb1 = occurrences[i]["embedding"]
            emb2 = occurrences[j]["embedding"]
            similarity = F.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()
            similarities.append(similarity)

            total_similarity += similarity
            count += 1

            output_lines.append(f"Pair {count}:")
            output_lines.append(f" - Chunk IDs: {occurrences[i]['chunk_id']} & {occurrences[j]['chunk_id']}")
            output_lines.append(f" - Subchunk IDs: {occurrences[i]['subchunk_id']} & {occurrences[j]['subchunk_id']}")
            output_lines.append(f" - Token positions: {occurrences[i]['token_idx']} & {occurrences[j]['token_idx']}")
            output_lines.append(f" - Context 1: \"{occurrences[i]['context']}\"")
            output_lines.append(f" - Context 2: \"{occurrences[j]['context']}\"")
            output_lines.append(f" - Cosine Similarity: {similarity:.4f}\n")

    average_similarity = total_similarity / count
    output_lines.append(f"\nAverage Cosine Similarity across all pairs for '{target_word}': {average_similarity:.4f}")

    # Save output to file
    filename = f"output_WoN_pairwise_similarity_{target_word}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(output_lines))

    print(f"Results saved to {filename}")
    similarities_dict_WoN[target_word] = similarities

# Process each target word
for word in target_words:
    process_target_word(word)

Results saved to output_WoN_pairwise_similarity_produce.txt
Results saved to output_WoN_pairwise_similarity_trade.txt
Results saved to output_WoN_pairwise_similarity_labour.txt
Results saved to output_WoN_pairwise_similarity_land.txt
Results saved to output_WoN_pairwise_similarity_capital.txt
Results saved to output_WoN_pairwise_similarity_market.txt
Results saved to output_WoN_pairwise_similarity_manufacture.txt
Results saved to output_WoN_pairwise_similarity_industry.txt
Results saved to output_WoN_pairwise_similarity_work.txt
Results saved to output_WoN_pairwise_similarity_government.txt
Results saved to output_WoN_pairwise_similarity_private.txt
Results saved to output_WoN_pairwise_similarity_wage.txt
Results saved to output_WoN_pairwise_similarity_demand.txt
Results saved to output_WoN_pairwise_similarity_exchange.txt
Results saved to output_WoN_pairwise_similarity_subsistence.txt


In [7]:
# Saving the similarities of contextual embeddings for each word to an Excel file
import pandas as pd

# Function to pad lists in the dictionary so all have equal length ( for df conversion)
def pad_dict_values(data):
    max_len = max(len(v) for v in data.values())  # Find the longest list
    for key in data:
        while len(data[key]) < max_len:
            data[key].append(None)  # Pad shorter lists with None
    return data

# Padding the similarity lists to align lengths
padded_data = pad_dict_values(similarities_dict_WoN)

# Converting the padded dictionary to a DataFrame + export to Excel
df = pd.DataFrame(padded_data)
df.to_excel("similarities_dict_WoN.xlsx", index=False)

print("dict saved to excel")

dict saved to excel


In [4]:
# Pairwise in-text cosine similarity of embeddings between instances of each economic term in Communist Manifesto
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel

# Load model and tokenizer
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

# Load text corpus
with open('Communist Manifesto.txt', 'r', encoding='utf-8') as file:
    corpus = file.read()

chunks = corpus.split("\n\n")  # Split by paragraphs
context_window = 5 # length of context window
max_len = 512 # Bert's max token input length
stride = 10  # Overlap between windows when chunking

target_words = ["produce", "trade", "labour", "land", "capital", "market", "manufacture", "industry", "work", "government", "private",
                "wage", "demand", "exchange", "subsistence"]  # List of words to analyze

# Dictionary to store similarities for each word
similarities_dict_CM = {}

# Function to process a sub-chunk of text and extract embeddings of all instances of the target word
def process_text_window(text, chunk_id, subchunk_id, target_word, occurrences):
    text = text.lower()  # make text lowercase to prevent case-sensitivity
    inputs = bert_tokenizer(text, return_tensors="pt", truncation=True, max_length=max_len)
    tokens = bert_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Disabling grad calc.
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[0]

    # Finding instances of where the token matches the target word
    word_indices = [i for i, token in enumerate(tokens) if token.lower() == target_word.lower()]
    
    for idx in word_indices:
        emb = embeddings[idx]
        start = max(idx - context_window, 0)
        end = min(idx + context_window + 1, len(tokens))
        context_tokens = tokens[start:end]
        context_text = bert_tokenizer.convert_tokens_to_string(context_tokens)
        occurrences.append({
            "chunk_id": chunk_id,
            "subchunk_id": subchunk_id,
            "token_idx": idx,
            "embedding": emb,
            "context": context_text
        })

# Function to process one target word + generate output
def process_target_word(target_word):
    occurrences = []
    similarities = []  # List to hold cosine similarities for this word

    # Go through each paragraph chunk
    for chunk_id, chunk in enumerate(chunks):
        tokens = bert_tokenizer.tokenize(chunk)
        if len(tokens) <= max_len:
            text = bert_tokenizer.convert_tokens_to_string(tokens)
            process_text_window(text, chunk_id, subchunk_id=0, target_word=target_word, occurrences=occurrences)
        else:
            for i in range(0, len(tokens), max_len - stride):
                sub_tokens = tokens[i:i+max_len]
                text = bert_tokenizer.convert_tokens_to_string(sub_tokens)
                process_text_window(text, chunk_id, subchunk_id=i // (max_len - stride), target_word=target_word, occurrences=occurrences)

    if len(occurrences) < 2:
        print(f"Less than two total occurrences of '{target_word}' found.")
        similarities_dict_CM[target_word] = []
        return

    output_lines = []
    total_similarity = 0.0
    count = 0

    output_lines.append(f"Total '{target_word}' occurrences found: {len(occurrences)}\n")

    # Compute similarities
    for i in range(len(occurrences)):
        for j in range(i + 1, len(occurrences)):
            emb1 = occurrences[i]["embedding"]
            emb2 = occurrences[j]["embedding"]
            similarity = F.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()
            similarities.append(similarity)

            total_similarity += similarity
            count += 1

            output_lines.append(f"Pair {count}:")
            output_lines.append(f" - Chunk IDs: {occurrences[i]['chunk_id']} & {occurrences[j]['chunk_id']}")
            output_lines.append(f" - Subchunk IDs: {occurrences[i]['subchunk_id']} & {occurrences[j]['subchunk_id']}")
            output_lines.append(f" - Token positions: {occurrences[i]['token_idx']} & {occurrences[j]['token_idx']}")
            output_lines.append(f" - Context 1: \"{occurrences[i]['context']}\"")
            output_lines.append(f" - Context 2: \"{occurrences[j]['context']}\"")
            output_lines.append(f" - Cosine Similarity: {similarity:.4f}\n")

    average_similarity = total_similarity / count
    output_lines.append(f"\nAverage Cosine Similarity across all pairs for '{target_word}': {average_similarity:.4f}")

    # Save to file
    filename = f"output_CM_pairwise_similarity_{target_word}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(output_lines))

    print(f"Results saved to {filename}")
    similarities_dict_CM[target_word] = similarities  # Store similarities for this word

# Loop through all target words
for word in target_words:
    process_target_word(word)


Results saved to output_CM_pairwise_similarity_produce.txt
Results saved to output_CM_pairwise_similarity_trade.txt
Results saved to output_CM_pairwise_similarity_labour.txt
Results saved to output_CM_pairwise_similarity_land.txt
Results saved to output_CM_pairwise_similarity_capital.txt
Results saved to output_CM_pairwise_similarity_market.txt
Results saved to output_CM_pairwise_similarity_manufacture.txt
Results saved to output_CM_pairwise_similarity_industry.txt
Results saved to output_CM_pairwise_similarity_work.txt
Results saved to output_CM_pairwise_similarity_government.txt
Results saved to output_CM_pairwise_similarity_private.txt
Results saved to output_CM_pairwise_similarity_wage.txt
Results saved to output_CM_pairwise_similarity_demand.txt
Results saved to output_CM_pairwise_similarity_exchange.txt
Results saved to output_CM_pairwise_similarity_subsistence.txt


In [8]:
# Saving the similarities of contextual embeddings for each word to an Excel file
import pandas as pd

# Function to pad all lists (using None items) in the dictionary to the same length
def pad_dict_values(data):
    max_len = max(len(v) for v in data.values())  # Find the maximum list length
    for key in data:
        while len(data[key]) < max_len:
            data[key].append(None)  # Pad shorter lists with None for alignment
    return data

# Pad the lists in the dictionary for df conversion
padded_data = pad_dict_values(similarities_dict_CM)

# Convert the padded dictionary to a df + export it to an Excel file
df = pd.DataFrame(padded_data)
df.to_excel("similarities_dict_CM.xlsx", index=False)

print("dict saved to excel")

dict saved to excel


In [11]:
# Import necessary libraries
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel

# Load the BERT tokenizer and model (pretrained, lowercase version)
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode so it doesn’t compute gradients

# Load the two input texts for comparison
with open('A_wealth_of_Nations_Cleaned.txt', 'r') as file1, open('Communist Manifesto.txt', 'r', encoding='utf-8') as file2:
    text1 = file1.read()
    text2 = file2.read()

texts = [text1, text2]

# List of economic-related words we want to analyze in both texts
target_words = ["produce", "trade", "labour", "land", "capital", "market", "manufacture", 
                "industry", "work", "government", "private", "wage", "demand", "exchange", "subsistence"]

# Parameters for processing
context_window = 5   # How many tokens to include before and after the target word
max_len = 512        # Max number of tokens BERT can handle
stride = 10          # Overlap between sliding windows (to not miss words at the boundary)

# This will hold the cosine similarity scores for all the target words
all_similarities_cross = {}

# Goes through the text and extracts all relevant windows that might contain the target word
def process_text(text, text_index, target_word, occurrences):
    chunks = text.split("\n\n")  # Split the text into paragraphs
    for chunk_id, chunk in enumerate(chunks):
        tokens = bert_tokenizer.tokenize(chunk)
        # If the paragraph fits within BERT’s limit, process directly
        if len(tokens) <= max_len:
            input_text = bert_tokenizer.convert_tokens_to_string(tokens)
            process_text_window(input_text, text_index, chunk_id, subchunk_id=0, target_word=target_word, occurrences=occurrences)
        else:
            # If too long, split into overlapping subchunks (sliding window approach)
            for i in range(0, len(tokens), max_len - stride):
                sub_tokens = tokens[i:i+max_len]
                input_text = bert_tokenizer.convert_tokens_to_string(sub_tokens)
                process_text_window(input_text, text_index, chunk_id, subchunk_id=i // (max_len - stride), target_word=target_word, occurrences=occurrences)

# Looks at one chunk/subchunk, finds the target word, and grabs its embedding + context
def process_text_window(text, text_index, chunk_id, subchunk_id, target_word, occurrences):
    inputs = bert_tokenizer(text, return_tensors="pt", truncation=True, max_length=max_len)
    tokens = bert_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    with torch.no_grad():  # Disable gradient computation (saves memory and time)
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[0]  # Last hidden layer of BERT for each token

    # Find all tokens that exactly match the target word
    word_indices = [i for i, token in enumerate(tokens) if token.lower() == target_word.lower()]
    
    for idx in word_indices:
        emb = embeddings[idx]
        # Grab some context around the word for reporting
        start = max(idx - context_window, 0)
        end = min(idx + context_window + 1, len(tokens))
        context_tokens = tokens[start:end]
        context_text = bert_tokenizer.convert_tokens_to_string(context_tokens)
        # Save everything we need about this occurrence
        occurrences[text_index].append({
            "chunk_id": chunk_id,
            "subchunk_id": subchunk_id,
            "token_idx": idx,
            "embedding": emb,
            "context": context_text
        })

# Basic cosine similarity between two vectors
def cosine_similarity(e1, e2):
    return F.cosine_similarity(e1.unsqueeze(0), e2.unsqueeze(0)).item()

# Loop over each target word and compare embeddings across the two texts
for target_word in target_words:
    print(f"\nProcessing word: '{target_word}'")
    occurrences = [[], []]  # Store occurrences separately for each text
    similarities = []       # Similarities for this word

    # Run the processing for both texts
    process_text(text1, 0, target_word, occurrences)
    process_text(text2, 1, target_word, occurrences)

    output_filename = f"output_pairwise_cross_{target_word}.txt"

    # If one of the texts doesn't have the word, skip it
    if len(occurrences[0]) == 0 or len(occurrences[1]) == 0:
        print(f"No matching '{target_word}' occurrences in both texts.")
        with open(output_filename, "w", encoding="utf-8") as f:
            f.write(f"No matching '{target_word}' occurrences in both texts.\n")
        all_similarities_cross[target_word] = []
    else:
        total_similarity = 0.0
        count = 0
        with open(output_filename, "w", encoding="utf-8") as f:
            # Compare every occurrence from text1 with every occurrence from text2
            for i, occ1 in enumerate(occurrences[0]):
                for j, occ2 in enumerate(occurrences[1]):
                    similarity = cosine_similarity(occ1["embedding"], occ2["embedding"])
                    similarities.append(similarity)
                    total_similarity += similarity
                    count += 1

                    # Write out the pair info and similarity score
                    f.write(f"\nPair {count}:\n")
                    f.write(f" - Text1 Chunk ID: {occ1['chunk_id']}, Subchunk ID: {occ1['subchunk_id']}\n")
                    f.write(f" - Text2 Chunk ID: {occ2['chunk_id']}, Subchunk ID: {occ2['subchunk_id']}\n")
                    f.write(f" - Context 1: \"{occ1['context']}\"\n")
                    f.write(f" - Context 2: \"{occ2['context']}\"\n")
                    f.write(f" - Cosine Similarity: {similarity:.4f}\n")

            # Compute and log the average similarity
            average_similarity = total_similarity / count if count > 0 else 0.0
            f.write(f"\n\nAverage Cross-Text Cosine Similarity for '{target_word}': {average_similarity:.4f}\n")

        print(f"Finished processing '{target_word}'. Saved to: {output_filename}")
        all_similarities_cross[target_word] = similarities



Processing word: 'produce'
Finished processing 'produce'. Saved to: output_pairwise_cross_produce.txt

Processing word: 'trade'
Finished processing 'trade'. Saved to: output_pairwise_cross_trade.txt

Processing word: 'labour'
Finished processing 'labour'. Saved to: output_pairwise_cross_labour.txt

Processing word: 'land'
Finished processing 'land'. Saved to: output_pairwise_cross_land.txt

Processing word: 'capital'
Finished processing 'capital'. Saved to: output_pairwise_cross_capital.txt

Processing word: 'market'
Finished processing 'market'. Saved to: output_pairwise_cross_market.txt

Processing word: 'manufacture'
Finished processing 'manufacture'. Saved to: output_pairwise_cross_manufacture.txt

Processing word: 'industry'
Finished processing 'industry'. Saved to: output_pairwise_cross_industry.txt

Processing word: 'work'
Finished processing 'work'. Saved to: output_pairwise_cross_work.txt

Processing word: 'government'
Finished processing 'government'. Saved to: output_pairwi

In [12]:
# Saving between-text semantic similarities to an Excel file
import pandas as pd

# Padding function to make all value lists the same length (so we can create a proper DataFrame)
def pad_dict_values(data):
    max_len = max(len(v) for v in data.values())  # Find the longest list
    for key in data:
        while len(data[key]) < max_len:  # Pad shorter lists with None
            data[key].append(None)
    return data

# Pad all the similarity lists so they're the same length
padded_data = pad_dict_values(all_similarities_cross)

# Turn the dictionary into a DataFrame + save it as an Excel file
df = pd.DataFrame(padded_data)
df.to_excel("all_similarities_cross.xlsx", index=False)

print("Dict saved to Excel.")

dict saved to excel


In [36]:
# Performing an independent t-test to compare within-text similarity (WoN) 
# vs. between-text similarity (WoN vs Communist Manifesto).
# Assumes approximately normal distributions, uses Welch’s t-test (no equal variance assumption)
from scipy.stats import ttest_ind

# List of economic terms we're running tests on
target_words = [
    "produce", "trade", "labour", "land", "capital", "market",
    "manufacture", "industry", "work", "government", "private",
    "wage", "demand", "exchange", "subsistence"
]

print("T-Test Results (Within WoN vs. Cross-Corpus):\n")

# Loop through each word to compare distributions
for word in target_words:
    # Get rid of any None values from the lists (these come from padding or missing entries)
    clean_within = [x for x in similarities_dict_WoN.get(word, []) if x is not None]
    clean_between = [x for x in all_similarities_cross.get(word, []) if x is not None]

    # Only run the t-test if both samples have enough data
    if len(clean_within) >= 2 and len(clean_between) >= 2:
        # Welch’s t-test: safer when variances might not be equal
        t_stat, p_value = ttest_ind(clean_within, clean_between, equal_var=False)
        print(f"{word:>12} | t-stat: {t_stat:>8.4f} | p-value: {p_value:.8f}")
    else:
        print(f"{word:>12} | Insufficient data for t-test")

T-Test Results (Within WoN vs. Cross-Corpus):

     produce | t-stat: 154.1733 | p-value: 0.00000000
       trade | t-stat: 194.6312 | p-value: 0.00000000
      labour | t-stat: 150.7831 | p-value: 0.00000000
        land | t-stat:  68.2359 | p-value: 0.00000000
     capital | t-stat: 222.9732 | p-value: 0.00000000
      market | t-stat:  62.4178 | p-value: 0.00000000
 manufacture | t-stat:  35.1947 | p-value: 0.00000000
    industry | t-stat: 145.8185 | p-value: 0.00000000
        work | t-stat:  76.8559 | p-value: 0.00000000
  government | t-stat:  74.2075 | p-value: 0.00000000
     private | t-stat:  43.2182 | p-value: 0.00000000
        wage | t-stat:   0.0172 | p-value: 0.98781184
      demand | t-stat:  24.6202 | p-value: 0.00000000
    exchange | t-stat:  21.8449 | p-value: 0.00000000
 subsistence | t-stat:   6.2857 | p-value: 0.00000000


In [14]:
# Performing a Mann–Whitney U test to compare within-text similarity (WoN) vs. 
# between-text similarity (WoN vs Communist Manifesto)
# This is a non-parametric test—doesn't assume normal distribution or equal variances
import pandas as pd
from scipy.stats import mannwhitneyu

# Load the similarity data for within-text (WoN) and between-text (WoN vs CM)
df_within = pd.read_excel("similarities_dict_WoN.xlsx", sheet_name="Sheet1")
df_between = pd.read_excel("all_similarities_cross.xlsx", sheet_name="Sheet1")  # Update sheet name if needed

# Convert Excel data into dictionaries where each key is a word and value is a list of similarities
similarities_dict_WoN = df_within.to_dict(orient="list")
all_similarities_cross = df_between.to_dict(orient="list")

# Economic terms to analyze
target_words = [
    "produce", "trade", "labour", "land", "capital", "market",
    "manufacture", "industry", "work", "government", "private",
    "wage", "demand", "exchange", "subsistence"
]

print("Mann–Whitney U Test Results (Within WoN vs. Cross-Corpus):\n")

# Run the U-test for each word
for word in target_words:
    # Clean out any missing values (from Excel or padding)
    clean_within = [x for x in similarities_dict_WoN.get(word, []) if pd.notna(x)]
    clean_between = [x for x in all_similarities_cross.get(word, []) if pd.notna(x)]

    # Make sure we have enough data to run the test
    if len(clean_within) >= 2 and len(clean_between) >= 2:
        # Mann–Whitney U test (two-sided, compares rank distributions)
        u_stat, p_value = mannwhitneyu(clean_within, clean_between, alternative='two-sided')
        print(f"{word:>12} | U-stat: {u_stat:>8.2f} | p-value: {p_value:.8f}")
    else:
        print(f"{word:>12} | Insufficient data for U-test")

Mann–Whitney U Test Results (Within WoN vs. Cross-Corpus):

     produce | U-stat: 2627204890.00 | p-value: 0.00000000
       trade | U-stat: 7561147942.00 | p-value: 0.00000000
      labour | U-stat: 32212094685.00 | p-value: 0.00000000
        land | U-stat: 3141089493.50 | p-value: 0.00000000
     capital | U-stat: 6495454303.00 | p-value: 0.00000000
      market | U-stat: 716732611.00 | p-value: 0.00000000
 manufacture | U-stat: 9707907.00 | p-value: 0.00000000
    industry | U-stat: 2362048810.00 | p-value: 0.00000000
        work | U-stat: 1224402877.50 | p-value: 0.00000000
  government | U-stat: 511484700.00 | p-value: 0.00000000
     private | U-stat: 126710196.00 | p-value: 0.00000000
        wage | U-stat:    78.00 | p-value: 0.82308591
      demand | U-stat: 48983975.50 | p-value: 0.00000000
    exchange | U-stat: 44806872.50 | p-value: 0.00000000
 subsistence | U-stat: 20599472.50 | p-value: 0.00000000


In [15]:
# Performing an independent t-test to compare in-text similarities within the Communist Manifesto (CM)
# vs. between-text similarities (CM vs Wealth of Nations)
# Assumes normal distribution; using Welch’s version of t-test since it doesn't assume equal variance
import pandas as pd
from scipy.stats import ttest_ind

# Load the similarity data for CM internal pairs and CM vs WoN pairs
df_within = pd.read_excel("similarities_dict_CM.xlsx", sheet_name="Sheet1")
df_between = pd.read_excel("all_similarities_cross.xlsx", sheet_name="Sheet1")  # Change sheet name if needed

# Convert Excel data to dictionaries with structure: {word: [list of similarities]}
similarities_dict_CM = df_within.to_dict(orient="list")
all_similarities_cross = df_between.to_dict(orient="list")

# Economic terms to analyze
target_words = [
    "produce", "trade", "labour", "land", "capital", "market",
    "manufacture", "industry", "work", "government", "private",
    "wage", "demand", "exchange", "subsistence"
]

print("Independent t-Test Results (Within CM vs. Cross-Corpus):\n")

# Run the t-test for each target word
for word in target_words:
    # Filter out missing (NaN) values
    clean_within = [x for x in similarities_dict_CM.get(word, []) if pd.notna(x)]
    clean_between = [x for x in all_similarities_cross.get(word, []) if pd.notna(x)]

    # Make sure there’s enough data to run the test
    if len(clean_within) >= 2 and len(clean_between) >= 2:
        # Welch’s t-test (handles different variances between groups)
        t_stat, p_value = ttest_ind(clean_within, clean_between, equal_var=False)
        print(f"{word:>12} | t-stat: {t_stat:>8.4f} | p-value: {p_value:.8f}")
    else:
        print(f"{word:>12} | Insufficient data for t-test")

Independent t-Test Results (Within CM vs. Cross-Corpus):

     produce | t-stat:   3.7899 | p-value: 0.00114451
       trade | t-stat:   2.8193 | p-value: 0.00544596
      labour | t-stat:   9.7999 | p-value: 0.00000000
        land | t-stat:   3.0610 | p-value: 0.00238647
     capital | t-stat:  47.1627 | p-value: 0.00000000
      market | t-stat:   6.0964 | p-value: 0.00000023
 manufacture | t-stat:   7.4815 | p-value: 0.00000000
    industry | t-stat:  51.6814 | p-value: 0.00000000
        work | t-stat:  -5.0778 | p-value: 0.00000048
  government | t-stat:  17.4999 | p-value: 0.00000000
     private | t-stat:  41.2833 | p-value: 0.00000000
        wage | t-stat:   8.0916 | p-value: 0.00000000
      demand | t-stat:   1.0613 | p-value: 0.30098189
    exchange | t-stat:   6.7336 | p-value: 0.00000000
 subsistence | t-stat:  11.8584 | p-value: 0.00000000


In [16]:
# Performing a Mann–Whitney U test to compare in-text similarity within the Communist Manifesto (CM)
# vs. between-text similarity (CM vs Wealth of Nations)
# This is a non-parametric test, so it doesn't assume normal distribution or equal variances—good for skewed or noisy data

import pandas as pd
from scipy.stats import mannwhitneyu

# Load the similarity data from Excel
df_within = pd.read_excel("similarities_dict_CM.xlsx", sheet_name="Sheet1")
df_between = pd.read_excel("all_similarities_cross.xlsx", sheet_name="Sheet1")  # Update if sheet name is different

# Convert DataFrames to dictionaries: {word: [list of similarity scores]}
similarities_dict_CM = df_within.to_dict(orient="list")
all_similarities_cross = df_between.to_dict(orient="list")

# Economic terms we're analyzing
target_words = [
    "produce", "trade", "labour", "land", "capital", "market",
    "manufacture", "industry", "work", "government", "private",
    "wage", "demand", "exchange", "subsistence"
]

print("Mann–Whitney U Test Results (Within CM vs. Cross-Corpus):\n")

# Loop through each word and run the U-test
for word in target_words:
    # Remove any missing values (e.g., NaNs from Excel)
    clean_within = [x for x in similarities_dict_CM.get(word, []) if pd.notna(x)]
    clean_between = [x for x in all_similarities_cross.get(word, []) if pd.notna(x)]

    # Only run the test if both groups have enough samples
    if len(clean_within) >= 2 and len(clean_between) >= 2:
        # Run a two-sided Mann–Whitney U test (tests for difference in distributions)
        u_stat, p_value = mannwhitneyu(clean_within, clean_between, alternative='two-sided')
        print(f"{word:>12} | U-stat: {u_stat:>8.2f} | p-value: {p_value:.8f}")
    else:
        print(f"{word:>12} | Insufficient data for U-test")

Mann–Whitney U Test Results (Within CM vs. Cross-Corpus):

     produce | U-stat: 99607.00 | p-value: 0.00066500
       trade | U-stat: 1448453.00 | p-value: 0.11762789
      labour | U-stat: 99953712.50 | p-value: 0.00000000
        land | U-stat: 3531682.50 | p-value: 0.00000063
     capital | U-stat: 46578262.50 | p-value: 0.00000000
      market | U-stat: 196815.00 | p-value: 0.00000001
 manufacture | U-stat: 77505.00 | p-value: 0.00000000
    industry | U-stat: 108169241.50 | p-value: 0.00000000
        work | U-stat: 5389457.00 | p-value: 0.00000014
  government | U-stat: 8360798.50 | p-value: 0.00000000
     private | U-stat: 13250313.50 | p-value: 0.00000000
        wage | U-stat:  8016.00 | p-value: 0.00000000
      demand | U-stat: 21880.00 | p-value: 0.53528080
    exchange | U-stat: 277183.00 | p-value: 0.00000000
 subsistence | U-stat: 291989.50 | p-value: 0.00000000


In [None]:
# ------------------------------------------------------------------------------------------------------------------------------------------------------
#
#
#
#                                            Extension: Looking at the penultimate layer of BERT
#           Note: the code below is identical to the code above, the only alteration is that the hidden layers are extracted and the last
#                 one selected to extract the embeddings from.
#
#
# ------------------------------------------------------------------------------------------------------------------------------------------------------

In [20]:
# Pairwise in-text cosine similarity of BERT contextual embeddings between all instances of each economic term in 
#   Wealth of Nations (Before-last layer)
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
import json  # Optional: for exporting results

# Load model and tokenizer
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
model.eval()

# Load text corpus
with open('A_wealth_of_Nations_Cleaned.txt', 'r') as file:
    corpus = file.read()

chunks = corpus.split("\n\n")  # Split by paragraphs
context_window = 5
max_len = 512
stride = 10  # Overlap between windows

target_words = [
    "produce", "trade", "labour", "land", "capital", "market", "manufacture", "industry", "work",
    "government", "private", "wage", "demand", "exchange", "subsistence"
]

# Dictionary to store similarities per word
similarities_dict_WoN = {}

def process_text_window(text, chunk_id, subchunk_id, target_word, occurrences):
    text = text.lower()  # Normalize to lowercase
    inputs = bert_tokenizer(text, return_tensors="pt", truncation=True, max_length=max_len)
    tokens = bert_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.hidden_states[-2][0]
    
    word_indices = [i for i, token in enumerate(tokens) if token.lower() == target_word.lower()]
    
    for idx in word_indices:
        emb = embeddings[idx]
        start = max(idx - context_window, 0)
        end = min(idx + context_window + 1, len(tokens))
        context_tokens = tokens[start:end]
        context_text = bert_tokenizer.convert_tokens_to_string(context_tokens)
        occurrences.append({
            "chunk_id": chunk_id,
            "subchunk_id": subchunk_id,
            "token_idx": idx,
            "embedding": emb,
            "context": context_text
        })

# Function to process one target word
def process_target_word(target_word):
    occurrences = []
    similarities = []

    # Go through each paragraph chunk
    for chunk_id, chunk in enumerate(chunks):
        tokens = bert_tokenizer.tokenize(chunk)
        if len(tokens) <= max_len:
            text = bert_tokenizer.convert_tokens_to_string(tokens)
            process_text_window(text, chunk_id, subchunk_id=0, target_word=target_word, occurrences=occurrences)
        else:
            for i in range(0, len(tokens), max_len - stride):
                sub_tokens = tokens[i:i+max_len]
                text = bert_tokenizer.convert_tokens_to_string(sub_tokens)
                process_text_window(text, chunk_id, subchunk_id=i // (max_len - stride), target_word=target_word, occurrences=occurrences)

    if len(occurrences) < 2:
        print(f"Less than two total occurrences of '{target_word}' found.")
        similarities_dict_WoN[target_word] = []
        return

    output_lines = []
    total_similarity = 0.0
    count = 0

    output_lines.append(f"Total '{target_word}' occurrences found: {len(occurrences)}\n")

    # Compute similarities
    for i in range(len(occurrences)):
        for j in range(i + 1, len(occurrences)):
            emb1 = occurrences[i]["embedding"]
            emb2 = occurrences[j]["embedding"]
            similarity = F.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()
            similarities.append(similarity)

            total_similarity += similarity
            count += 1

            output_lines.append(f"Pair {count}:")
            output_lines.append(f" - Chunk IDs: {occurrences[i]['chunk_id']} & {occurrences[j]['chunk_id']}")
            output_lines.append(f" - Subchunk IDs: {occurrences[i]['subchunk_id']} & {occurrences[j]['subchunk_id']}")
            output_lines.append(f" - Token positions: {occurrences[i]['token_idx']} & {occurrences[j]['token_idx']}")
            output_lines.append(f" - Context 1: \"{occurrences[i]['context']}\"")
            output_lines.append(f" - Context 2: \"{occurrences[j]['context']}\"")
            output_lines.append(f" - Cosine Similarity: {similarity:.4f}\n")

    average_similarity = total_similarity / count
    output_lines.append(f"\nAverage Cosine Similarity across all pairs for '{target_word}': {average_similarity:.4f}")

    # Save output to file
    filename = f"output_WoN_pairwise_similarity_BLLayer_{target_word}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(output_lines))

    print(f"Results saved to {filename}")
    similarities_dict_WoN[target_word] = similarities

# Process each target word
for word in target_words:
    process_target_word(word)

Results saved to output_WoN_pairwise_similarity_BLLayer_produce.txt
Results saved to output_WoN_pairwise_similarity_BLLayer_trade.txt
Results saved to output_WoN_pairwise_similarity_BLLayer_labour.txt
Results saved to output_WoN_pairwise_similarity_BLLayer_land.txt
Results saved to output_WoN_pairwise_similarity_BLLayer_capital.txt
Results saved to output_WoN_pairwise_similarity_BLLayer_market.txt
Results saved to output_WoN_pairwise_similarity_BLLayer_manufacture.txt
Results saved to output_WoN_pairwise_similarity_BLLayer_industry.txt
Results saved to output_WoN_pairwise_similarity_BLLayer_work.txt
Results saved to output_WoN_pairwise_similarity_BLLayer_government.txt
Results saved to output_WoN_pairwise_similarity_BLLayer_private.txt
Results saved to output_WoN_pairwise_similarity_BLLayer_wage.txt
Results saved to output_WoN_pairwise_similarity_BLLayer_demand.txt
Results saved to output_WoN_pairwise_similarity_BLLayer_exchange.txt
Results saved to output_WoN_pairwise_similarity_BLLay

In [21]:
# Saving the similarities of contextual embeddings for each word to an excel file (Before last BERT layer)
import pandas as pd

def pad_dict_values(data):
    max_len = max(len(v) for v in data.values())
    for key in data:
        while len(data[key]) < max_len:
            data[key].append(None)
    return data

# Pad the lists
padded_data = pad_dict_values(similarities_dict_WoN)

# Convert to DataFrame and save to Excel
df = pd.DataFrame(padded_data)
df.to_excel("similarities_dict_WoN_BLLayer.xlsx", index=False)

print("dict saved to excel")

dict saved to excel


In [22]:
# Pairwise in-text cosine similarity of embeddings between instances of each economic term in Communist Manifesto 
#     (Before last BERT layer)
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel

# Load model and tokenizer
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
model.eval()

# Load text corpus
with open('Communist Manifesto.txt', 'r', encoding='utf-8') as file:
    corpus = file.read()

chunks = corpus.split("\n\n")  # Split by paragraphs
context_window = 5
max_len = 512
stride = 10  # Overlap between windows

target_words = ["produce", "trade", "labour", "land", "capital", "market", "manufacture", "industry", "work", "government", "private",
                "wage", "demand", "exchange", "subsistence"]  # List of words to analyze

# Dictionary to store similarities for each word
similarities_dict_CM = {}

def process_text_window(text, chunk_id, subchunk_id, target_word, occurrences):
    text = text.lower()  # make text lowercase to prevent case-sensitivity
    inputs = bert_tokenizer(text, return_tensors="pt", truncation=True, max_length=max_len)
    tokens = bert_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.hidden_states[-2][0]
    
    word_indices = [i for i, token in enumerate(tokens) if token.lower() == target_word.lower()]
    
    for idx in word_indices:
        emb = embeddings[idx]
        start = max(idx - context_window, 0)
        end = min(idx + context_window + 1, len(tokens))
        context_tokens = tokens[start:end]
        context_text = bert_tokenizer.convert_tokens_to_string(context_tokens)
        occurrences.append({
            "chunk_id": chunk_id,
            "subchunk_id": subchunk_id,
            "token_idx": idx,
            "embedding": emb,
            "context": context_text
        })

# Function to process one target word
def process_target_word(target_word):
    occurrences = []
    similarities = []  # List to hold cosine similarities for this word

    # Go through each paragraph chunk
    for chunk_id, chunk in enumerate(chunks):
        tokens = bert_tokenizer.tokenize(chunk)
        if len(tokens) <= max_len:
            text = bert_tokenizer.convert_tokens_to_string(tokens)
            process_text_window(text, chunk_id, subchunk_id=0, target_word=target_word, occurrences=occurrences)
        else:
            for i in range(0, len(tokens), max_len - stride):
                sub_tokens = tokens[i:i+max_len]
                text = bert_tokenizer.convert_tokens_to_string(sub_tokens)
                process_text_window(text, chunk_id, subchunk_id=i // (max_len - stride), target_word=target_word, occurrences=occurrences)

    if len(occurrences) < 2:
        print(f"Less than two total occurrences of '{target_word}' found.")
        similarities_dict_CM[target_word] = []
        return

    output_lines = []
    total_similarity = 0.0
    count = 0

    output_lines.append(f"Total '{target_word}' occurrences found: {len(occurrences)}\n")

    # Compute similarities
    for i in range(len(occurrences)):
        for j in range(i + 1, len(occurrences)):
            emb1 = occurrences[i]["embedding"]
            emb2 = occurrences[j]["embedding"]
            similarity = F.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()
            similarities.append(similarity)

            total_similarity += similarity
            count += 1

            output_lines.append(f"Pair {count}:")
            output_lines.append(f" - Chunk IDs: {occurrences[i]['chunk_id']} & {occurrences[j]['chunk_id']}")
            output_lines.append(f" - Subchunk IDs: {occurrences[i]['subchunk_id']} & {occurrences[j]['subchunk_id']}")
            output_lines.append(f" - Token positions: {occurrences[i]['token_idx']} & {occurrences[j]['token_idx']}")
            output_lines.append(f" - Context 1: \"{occurrences[i]['context']}\"")
            output_lines.append(f" - Context 2: \"{occurrences[j]['context']}\"")
            output_lines.append(f" - Cosine Similarity: {similarity:.4f}\n")

    average_similarity = total_similarity / count
    output_lines.append(f"\nAverage Cosine Similarity across all pairs for '{target_word}': {average_similarity:.4f}")

    # Save to file
    filename = f"output_CM_pairwise_similarity_BLLayer_{target_word}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(output_lines))

    print(f"Results saved to {filename}")
    similarities_dict_CM[target_word] = similarities  # Store similarities for this word

# Loop through all target words
for word in target_words:
    process_target_word(word)

Results saved to output_CM_pairwise_similarity_BLLayer_produce.txt
Results saved to output_CM_pairwise_similarity_BLLayer_trade.txt
Results saved to output_CM_pairwise_similarity_BLLayer_labour.txt
Results saved to output_CM_pairwise_similarity_BLLayer_land.txt
Results saved to output_CM_pairwise_similarity_BLLayer_capital.txt
Results saved to output_CM_pairwise_similarity_BLLayer_market.txt
Results saved to output_CM_pairwise_similarity_BLLayer_manufacture.txt
Results saved to output_CM_pairwise_similarity_BLLayer_industry.txt
Results saved to output_CM_pairwise_similarity_BLLayer_work.txt
Results saved to output_CM_pairwise_similarity_BLLayer_government.txt
Results saved to output_CM_pairwise_similarity_BLLayer_private.txt
Results saved to output_CM_pairwise_similarity_BLLayer_wage.txt
Results saved to output_CM_pairwise_similarity_BLLayer_demand.txt
Results saved to output_CM_pairwise_similarity_BLLayer_exchange.txt
Results saved to output_CM_pairwise_similarity_BLLayer_subsistence.

In [23]:
# Saving the similarities of contextual embeddings for each word to an excel file (Before last layer)
import pandas as pd

def pad_dict_values(data):
    max_len = max(len(v) for v in data.values())
    for key in data:
        while len(data[key]) < max_len:
            data[key].append(None)
    return data

# Pad the lists
padded_data = pad_dict_values(similarities_dict_CM)

# Convert to DataFrame and save to Excel
df = pd.DataFrame(padded_data)
df.to_excel("similarities_dict_CM_BLLayer.xlsx", index=False)

print("dict saved to excel")

dict saved to excel


In [24]:
# Pairwise between-text cosine similarity of embeddings between instances of each economic term in both texts (Before last layer)
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel

# Load model and tokenizer
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
model.eval()

# Load two separate texts
with open('A_wealth_of_Nations_Cleaned.txt', 'r') as file1, open('Communist Manifesto.txt', 'r', encoding='utf-8') as file2:
    text1 = file1.read()
    text2 = file2.read()

texts = [text1, text2]
target_words = ["produce", "trade", "labour", "land", "capital", "market", "manufacture", "industry", "work", "government", "private",
                "wage", "demand", "exchange", "subsistence"]
context_window = 5
max_len = 512
stride = 10

# Dictionary to store all cosine similarities for each word
all_similarities_cross = {}

def process_text(text, text_index, target_word, occurrences):
    chunks = text.split("\n\n")
    for chunk_id, chunk in enumerate(chunks):
        tokens = bert_tokenizer.tokenize(chunk)
        if len(tokens) <= max_len:
            input_text = bert_tokenizer.convert_tokens_to_string(tokens)
            process_text_window(input_text, text_index, chunk_id, subchunk_id=0, target_word=target_word, occurrences=occurrences)
        else:
            for i in range(0, len(tokens), max_len - stride):
                sub_tokens = tokens[i:i+max_len]
                input_text = bert_tokenizer.convert_tokens_to_string(sub_tokens)
                process_text_window(input_text, text_index, chunk_id, subchunk_id=i // (max_len - stride), target_word=target_word, occurrences=occurrences)

def process_text_window(text, text_index, chunk_id, subchunk_id, target_word, occurrences):
    inputs = bert_tokenizer(text, return_tensors="pt", truncation=True, max_length=max_len)
    tokens = bert_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.hidden_states[-2][0]

    word_indices = [i for i, token in enumerate(tokens) if token.lower() == target_word.lower()]
    
    for idx in word_indices:
        emb = embeddings[idx]
        start = max(idx - context_window, 0)
        end = min(idx + context_window + 1, len(tokens))
        context_tokens = tokens[start:end]
        context_text = bert_tokenizer.convert_tokens_to_string(context_tokens)
        occurrences[text_index].append({
            "chunk_id": chunk_id,
            "subchunk_id": subchunk_id,
            "token_idx": idx,
            "embedding": emb,
            "context": context_text
        })

def cosine_similarity(e1, e2):
    return F.cosine_similarity(e1.unsqueeze(0), e2.unsqueeze(0)).item()

# Loop through each target word
for target_word in target_words:
    print(f"\nProcessing word: '{target_word}'")
    occurrences = [[], []]  # Reset for each word
    similarities = []  # Store similarities for current word

    process_text(text1, 0, target_word, occurrences)
    process_text(text2, 1, target_word, occurrences)

    output_filename = f"output_pairwise_cross_BLLayer_{target_word}.txt"

    if len(occurrences[0]) == 0 or len(occurrences[1]) == 0:
        print(f"No matching '{target_word}' occurrences in both texts.")
        with open(output_filename, "w", encoding="utf-8") as f:
            f.write(f"No matching '{target_word}' occurrences in both texts.\n")
        all_similarities_cross[target_word] = []
    else:
        total_similarity = 0.0
        count = 0
        with open(output_filename, "w", encoding="utf-8") as f:
            for i, occ1 in enumerate(occurrences[0]):
                for j, occ2 in enumerate(occurrences[1]):
                    similarity = cosine_similarity(occ1["embedding"], occ2["embedding"])
                    similarities.append(similarity)
                    total_similarity += similarity
                    count += 1

                    f.write(f"\nPair {count}:\n")
                    f.write(f" - Text1 Chunk ID: {occ1['chunk_id']}, Subchunk ID: {occ1['subchunk_id']}\n")
                    f.write(f" - Text2 Chunk ID: {occ2['chunk_id']}, Subchunk ID: {occ2['subchunk_id']}\n")
                    f.write(f" - Context 1: \"{occ1['context']}\"\n")
                    f.write(f" - Context 2: \"{occ2['context']}\"\n")
                    f.write(f" - Cosine Similarity: {similarity:.4f}\n")
            
            average_similarity = total_similarity / count if count > 0 else 0.0
            f.write(f"\n\nAverage Cross-Text Cosine Similarity for '{target_word}': {average_similarity:.4f}\n")

        print(f"Finished processing '{target_word}'. Saved to: {output_filename}")
        all_similarities_cross[target_word] = similarities


Processing word: 'produce'
Finished processing 'produce'. Saved to: output_pairwise_cross_BLLayer_produce.txt

Processing word: 'trade'
Finished processing 'trade'. Saved to: output_pairwise_cross_BLLayer_trade.txt

Processing word: 'labour'
Finished processing 'labour'. Saved to: output_pairwise_cross_BLLayer_labour.txt

Processing word: 'land'
Finished processing 'land'. Saved to: output_pairwise_cross_BLLayer_land.txt

Processing word: 'capital'
Finished processing 'capital'. Saved to: output_pairwise_cross_BLLayer_capital.txt

Processing word: 'market'
Finished processing 'market'. Saved to: output_pairwise_cross_BLLayer_market.txt

Processing word: 'manufacture'
Finished processing 'manufacture'. Saved to: output_pairwise_cross_BLLayer_manufacture.txt

Processing word: 'industry'
Finished processing 'industry'. Saved to: output_pairwise_cross_BLLayer_industry.txt

Processing word: 'work'
Finished processing 'work'. Saved to: output_pairwise_cross_BLLayer_work.txt

Processing word

In [25]:
# Saving between-text semantic similarities to an excel file (Before last layer)
import pandas as pd

def pad_dict_values(data):
    max_len = max(len(v) for v in data.values())
    for key in data:
        while len(data[key]) < max_len:
            data[key].append(None)
    return data

# Pad the lists
padded_data = pad_dict_values(all_similarities_cross)

# Convert to DataFrame and save to Excel
df = pd.DataFrame(padded_data)
df.to_excel("all_similarities_cross_BLLayer.xlsx", index=False)

print("dict saved to excel")

dict saved to excel


In [26]:
# Performing an Independent t-test (assumes normality and equal variances) of WoN in-text vs Between-text (with CM):
#   (Before last layer)
from scipy.stats import ttest_ind

# List of target words to analyze
target_words = [
    "produce", "trade", "labour", "land", "capital", "market",
    "manufacture", "industry", "work", "government", "private",
    "wage", "demand", "exchange", "subsistence"
]

print("T-Test Results (Within WoN vs. Cross-Corpus):\n")

for word in target_words:
    # Clean the similarity lists of None items
    clean_within = [x for x in similarities_dict_WoN.get(word, []) if x is not None]
    clean_between = [x for x in all_similarities_cross.get(word, []) if x is not None]

    if len(clean_within) >= 2 and len(clean_between) >= 2:
        t_stat, p_value = ttest_ind(clean_within, clean_between, equal_var=False)  # Welch’s t-test
        print(f"{word:>12} | t-stat: {t_stat:>8.4f} | p-value: {p_value:.8f}")
    else:
        print(f"{word:>12} | Insufficient data for t-test")

T-Test Results (Within WoN vs. Cross-Corpus):

     produce | t-stat: 132.9392 | p-value: 0.00000000
       trade | t-stat: 142.4930 | p-value: 0.00000000
      labour | t-stat: 111.8835 | p-value: 0.00000000
        land | t-stat:  43.0238 | p-value: 0.00000000
     capital | t-stat: 117.5366 | p-value: 0.00000000
      market | t-stat:  33.7599 | p-value: 0.00000000
 manufacture | t-stat:  24.1471 | p-value: 0.00000000
    industry | t-stat:  97.6770 | p-value: 0.00000000
        work | t-stat:  56.5666 | p-value: 0.00000000
  government | t-stat:  61.1348 | p-value: 0.00000000
     private | t-stat:  44.7714 | p-value: 0.00000000
        wage | t-stat:   0.0155 | p-value: 0.98903855
      demand | t-stat:  17.1447 | p-value: 0.00000000
    exchange | t-stat:   3.8294 | p-value: 0.00013001
 subsistence | t-stat:   3.7976 | p-value: 0.00014811


In [27]:
# Performing a Mann–Whitney U test (non-parametric) of WoN in-text vs Between-text (with CM):
#   (Before last layer)
from scipy.stats import mannwhitneyu

# List of target words to analyze
target_words = [
    "produce", "trade", "labour", "land", "capital", "market",
    "manufacture", "industry", "work", "government", "private",
    "wage", "demand", "exchange", "subsistence"
]

print("Mann–Whitney U Test Results (Within WoN vs. Cross-Corpus):\n")

for word in target_words:
    # Clean the similarity lists of None items
    clean_within = [x for x in similarities_dict_WoN.get(word, []) if x is not None]
    clean_between = [x for x in all_similarities_cross.get(word, []) if x is not None]

    if len(clean_within) >= 2 and len(clean_between) >= 2:
        u_stat, p_value = mannwhitneyu(clean_within, clean_between, alternative='two-sided')
        print(f"{word:>12} | U-stat: {u_stat:>8.2f} | p-value: {p_value:.8f}")
    else:
        print(f"{word:>12} | Insufficient data for U-test")


Mann–Whitney U Test Results (Within WoN vs. Cross-Corpus):

     produce | U-stat: 2562424121.00 | p-value: 0.00000000
       trade | U-stat: 6888655491.00 | p-value: 0.00000000
      labour | U-stat: 29889434032.00 | p-value: 0.00000000
        land | U-stat: 2846635268.50 | p-value: 0.00000000
     capital | U-stat: 5459039826.50 | p-value: 0.00000000
      market | U-stat: 608345930.00 | p-value: 0.00000000
 manufacture | U-stat: 8667709.50 | p-value: 0.00000000
    industry | U-stat: 2103772711.00 | p-value: 0.00000000
        work | U-stat: 1131213585.50 | p-value: 0.00000000
  government | U-stat: 485735819.50 | p-value: 0.00000000
     private | U-stat: 127041938.50 | p-value: 0.00000000
        wage | U-stat:    93.00 | p-value: 0.82308591
      demand | U-stat: 45438448.50 | p-value: 0.00000000
    exchange | U-stat: 36900535.50 | p-value: 0.21272192
 subsistence | U-stat: 19370140.50 | p-value: 0.00000105


In [28]:
# Performing an Independent t-test (assumes normality and equal variances) of CM in-text vs Between-text (with WoN):
#   (Before last layer)
from scipy.stats import ttest_ind

# List of target words to analyze
target_words = [
    "produce", "trade", "labour", "land", "capital", "market",
    "manufacture", "industry", "work", "government", "private",
    "wage", "demand", "exchange", "subsistence"
]

print("T-Test Results (Within CM vs. Cross-Corpus):\n")

for word in target_words:
    # Clean the similarity lists of None items
    clean_within = [x for x in similarities_dict_CM.get(word, []) if x is not None]
    clean_between = [x for x in all_similarities_cross.get(word, []) if x is not None]

    if len(clean_within) >= 2 and len(clean_between) >= 2:
        t_stat, p_value = ttest_ind(clean_within, clean_between, equal_var=False)  # Welch’s t-test
        print(f"{word:>12} | t-stat: {t_stat:>8.4f} | p-value: {p_value:.8f}")
    else:
        print(f"{word:>12} | Insufficient data for t-test")

T-Test Results (Within CM vs. Cross-Corpus):

     produce | t-stat:   4.4446 | p-value: 0.00024689
       trade | t-stat:   4.2951 | p-value: 0.00003078
      labour | t-stat:   9.7747 | p-value: 0.00000000
        land | t-stat:   2.9383 | p-value: 0.00353179
     capital | t-stat:  45.6676 | p-value: 0.00000000
      market | t-stat:   6.4501 | p-value: 0.00000007
 manufacture | t-stat:   7.0934 | p-value: 0.00000000
    industry | t-stat:  42.1114 | p-value: 0.00000000
        work | t-stat:  -3.7112 | p-value: 0.00022136
  government | t-stat:  13.0559 | p-value: 0.00000000
     private | t-stat:  34.3947 | p-value: 0.00000000
        wage | t-stat:   6.2555 | p-value: 0.00000003
      demand | t-stat:   2.0151 | p-value: 0.05721642
    exchange | t-stat:   6.7124 | p-value: 0.00000000
 subsistence | t-stat:  11.3956 | p-value: 0.00000000


In [29]:
# Performing a Mann–Whitney U test (non-parametric) of CM in-text vs Between-text (with WoN):
#   (Before last layer)
from scipy.stats import mannwhitneyu

# List of target words to analyze
target_words = [
    "produce", "trade", "labour", "land", "capital", "market",
    "manufacture", "industry", "work", "government", "private",
    "wage", "demand", "exchange", "subsistence"
]

print("Mann–Whitney U Test Results (Within CM vs. Cross-Corpus):\n")

for word in target_words:
    # Clean the similarity lists of None items
    clean_within = [x for x in similarities_dict_CM.get(word, []) if x is not None]
    clean_between = [x for x in all_similarities_cross.get(word, []) if x is not None]

    if len(clean_within) >= 2 and len(clean_between) >= 2:
        u_stat, p_value = mannwhitneyu(clean_within, clean_between, alternative='two-sided')
        print(f"{word:>12} | U-stat: {u_stat:>8.2f} | p-value: {p_value:.8f}")
    else:
        print(f"{word:>12} | Insufficient data for U-test")


Mann–Whitney U Test Results (Within CM vs. Cross-Corpus):

     produce | U-stat: 104932.00 | p-value: 0.00006092
       trade | U-stat: 1542919.00 | p-value: 0.00222772
      labour | U-stat: 99281825.00 | p-value: 0.00000000
        land | U-stat: 3532333.50 | p-value: 0.00000060
     capital | U-stat: 45494096.50 | p-value: 0.00000000
      market | U-stat: 197960.00 | p-value: 0.00000001
 manufacture | U-stat: 74725.00 | p-value: 0.00000000
    industry | U-stat: 102298326.50 | p-value: 0.00000000
        work | U-stat: 5557546.00 | p-value: 0.00005545
  government | U-stat: 7894068.50 | p-value: 0.00000000
     private | U-stat: 12703990.00 | p-value: 0.00000000
        wage | U-stat:  7201.00 | p-value: 0.00000007
      demand | U-stat: 23923.00 | p-value: 0.15712363
    exchange | U-stat: 278702.00 | p-value: 0.00000000
 subsistence | U-stat: 287709.50 | p-value: 0.00000000
