In [12]:
import json
from collections import Counter

# Read the annotated utterances from disk.
with open('data/filtered_utterances_ft_data.json', 'r') as file:
    data = json.load(file)

# term -> Counter of the labels that term received across all entries.
term_counts = {}
for item in data:
    for term, value in item['matched_terms'].items():
        # setdefault creates the per-term Counter the first time a term is seen.
        term_counts.setdefault(term, Counter())[value] += 1

# Report only the terms that were tagged with more than one distinct label.
for term, counts in term_counts.items():
    if len(counts) <= 1:
        continue
    print(f"Term: {term}")
    for value, count in counts.items():
        print(f"  {value}: {count}")

Term: know
  hedge: 2
  none: 2
Term: believe
  hedge: 1
  none: 1
Term: sure
  none: 1
  hedge: 1
Term: tend
  hedge: 1
  none: 2


In [39]:
import json

# Load the annotated utterances.
with open('data/filtered_utterances_ft_data.json', 'r') as file:
    data = json.load(file)

# Term under study. Every step below is driven by this single value, so
# changing it here is enough to analyse a different term.
word = "know"

# Keep only the entries whose 'matched_terms' has `word` as a key.
filtered_data = [item for item in data if word in item['matched_terms']]

# Build one (text, category, word) triple per matching entry.
statements = []
for item in filtered_data:
    # Previous turn and current statement joined into one context string.
    string = item["previous_statement"] + " statement: " + item["statement"]
    # Look the label up under `word` rather than a hard-coded "know", so the
    # category always belongs to the term that was filtered on.
    category = item["matched_terms"][word]
    # Strip the '<' / '>' annotation markers from the text.
    string = string.replace("<", "").replace(">", "")
    statements.append((string, category, word))

In [17]:
from transformers import BertModel, BertTokenizer

# Checkpoint shared by the tokenizer and the encoder.
model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# One annotated utterance (markers included) used as a smoke-test input.
sample_text = """ "How do you suppose the royals even feel about this?",
        "statement": "WELL, one THING ABOUT -- WELL, I mean, William and Harry are OBVIOUSLY our main concern. The NLY NG William has really said   this is that when he reads  his mother, it's very rarely the person he actually knew. CERTAINLY the person who wrote this article is Tina Brown, who has made something of a career making a career out of claiming to be one of Diana's closest friends. Reading the article, I don't <<THINK>> Tina Brown knew Diana at ALL>. There's a hell of a lot ABOUT it that's just WRONG>. The <KIND OF> political aspects of it where Diana WOULD be politically, COMPLETELY WRONG>. It's somebody I don't <<THINK>> knew Diana at ALL>." """

# Tokenize into PyTorch tensors and run a single forward pass.
encoded_input = tokenizer(sample_text, return_tensors="pt")
output = model(**encoded_input)

# Debug aid, left disabled:
# print(output[0][:10])

In [43]:
from transformers import BertModel, BertTokenizer
import torch
from transformers import BertModel, BertTokenizer
from scipy.spatial.distance import cosine

# Pretrained BERT (base, uncased): tokenizer and encoder come from the same checkpoint.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Sample text
# sample_texts = [""" "How do you suppose the royals even feel about this?",
#         "statement": "WELL, one THING ABOUT -- WELL, I mean, William and Harry are OBVIOUSLY our main concern. The ONLY THING William has really said about this is that when he reads about his mother, it's very rarely the person he actually knew. CERTAINLY the person who wrote this article is Tina Brown, who has made something of a career making a career out of claiming to be one of Diana's closest friends. Reading the article, I don't THINK Tina Brown knew Diana at ALL. There's a hell of a lot about it that's just WRONG. The KIND OF political aspects of it where Diana WOULD be politically, COMPLETELY WRONG. It's somebody I don't THINK knew Diana at ALL." """,        
#             """ "The reactions show that we have not come back to full local balance because when you're talking about the sustainable growth in all world regions, and I would also mean that any announcements of central bankers should not cause this dramatic fluctuations and should not actually cause those price changes. So Italy tells you how much that global economy depends on the action of central banks. And otherwise a calm and sustainable situation that obviously would not happen to that extent. So that shouldn't stop us to return back to the normal and sustainable path of development.",
#         "statement": "So, Richard, very <CLEAR> here that both Chancellor Merkel and President Putin support this move of trying to start making that transition out from the very generous money. He was speaking <ABOUT> Greece today and the selloff that we had. I posed a <QUESTION> to Ms. Merkel as <WELL> <ABOUT> whether it's time to <CHANGE> the policy of austerity and start to readjust here so we can tackle that youth unemployment in the southern half of Europe. She said very candidly we're not competitive enough just yet. The readjustment is not finished yet. And although it is extremely painful, we <SHOULD> not finish the job until it is <COMPLETELY> done which she suggests you <<COULD>> kick down, the can down the road for another few years but <WOULD> boomerang on the European Union, and she didn't <<<THINK>>> that was the correct strategy, Richard." """
#         ]
# Build one (embedding, category, word) triple per statement.
# `statements` is expected to hold (text, category, word) triples produced by
# an earlier cell. The embedding slot is None when the target token does not
# occur in the tokenized text.
vectors = []
for sample_text, classification, word in statements:
    # Encode the text
    encoded_input = tokenizer(sample_text, return_tensors='pt')

    # Find every position whose token equals the target word.
    # NOTE(review): this is an exact match against single tokens, so it assumes
    # `word` is one lowercase vocabulary entry of the uncased tokenizer; a word
    # split into several pieces would never match - confirm before using
    # other terms.
    input_ids = encoded_input['input_ids'][0]
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    token_indices = [i for i, token in enumerate(tokens) if token == word]

    if not token_indices:
        # Keep the category and word so the pairwise report can still name them,
        # and skip the forward pass since there is nothing to extract.
        vectors.append((None, classification, word))
        continue

    # Inference only: no autograd graph is needed, so the result can be
    # converted to NumPy directly later on.
    with torch.no_grad():
        output = model(**encoded_input)

    # Hidden states of the matching positions in the single batch entry,
    # averaged when the word occurs more than once.
    token_embeddings = output.last_hidden_state[0, token_indices, :]
    vectors.append((token_embeddings.mean(dim=0), classification, word))

# Compute cosine similarity between all pairs of vectors
# (the nested ranges are empty when there are fewer than two entries).
for i in range(len(vectors)):
    for j in range(i + 1, len(vectors)):
        embedding_i, cat1, word = vectors[i]
        embedding_j, cat2, _ = vectors[j]
        # Test the embedding slot itself: each entry is always a tuple, so
        # comparing the whole entry to None can never detect a missing token.
        if embedding_i is None or embedding_j is None:
            print(f"One or both of the texts {i+1} and {j+1} do not contain the token '{word}'.")
            continue
        # scipy's `cosine` is a distance; 1 - distance gives the similarity.
        similarity = 1 - cosine(embedding_i.numpy(), embedding_j.numpy())
        print(f"{word} {i+1},{j+1}: {cat1}/{cat2} Cosine similarity: {similarity:.4f}")

know 1,2: hedge/hedge Cosine similarity: 0.8926
know 1,3: hedge/none Cosine similarity: 0.7740
know 1,4: hedge/none Cosine similarity: 0.3465
know 2,3: hedge/none Cosine similarity: 0.8332
know 2,4: hedge/none Cosine similarity: 0.4173
know 3,4: none/none Cosine similarity: 0.3577
