In [10]:
import pandas as pd
from dotenv import load_dotenv
import os

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# API key read from the environment; os.getenv returns None when the variable is unset.
# It is passed to pinecone.init() in a later cell.
API_KEY = os.getenv("API_KEY")

In [None]:
import pymongo
from pymongo import MongoClient

# MongoDB connection URI read from the environment (relies on `os` and load_dotenv()
# from the first cell having been run).
# NOTE(review): os.getenv returns None when the variable is unset -- confirm what
# MongoClient(None) connects to before relying on this.
CONNECTION_STRING = os.getenv("CONNECTION_STRING")

# Initialize the MongoDB client
client = MongoClient(CONNECTION_STRING)

# Connect to the database and collection
# `collection` is the target that insert_embedding_to_mongo() writes documents to.
db = client['embedding_database']
collection = db['bert_embeddings']

In [2]:
# Checks that the file is being read correctly by printing its first 5 lines, including the header line.

def peek_file(file_path, num_lines=5):
    """Print up to the first ``num_lines`` lines of a UTF-8 text file.

    Each line is printed with leading and trailing whitespace stripped.
    Printing stops at end of file, so a file shorter than ``num_lines``
    no longer produces trailing empty lines.

    Args:
        file_path: Path of the text file to preview.
        num_lines: Maximum number of lines to print (default 5).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        for _ in range(num_lines):
            line = file.readline()
            # readline() returns '' only at end of file; a blank line is '\n'.
            if not line:
                break
            print(line.strip())

# Sanity check: print the first lines of the TempoWordNet lexicon file
# (path is relative to the notebook's working directory).
print("File contents:")
peek_file('./TempoWordNet/TempoWnL_1.0.txt')

File contents:
#ID	Synset_name	POS	Synset_gloss  Prob_of_being_Past	Prob_of_being_Present	Prob_of_being_Future	Prob_of_being_Atemporal
1740	able.a.01	a	(usually followed by `to') having the necessary means or skill or know-how or authority to do something	0.0	0.002	0.998	0
2098	unable.a.01	a	(usually followed by `to') not having the necessary means or skill or know-how	0.0	0.001798	0.897202	0.101
2312	abaxial.a.01	a	facing away from the axis of an organ or organism	0.004	0.345	0.651	0
2527	adaxial.a.01	a	nearest to or facing toward the axis of an organ or organism	0.002	0.0	0.998	0


In [3]:
# Checks if the probabilities are being extracted properly

def read_tempowordnet_manually(file_path):
    """Parse a tab-separated TempoWordNet lexicon file into a DataFrame.

    Lines starting with '#' are skipped. Every other line is stripped and
    split on tab characters; lines with fewer than 8 fields are ignored.
    The first three fields are ID, synset name and POS, the last four are
    the temporal probabilities, and anything in between is joined with a
    single space to form the gloss. All values are kept as strings.

    Args:
        file_path: Path of the lexicon file.

    Returns:
        pandas.DataFrame with one row per accepted line.
    """
    column_names = (
        "ID",
        "Synset_name",
        "POS",
        "Synset_gloss",
        "Prob_of_being_Past",
        "Prob_of_being_Present",
        "Prob_of_being_Future",
        "Prob_of_being_Atemporal",
    )
    records = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # Header / comment lines carry no data
            if raw_line.startswith('#'):
                continue

            fields = raw_line.strip().split('\t')

            # A usable record needs ID, name, POS, gloss and four probabilities
            if len(fields) < 8:
                continue

            leading = fields[:3]
            gloss = ' '.join(fields[3:-4])
            probabilities = fields[-4:]

            values = [*leading, gloss, *probabilities]
            records.append(dict(zip(column_names, values)))

    return pd.DataFrame(records)

# Parse the lexicon into a DataFrame; at this point every column still holds strings.
tempowordnet_df = read_tempowordnet_manually('./TempoWordNet/TempoWnL_1.0.txt')

print("\nDataFrame Info:")
# DataFrame.info() prints its report itself and returns None, so the surrounding
# print() also emits a trailing "None" line (visible in the cell output).
print(tempowordnet_df.info())
print("\nFirst few rows of the DataFrame:")
print(tempowordnet_df.head())



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117654 entries, 0 to 117653
Data columns (total 8 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   ID                       117654 non-null  object
 1   Synset_name              117654 non-null  object
 2   POS                      117654 non-null  object
 3   Synset_gloss             117654 non-null  object
 4   Prob_of_being_Past       117654 non-null  object
 5   Prob_of_being_Present    117654 non-null  object
 6   Prob_of_being_Future     117654 non-null  object
 7   Prob_of_being_Atemporal  117654 non-null  object
dtypes: object(8)
memory usage: 7.2+ MB
None

First few rows of the DataFrame:
     ID      Synset_name POS  \
0  1740        able.a.01   a   
1  2098      unable.a.01   a   
2  2312     abaxial.a.01   a   
3  2527     adaxial.a.01   a   
4  2730  acroscopic.a.01   a   

                                        Synset_gloss Prob_of_

In [4]:
# Create 'Word' column
# The word is the part of the synset name before the first '.', e.g. 'able.a.01' -> 'able'.
# This mutates tempowordnet_df in place; the cosine-similarity fallback in
# calculate_temporal_probabilities() looks rows up through this column.
tempowordnet_df['Word'] = tempowordnet_df['Synset_name'].str.split('.').str[0]

# Convert probability columns to numeric
# errors='coerce' turns any value that cannot be parsed into NaN instead of raising.
prob_columns = ["Prob_of_being_Past", "Prob_of_being_Present", "Prob_of_being_Future", "Prob_of_being_Atemporal"]
for col in prob_columns:
    tempowordnet_df[col] = pd.to_numeric(tempowordnet_df[col], errors='coerce')

print("\nUpdated DataFrame Info:")
# info() returns None, so this print() also emits a trailing "None" line.
print(tempowordnet_df.info())
print("\nFirst few rows of the updated DataFrame:")
print(tempowordnet_df.head())


Updated DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117654 entries, 0 to 117653
Data columns (total 9 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   ID                       117654 non-null  object 
 1   Synset_name              117654 non-null  object 
 2   POS                      117654 non-null  object 
 3   Synset_gloss             117654 non-null  object 
 4   Prob_of_being_Past       117654 non-null  float64
 5   Prob_of_being_Present    117654 non-null  float64
 6   Prob_of_being_Future     117654 non-null  float64
 7   Prob_of_being_Atemporal  117654 non-null  float64
 8   Word                     117654 non-null  object 
dtypes: float64(4), object(5)
memory usage: 8.1+ MB
None

First few rows of the updated DataFrame:
     ID      Synset_name POS  \
0  1740        able.a.01   a   
1  2098      unable.a.01   a   
2  2312     abaxial.a.01   a   
3  2527     adaxial.a.01   a   
4

In [None]:
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import pinecone
import torch

# Initialize lemmatizer and stemmer
# `lemmatizer` is used by preprocess_text(). In the visible notebook `stemmer` is
# only referenced from a commented-out line there, i.e. stemming is currently not applied.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [None]:
# Initialize BERT tokenizer and model
# Both are loaded from the 'bert-base-uncased' checkpoint and are used as globals
# by get_bert_embedding().
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Initialize Pinecone service
# API_KEY comes from the environment (first cell); the environment name is hard-coded.
pinecone.init(api_key=API_KEY, environment='us-west1-gcp')

index = pinecone.Index('tempowordnet') # handle on the Pinecone index named 'tempowordnet'; cosine_search() calls index.query() on it

In [1]:
# Function for BERT embedding for a word
def get_bert_embedding(word):
    """Return a mean-pooled BERT embedding for ``word``.

    The text is tokenized with the global ``tokenizer``, passed through the
    global ``model``, and the last hidden state is averaged over the token
    dimension (``dim=1``).

    Args:
        word: Text to embed.

    Returns:
        numpy.ndarray holding the averaged hidden state. Callers index the
        first row (``.tolist()[0]``), so it is presumably of shape
        (1, hidden_size) for a single string -- TODO confirm.
    """
    # Tokenize the word and convert it into PyTorch tensors for the model
    encoded_input = tokenizer(word, return_tensors='pt')
    # Inference only: run the forward pass without autograd bookkeeping so no
    # computation graph is built or kept alive for every embedded word.
    with torch.no_grad():
        output = model(**encoded_input)
    # NOTE(review): the mean covers every position the tokenizer emits, which
    # presumably includes special tokens -- confirm this is intended.
    return output.last_hidden_state.mean(dim=1).detach().numpy()

def insert_embedding_to_mongo(word):
    """Embed ``word`` and store it as one document in the MongoDB collection.

    The stored document has the keys 'word' and 'embedding', where
    'embedding' is the first row of get_bert_embedding(word) as a plain list.

    Args:
        word: Text to embed and store.
    """
    # First row of the embedding array, converted to a plain Python list
    vector = get_bert_embedding(word).tolist()[0]
    collection.insert_one({'word': word, 'embedding': vector})

# Function to read TempoWordNet manually
def read_tempowordnet_manually(file_path):
    """Parse a TempoWordNet lexicon file into a DataFrame with numeric probabilities.

    Lines starting with '#' are skipped. Every other line is stripped and
    split on runs of whitespace; lines with fewer than 8 tokens are ignored.
    The first three tokens are ID, synset name and POS, the last four are
    the temporal probabilities, and the tokens in between are re-joined
    with single spaces to form the gloss.

    The four probability columns are converted with ``pd.to_numeric`` using
    ``errors='coerce'``, so unparsable values become NaN.

    The returned frame always has the eight expected columns, even when the
    file contains no usable rows (previously that case raised ``KeyError``
    during the numeric conversion).

    Args:
        file_path: Path of the lexicon file.

    Returns:
        pandas.DataFrame with columns ID, Synset_name, POS, Synset_gloss and
        the four Prob_of_being_* columns.
    """
    columns = [
        "ID",
        "Synset_name",
        "POS",
        "Synset_gloss",
        "Prob_of_being_Past",
        "Prob_of_being_Present",
        "Prob_of_being_Future",
        "Prob_of_being_Atemporal",
    ]
    prob_columns = columns[-4:]

    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.startswith('#'):
                continue
            split_line = line.strip().split()
            # Need at least ID, name, POS, one gloss token and four probabilities
            if len(split_line) < 8:
                continue
            values = split_line[:3] + [' '.join(split_line[3:-4])] + split_line[-4:]
            data.append(dict(zip(columns, values)))

    # Passing `columns` keeps the schema stable when `data` is empty.
    df = pd.DataFrame(data, columns=columns)
    for col in prob_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df


# Preprocess the input text
def preprocess_text(text):
    """Tokenize ``text``, drop English stopwords and lemmatize what remains.

    Stopword filtering compares the lower-cased token, but the token itself
    keeps its original casing. Only lemmatization is applied (no stemming).

    Args:
        text: Raw input sentence.

    Returns:
        List of lemmatized tokens in their original order.
    """
    tokens = word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))

    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.lower() not in english_stopwords
    ]


# Cosine Similarity Search Function
def cosine_search(word_embedding, threshold=0.9):
    """Query the global Pinecone ``index`` and keep sufficiently similar matches.

    Args:
        word_embedding: Query vector passed to ``index.query``.
        threshold: A match is kept only when its score is strictly greater
            than this value (default 0.9).

    Returns:
        List of the 'id' values of the kept matches, in the order returned
        by the index; at most 10 matches are requested.
    """
    response = index.query(word_embedding, top_k=10)
    return [hit['id'] for hit in response['matches'] if hit['score'] > threshold]


# Calculate average temporal probabilities and extract IDs
def calculate_temporal_probabilities(words, df):
    """Average the TempoWordNet probabilities of all synsets matched by ``words``.

    For each word, every row whose 'Synset_name' contains the word as a
    literal, case-sensitive substring contributes its four probabilities.
    When a word matches no row, its BERT embedding is looked up through
    cosine_search() and, for each similar word returned, the first row whose
    word equals it contributes instead.

    Fixes over the previous version:
      * The substring search is literal (``regex=False``). Tokens such as
        '...' or '(' were interpreted as regular expressions, which either
        matched every synset or raised and skipped the word.
      * The fallback no longer requires a precomputed 'Word' column; when
        ``df`` lacks it, the word is derived from 'Synset_name' (text before
        the first '.'). Previously the missing column raised a KeyError that
        was swallowed, silently disabling the fallback.

    Args:
        words: Iterable of preprocessed tokens.
        df: TempoWordNet DataFrame with 'ID', 'Synset_name' and the four
            'Prob_of_being_*' columns; a 'Word' column is optional.

    Returns:
        Tuple ``(avg_probs, ids)``: ``avg_probs`` is the element-wise mean
        (Past, Present, Future, Atemporal) over all collected rows, or None
        when nothing was collected; ``ids`` lists the 'ID' of each
        contributing row.
    """
    prob_fields = (
        'Prob_of_being_Past',
        'Prob_of_being_Present',
        'Prob_of_being_Future',
        'Prob_of_being_Atemporal',
    )
    temporal_probs = []  # one 4-tuple of probabilities per contributing row
    ids = []             # 'ID' of each contributing row, parallel to temporal_probs
    lemmas = None        # built lazily: only the fallback path needs it

    for word in words:
        try:
            # Literal, case-sensitive substring search over the synset names
            matches = df[df['Synset_name'].str.contains(word, regex=False)]

            if matches.empty:
                # No direct match: fall back to embedding similarity
                word_embedding = get_bert_embedding(word)
                similar_words = cosine_search(word_embedding, threshold=0.9)

                if similar_words:
                    if lemmas is None:
                        if 'Word' in df.columns:
                            lemmas = df['Word']
                        else:
                            lemmas = df['Synset_name'].str.split('.').str[0]

                    for similar_word in similar_words:
                        match = df[lemmas == similar_word]
                        if not match.empty:
                            # Only the first synset of a similar word is used
                            first_row = match.iloc[0]
                            temporal_probs.append(tuple(first_row[field] for field in prob_fields))
                            ids.append(first_row['ID'])
            else:
                for _, row in matches.iterrows():
                    temporal_probs.append(tuple(row[field] for field in prob_fields))
                    ids.append(row['ID'])

        except Exception as e:
            # Best effort: report the failing word and continue with the rest
            print(f"Error processing word: {word}")
            print(f"Exception: {e}")

    if temporal_probs:
        # FUTURE!   consider changing average to weighted mean or any other method
        avg_probs = np.mean(temporal_probs, axis=0)
        return avg_probs, ids
    else:
        return None, []
    
    # Process the input sentence
def process_sentence(sentence, df):
    """Preprocess ``sentence`` and compute its averaged temporal probabilities.

    Args:
        sentence: Raw input sentence.
        df: TempoWordNet DataFrame forwarded to
            calculate_temporal_probabilities().

    Returns:
        Tuple ``(words, avg_probs, ids)``: the preprocessed tokens followed
        by the two values returned by calculate_temporal_probabilities().
    """
    tokens = preprocess_text(sentence)
    mean_probs, synset_ids = calculate_temporal_probabilities(tokens, df)
    return tokens, mean_probs, synset_ids

NameError: name 'BertTokenizer' is not defined

In [None]:
# Path to your TempoWordNet file
file_path = './TempoWordNet/TempoWnL_1.0.txt'

# NOTE: this rebinds tempowordnet_df using the most recently defined
# read_tempowordnet_manually (whitespace split + numeric conversion). The frame
# returned here does not carry the 'Word' column that an earlier cell added to
# the previous tempowordnet_df.
tempowordnet_df = read_tempowordnet_manually(file_path)

In [None]:

# # Insert BERT embeddings into Pinecone (one-time setup: run this cell only once)
#for word in tempowordnet_df['Word']:
#     embedding = get_bert_embedding(word)
#     index.upsert([(word, embedding)])

In [None]:
# Input sentence
sentence = input('Input sentence for processing: ')

# Process the sentence
preprocessed_text, avg_temporal_probs, ids = process_sentence(sentence, tempowordnet_df)

In [None]:
# Append results to a text file
# The file is opened in append mode, so every run adds a new report section
# (separator, original sentence, preprocessed tokens, per-synset table, averages).
output_file_path = 'processed_results.txt'
with open(output_file_path, 'a', encoding='utf-8') as file:
    file.write(f"\n{'-'*50}\n")
    file.write(f"Original Sentence:\n{sentence}\n\n")
    file.write(f"Preprocessed Text:\n{' '.join(preprocessed_text)}\n\n")
    if avg_temporal_probs is not None:
        file.write(f"{'ID':<15} {'Past':<10} {'Present':<10} {'Future':<10} {'Atemporal':<10}\n")
        file.write(f"{'-'*50}\n")
        # Loop variable is named synset_id so the builtin id() is not shadowed
        # in the notebook's global namespace.
        for synset_id in ids:
            # First row carrying this ID supplies the per-synset probabilities
            match = tempowordnet_df[tempowordnet_df['ID'] == synset_id].iloc[0]
            file.write(f"{synset_id:<15} {match['Prob_of_being_Past']:<10.6f} {match['Prob_of_being_Present']:<10.6f} {match['Prob_of_being_Future']:<10.6f} {match['Prob_of_being_Atemporal']:<10.6f}\n")
        file.write("\nAverage Temporal Probabilities:\n")
        file.write(f"Past: {avg_temporal_probs[0]:.6f}\n")
        file.write(f"Present: {avg_temporal_probs[1]:.6f}\n")
        file.write(f"Future: {avg_temporal_probs[2]:.6f}\n")
        file.write(f"Atemporal: {avg_temporal_probs[3]:.6f}\n")
    else:
        file.write("No valid words found in TempoWordNet.\n")

print(f"Results have been saved to {output_file_path}.")

Results have been saved to processed_results.txt.
