In [1]:
import pandas as pd

def peek_file(file_path, num_lines=5):
    """Print the first ``num_lines`` lines of a UTF-8 text file.

    Each line is printed with surrounding whitespace stripped. If the file
    has fewer than ``num_lines`` lines, printing stops at end of file
    instead of emitting empty lines for the missing ones.

    Args:
        file_path: Path of the text file to preview.
        num_lines: Maximum number of lines to print (default 5).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # zip() asks range() first, so no extra line is consumed once the
        # requested count is reached, and iteration ends cleanly at EOF.
        for _, line in zip(range(num_lines), file):
            print(line.strip())

# Preview the header and first records of the raw lexicon file (peek_file
# prints 5 lines by default) to inspect its column layout before parsing.
print("File contents:")
peek_file('./TempoWordNet/TempoWnL_1.0.txt')

File contents:
#ID    Name    POS   Gloss  Prob_of_being_Past    Prob_of_being_Present    Prob_of_being_Future    Prob_of_being_Atemporal
1740    able.a.01    a    (usually followed by `to') having the necessary means or skill or know-how or authority to do something    0.000000   0.000238   0.118762   0.881
2098    unable.a.01    a    (usually followed by `to') not having the necessary means or skill or know-how    0.000000   0.001728   0.862272   0.136
2312    abaxial.a.01    a    facing away from the axis of an organ or organism    0.000000   0.149700   0.000300   0.85
2527    adaxial.a.01    a    nearest to or facing toward the axis of an organ or organism    0.000000   0.700596   0.001404   0.298


In [2]:
def read_tempowordnet_manually(file_path):
    """Parse a TempoWordNet text file into a DataFrame of string fields.

    Every non-comment line is split on whitespace. The first three tokens
    are taken as ID, synset name and POS, the last four tokens as the
    past/present/future/atemporal probabilities, and everything in between
    is re-joined with single spaces to form the gloss.

    Lines starting with ``#`` and lines with fewer than 8 tokens are
    skipped without any message.

    Args:
        file_path: Path to the TempoWordNet ``.txt`` file (read as UTF-8).

    Returns:
        pandas.DataFrame with the eight columns listed in ``columns``
        below. All values are left as the strings read from the file. The
        columns are present even when no data line was parsed, so that
        downstream column access does not fail on an empty file.
    """
    columns = [
        "ID",
        "Synset_name",
        "POS",
        "Synset_gloss",
        "Prob_of_being_Past",
        "Prob_of_being_Present",
        "Prob_of_being_Future",
        "Prob_of_being_Atemporal",
    ]
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip comments (this includes the '#ID ...' header line)
            if line.startswith('#'):
                continue

            split_line = line.strip().split()
            if len(split_line) < 8:
                continue

            # The gloss contains spaces itself, so the fixed-position fields
            # are peeled off both ends and the remainder is the gloss.
            leading = split_line[:3]
            gloss_field = ' '.join(split_line[3:-4])
            trailing = split_line[-4:]
            data.append(dict(zip(columns, [*leading, gloss_field, *trailing])))

    # Passing `columns` fixes the schema even when `data` is empty.
    return pd.DataFrame(data, columns=columns)

# Parse the lexicon file into a DataFrame (all fields are still strings here).
tempowordnet_df = read_tempowordnet_manually('./TempoWordNet/TempoWnL_1.0.txt')

print("\nDataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
tempowordnet_df.info()
print("\nFirst few rows of the DataFrame:")
print(tempowordnet_df.head())



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117658 entries, 0 to 117657
Data columns (total 8 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   ID                       117658 non-null  object
 1   Synset_name              117658 non-null  object
 2   POS                      117658 non-null  object
 3   Synset_gloss             117658 non-null  object
 4   Prob_of_being_Past       117658 non-null  object
 5   Prob_of_being_Present    117658 non-null  object
 6   Prob_of_being_Future     117658 non-null  object
 7   Prob_of_being_Atemporal  117658 non-null  object
dtypes: object(8)
memory usage: 7.2+ MB
None

First few rows of the DataFrame:
     ID      Synset_name POS  \
0  1740        able.a.01   a   
1  2098      unable.a.01   a   
2  2312     abaxial.a.01   a   
3  2527     adaxial.a.01   a   
4  2730  acroscopic.a.01   a   

                                        Synset_gloss Prob_of_

In [3]:
# Create 'Word' column.
# Synset names have the form '<lemma>.<pos>.<sense>' (e.g. 'able.a.01'), so the
# two trailing fields are stripped from the right. Splitting from the left and
# taking the first piece would truncate any lemma that itself contains a dot.
tempowordnet_df['Word'] = tempowordnet_df['Synset_name'].str.rsplit('.', n=2).str[0]

# Convert probability columns to numeric.
# errors='coerce' turns any value that cannot be parsed into NaN instead of raising.
prob_columns = ["Prob_of_being_Past", "Prob_of_being_Present", "Prob_of_being_Future", "Prob_of_being_Atemporal"]
for col in prob_columns:
    tempowordnet_df[col] = pd.to_numeric(tempowordnet_df[col], errors='coerce')

print("\nUpdated DataFrame Info:")
# DataFrame.info() prints its report itself and returns None; calling it
# directly avoids printing a stray "None" after the report.
tempowordnet_df.info()
print("\nFirst few rows of the updated DataFrame:")
print(tempowordnet_df.head())


Updated DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117658 entries, 0 to 117657
Data columns (total 9 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   ID                       117658 non-null  object 
 1   Synset_name              117658 non-null  object 
 2   POS                      117658 non-null  object 
 3   Synset_gloss             117658 non-null  object 
 4   Prob_of_being_Past       117658 non-null  float64
 5   Prob_of_being_Present    117658 non-null  float64
 6   Prob_of_being_Future     117658 non-null  float64
 7   Prob_of_being_Atemporal  117658 non-null  float64
 8   Word                     117658 non-null  object 
dtypes: float64(4), object(5)
memory usage: 8.1+ MB
None

First few rows of the updated DataFrame:
     ID      Synset_name POS  \
0  1740        able.a.01   a   
1  2098      unable.a.01   a   
2  2312     abaxial.a.01   a   
3  2527     adaxial.a.01   a   
4

In [4]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

# Initialize lemmatizer and stemmer once at module level; preprocess_text()
# below uses both of them for every token it normalizes.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Function to read TempoWordNet manually
def read_tempowordnet_manually(file_path):
    """Parse a TempoWordNet text file into a DataFrame with numeric probabilities.

    Every non-comment line is split on whitespace. The first three tokens
    are taken as ID, synset name and POS, the last four tokens as the
    past/present/future/atemporal probabilities, and everything in between
    is re-joined with single spaces to form the gloss.

    Lines starting with ``#`` and lines with fewer than 8 tokens are
    skipped without any message.

    Args:
        file_path: Path to the TempoWordNet ``.txt`` file (read as UTF-8).

    Returns:
        pandas.DataFrame with columns ID, Synset_name, POS, Synset_gloss and
        the four ``Prob_of_being_*`` columns. The probability columns are
        converted with ``pd.to_numeric(errors='coerce')``, so unparseable
        values become NaN. The columns exist even when the file contains no
        data lines (the result is then an empty frame rather than a
        KeyError during conversion).
    """
    text_columns = ["ID", "Synset_name", "POS", "Synset_gloss"]
    prob_columns = [
        "Prob_of_being_Past",
        "Prob_of_being_Present",
        "Prob_of_being_Future",
        "Prob_of_being_Atemporal",
    ]
    columns = text_columns + prob_columns

    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.startswith('#'):
                continue
            split_line = line.strip().split()
            if len(split_line) < 8:
                continue
            # The gloss contains spaces itself, so the fixed-position fields
            # are peeled off both ends and the remainder is the gloss.
            gloss_field = ' '.join(split_line[3:-4])
            values = [*split_line[:3], gloss_field, *split_line[-4:]]
            data.append(dict(zip(columns, values)))

    # Passing `columns` fixes the schema even when `data` is empty.
    df = pd.DataFrame(data, columns=columns)
    for col in prob_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

# Preprocess the input text
def preprocess_text(text):
    """Tokenize ``text`` and normalize the tokens for lexicon lookup.

    Steps, in order:
      1. Tokenize with ``word_tokenize``.
      2. Drop tokens that contain no letter or digit at all. The tokenizer
         emits punctuation marks as separate tokens; such tokens are not
         words and must not be used as search terms.
      3. Drop English stop words (compared case-insensitively).
      4. Lemmatize each remaining token, then stem the lemma.

    Args:
        text: Input sentence or passage.

    Returns:
        list[str]: The normalized tokens, in their original order.
    """
    stop_words = set(stopwords.words('english'))
    kept = [
        token
        for token in word_tokenize(text)
        if any(ch.isalnum() for ch in token) and token.lower() not in stop_words
    ]
    return [stemmer.stem(lemmatizer.lemmatize(token)) for token in kept]

# Calculate average temporal probabilities and extract IDs
def calculate_temporal_probabilities(words, df):
    """Average the temporal probabilities of all synsets matching ``words``.

    A synset matches a word when the word occurs as a literal substring of
    its ``Synset_name``. The word is NOT interpreted as a regular
    expression, so tokens containing regex metacharacters (``.``, ``+``,
    ``(`` ...) are matched verbatim instead of matching unrelated rows or
    raising a regex error.

    Args:
        words: Iterable of search tokens. Empty and non-string tokens are
            skipped.
        df: DataFrame with the columns ``ID``, ``Synset_name`` and the four
            numeric ``Prob_of_being_*`` columns.

    Returns:
        tuple: ``(avg_probs, ids)`` where ``avg_probs`` is a NumPy array
        ``[past, present, future, atemporal]`` holding the column-wise mean
        over every matched row (a row matched by several words is counted
        once per word), and ``ids`` is the list of matched ``ID`` values in
        match order. Returns ``(None, [])`` when nothing matched.
    """
    prob_columns = [
        'Prob_of_being_Past',
        'Prob_of_being_Present',
        'Prob_of_being_Future',
        'Prob_of_being_Atemporal',
    ]
    temporal_probs = []
    ids = []
    for word in words:
        # An empty needle is a substring of every name and would select the
        # whole table; non-string tokens cannot be searched for at all.
        if not isinstance(word, str) or not word:
            continue
        # regex=False: literal match. na=False: rows with a missing name
        # count as "no match" instead of producing an unusable NaN mask.
        mask = df['Synset_name'].str.contains(word, regex=False, na=False)
        matches = df[mask]
        if matches.empty:
            continue
        temporal_probs.extend(matches[prob_columns].itertuples(index=False, name=None))
        ids.extend(matches['ID'].tolist())

    if temporal_probs:
        avg_probs = np.mean(temporal_probs, axis=0)
        return avg_probs, ids
    else:
        return None, []

# Process the input sentence
def process_sentence(sentence, df):
    """Run the full pipeline for one sentence.

    The sentence is normalized with ``preprocess_text`` and the resulting
    tokens are handed to ``calculate_temporal_probabilities`` together
    with ``df``.

    Returns:
        tuple: ``(tokens, avg_probs, ids)`` - the normalized tokens followed
        by the two values returned by ``calculate_temporal_probabilities``.
    """
    tokens = preprocess_text(sentence)
    mean_probs, matched_ids = calculate_temporal_probabilities(tokens, df)
    return (tokens, mean_probs, matched_ids)

# Path to your TempoWordNet file
file_path = './TempoWordNet/TempoWnL_1.0.txt'
tempowordnet_df = read_tempowordnet_manually(file_path)

# Input sentence (interactive prompt: this cell blocks until text is entered)
sentence = input('Input sentence for processing: ')

# Process the sentence
preprocessed_text, avg_temporal_probs, ids = process_sentence(sentence, tempowordnet_df)

# Append results to a text file (mode 'a': earlier runs are kept)
output_file_path = 'processed_results.txt'
with open(output_file_path, 'a', encoding='utf-8') as file:
    file.write(f"\n{'-'*50}\n")
    file.write(f"Original Sentence:\n{sentence}\n\n")
    file.write(f"Preprocessed Text:\n{' '.join(preprocessed_text)}\n\n")
    if avg_temporal_probs is not None:
        # Build the ID -> probabilities lookup once instead of scanning the
        # whole frame for every matched ID. keep='first' reproduces the
        # "first row with this ID" choice of a per-ID boolean scan.
        # NOTE(review): if IDs are not unique in the file, the first row with
        # a given ID is reported, which may not be the row that was actually
        # matched - confirm ID uniqueness against the data.
        prob_columns = [
            'Prob_of_being_Past',
            'Prob_of_being_Present',
            'Prob_of_being_Future',
            'Prob_of_being_Atemporal',
        ]
        probs_by_id = (
            tempowordnet_df
            .drop_duplicates(subset='ID', keep='first')
            .set_index('ID')[prob_columns]
        )

        file.write(f"{'ID':<15} {'Past':<10} {'Present':<10} {'Future':<10} {'Atemporal':<10}\n")
        file.write(f"{'-'*50}\n")
        # `synset_id` rather than `id`, to avoid shadowing the builtin.
        for synset_id in ids:
            match = probs_by_id.loc[synset_id]
            file.write(f"{synset_id:<15} {match['Prob_of_being_Past']:<10.6f} {match['Prob_of_being_Present']:<10.6f} {match['Prob_of_being_Future']:<10.6f} {match['Prob_of_being_Atemporal']:<10.6f}\n")
        file.write(f"\nAverage Temporal Probabilities:\n")
        file.write(f"Past: {avg_temporal_probs[0]:.6f}\n")
        file.write(f"Present: {avg_temporal_probs[1]:.6f}\n")
        file.write(f"Future: {avg_temporal_probs[2]:.6f}\n")
        file.write(f"Atemporal: {avg_temporal_probs[3]:.6f}\n")
    else:
        file.write("No valid words found in TempoWordNet.\n")

print(f"Results have been saved to {output_file_path}.")


Results have been saved to processed_results.txt.
