In [35]:
import pandas as pd

def peek_file(file_path, num_lines=5):
    """Print the first ``num_lines`` lines of a UTF-8 text file.

    Each line is printed with leading/trailing whitespace stripped.
    If the file has fewer than ``num_lines`` lines, printing stops at
    end of file instead of emitting blank lines for the missing ones.

    Args:
        file_path: Path of the text file to preview.
        num_lines: Maximum number of lines to print (default 5).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterating the file object stops at EOF, unlike readline(),
        # which keeps returning '' once the file is exhausted.
        for line_number, line in enumerate(file):
            if line_number >= num_lines:
                break
            print(line.strip())

# Preview the first lines of the raw TempoWordNet file (peek_file defaults
# to 5 lines) to inspect its header and column layout before parsing it.
print("File contents:")
peek_file('./TempoWordNet/TempoWnL_1.0.txt')

File contents:
#ID    Name    POS   Gloss  Prob_of_being_Past    Prob_of_being_Present    Prob_of_being_Future    Prob_of_being_Atemporal
1740    able.a.01    a    (usually followed by `to') having the necessary means or skill or know-how or authority to do something    0.000000   0.000238   0.118762   0.881
2098    unable.a.01    a    (usually followed by `to') not having the necessary means or skill or know-how    0.000000   0.001728   0.862272   0.136
2312    abaxial.a.01    a    facing away from the axis of an organ or organism    0.000000   0.149700   0.000300   0.85
2527    adaxial.a.01    a    nearest to or facing toward the axis of an organ or organism    0.000000   0.700596   0.001404   0.298


In [49]:
def read_tempowordnet_manually(file_path):
    """Parse a TempoWordNet lexicon file into a DataFrame.

    Each data line is split on whitespace. The first three tokens are the
    ID, synset name and POS tag; the last four tokens are the past /
    present / future / atemporal probabilities; everything in between is
    re-joined with single spaces to form the gloss.

    Lines whose first token starts with ``#`` (comments / the header),
    blank lines, and lines with fewer than 8 tokens are skipped.

    Args:
        file_path: Path to the UTF-8 encoded TempoWordNet text file.

    Returns:
        pandas.DataFrame with the columns ``ID``, ``Synset_name``, ``POS``,
        ``Synset_gloss``, ``Prob_of_being_Past``, ``Prob_of_being_Present``,
        ``Prob_of_being_Future`` and ``Prob_of_being_Atemporal``. All values
        are kept as strings. The columns are present even when no data row
        was parsed, so downstream column access does not fail on an empty
        or comment-only file.
    """
    columns = [
        "ID",
        "Synset_name",
        "POS",
        "Synset_gloss",
        "Prob_of_being_Past",
        "Prob_of_being_Present",
        "Prob_of_being_Future",
        "Prob_of_being_Atemporal",
    ]

    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Manually split the line on any whitespace
            split_line = line.split()

            # Skip blank lines and comments; checking the first token (rather
            # than the raw line) also catches comment lines that are indented.
            if not split_line or split_line[0].startswith('#'):
                continue

            # A valid row needs ID, name, POS, at least one gloss token
            # and the four trailing probability fields.
            if len(split_line) < 8:
                continue

            id_field, synset_name_field, pos_field = split_line[:3]

            # Gloss is all fields between POS and the first probability field
            gloss_field = ' '.join(split_line[3:-4])

            # Temporal probability fields are the last four tokens
            prob_past, prob_present, prob_future, prob_atemporal = split_line[-4:]

            data.append({
                "ID": id_field,
                "Synset_name": synset_name_field,
                "POS": pos_field,
                "Synset_gloss": gloss_field,
                "Prob_of_being_Past": prob_past,
                "Prob_of_being_Present": prob_present,
                "Prob_of_being_Future": prob_future,
                "Prob_of_being_Atemporal": prob_atemporal,
            })

    # Passing columns explicitly keeps the schema stable for empty input
    return pd.DataFrame(data, columns=columns)

# Load the full TempoWordNet lexicon into a DataFrame
tempowordnet_df = read_tempowordnet_manually('./TempoWordNet/TempoWnL_1.0.txt')

print("\nDataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line.
tempowordnet_df.info()
print("\nFirst few rows of the DataFrame:")
print(tempowordnet_df.head())



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117658 entries, 0 to 117657
Data columns (total 8 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   ID                       117658 non-null  object
 1   Synset_name              117658 non-null  object
 2   POS                      117658 non-null  object
 3   Synset_gloss             117658 non-null  object
 4   Prob_of_being_Past       117658 non-null  object
 5   Prob_of_being_Present    117658 non-null  object
 6   Prob_of_being_Future     117658 non-null  object
 7   Prob_of_being_Atemporal  117658 non-null  object
dtypes: object(8)
memory usage: 7.2+ MB
None

First few rows of the DataFrame:
     ID      Synset_name POS  \
0  1740        able.a.01   a   
1  2098      unable.a.01   a   
2  2312     abaxial.a.01   a   
3  2527     adaxial.a.01   a   
4  2730  acroscopic.a.01   a   

                                        Synset_gloss Prob_of_

In [50]:
# Create 'Word' column from the synset name, which has the form
# "<lemma>.<pos>.<sense number>". Splitting from the right at most twice
# strips only the trailing POS and sense number, so a lemma that itself
# contains a period is kept intact instead of being cut at its first dot.
tempowordnet_df['Word'] = tempowordnet_df['Synset_name'].str.rsplit('.', n=2).str[0]

# Convert probability columns to numeric; unparsable values become NaN
prob_columns = ["Prob_of_being_Past", "Prob_of_being_Present", "Prob_of_being_Future", "Prob_of_being_Atemporal"]
for col in prob_columns:
    tempowordnet_df[col] = pd.to_numeric(tempowordnet_df[col], errors='coerce')

print("\nUpdated DataFrame Info:")
# DataFrame.info() prints its own report and returns None, so it is called
# directly rather than through print() to avoid a stray "None" line.
tempowordnet_df.info()
print("\nFirst few rows of the updated DataFrame:")
print(tempowordnet_df.head())


Updated DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117658 entries, 0 to 117657
Data columns (total 9 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   ID                       117658 non-null  object 
 1   Synset_name              117658 non-null  object 
 2   POS                      117658 non-null  object 
 3   Synset_gloss             117658 non-null  object 
 4   Prob_of_being_Past       117658 non-null  float64
 5   Prob_of_being_Present    117658 non-null  float64
 6   Prob_of_being_Future     117658 non-null  float64
 7   Prob_of_being_Atemporal  117658 non-null  float64
 8   Word                     117658 non-null  object 
dtypes: float64(4), object(5)
memory usage: 8.1+ MB
None

First few rows of the updated DataFrame:
     ID      Synset_name POS  \
0  1740        able.a.01   a   
1  2098      unable.a.01   a   
2  2312     abaxial.a.01   a   
3  2527     adaxial.a.01   a   
4

In [51]:
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Module-level WordNet lemmatizer instance, used by preprocess_text below
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Tokenize ``text``, drop English stop words and lemmatize the rest.

    Tokens are lower-cased before both the stop-word check and the
    lemmatization.

    Args:
        text: Input string to preprocess.

    Returns:
        List of lower-cased, lemmatized tokens in their original order.
    """
    tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))

    kept_lemmas = []
    for token in tokens:
        lowered = token.lower()
        if lowered in english_stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(lowered))
    return kept_lemmas

def calculate_temporal_probabilities(words):
    """Average the TempoWordNet temporal probabilities over ``words``.

    Each word is matched case-insensitively against the ``Word`` column of
    the module-level ``tempowordnet_df``. When several rows match, only the
    first one is used. Words with no match are reported on stdout and
    skipped.

    Args:
        words: Iterable of word strings to look up.

    Returns:
        A NumPy array of four values (past, present, future, atemporal)
        holding the column-wise mean over all matched words, or ``None``
        when no word was found.
    """
    prob_cols = [
        'Prob_of_being_Past',
        'Prob_of_being_Present',
        'Prob_of_being_Future',
        'Prob_of_being_Atemporal',
    ]

    # Lower-case the lookup column once up front; doing it inside the loop
    # would rescan and re-lowercase the whole column for every word.
    lowered_lexicon_words = tempowordnet_df['Word'].str.lower()

    temporal_probs = []
    for word in words:
        matches = tempowordnet_df.loc[lowered_lexicon_words == word.lower(), prob_cols]
        if matches.empty:
            print(f'{word} not found in TWN, hence skipped')
            continue
        # Only the first matching row contributes to the average
        temporal_probs.append(tuple(matches.iloc[0]))

    if temporal_probs:
        return np.mean(temporal_probs, axis=0)
    return None

# Test the functions
test_sentence = "Unprocessed lifestyle is the best dead"
preprocessed_words = preprocess_text(test_sentence)
print("Preprocessed words:", preprocessed_words)

avg_temporal_probs = calculate_temporal_probabilities(preprocessed_words)
if avg_temporal_probs is not None:
    print(f"Average Temporal Probabilities:\nPast: {avg_temporal_probs[0]:.4f}, Present: {avg_temporal_probs[1]:.4f}, Future: {avg_temporal_probs[2]:.4f}, Atemporal: {avg_temporal_probs[3]:.4f}")
else:
    print("No valid words found in TempoWordNet.")

Preprocessed words: ['unprocessed', 'lifestyle', 'best', 'dead']
lifestyle not found in TWN, hence skipped
Average Temporal Probabilities:
Past: 0.0000, Present: 0.3423, Future: 0.0007, Atemporal: 0.6570
