In [1]:
%run transition_graph.ipynb
import numpy as np

In [13]:
# Build a {word: vector} lookup from the whitespace-separated embedding file:
# the first field of each line is the word, the remaining fields are parsed
# as float32 vector components.
word_embeddings = {}

with open("W2V-Kin-50.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        # split() with no separator already ignores leading/trailing whitespace.
        fields = raw_line.split()
        if len(fields) == 0:
            continue

        token, *components = fields
        try:
            parsed = np.array(components, dtype=np.float32)
        except ValueError:
            # At least one component is not a valid float: drop the whole line.
            continue
        word_embeddings[token] = parsed

print(f"loaded {len(word_embeddings)} word embeddings!")



loaded 54283 word embeddings!


In [14]:
def find_valid_inflections(stem:str, emdeddings: dict, inflection_list:list) ->list:
    """Return the inflections that have an entry in the embedding table.

    Args:
        stem: Stem the inflections belong to. It is not used by the filter;
            it stays in the signature so existing call sites keep working.
        emdeddings: Mapping whose keys are the known words. The parameter
            name is misspelled, but it is kept so keyword callers do not break.
        inflection_list: Candidate inflections to filter.

    Returns:
        A new list holding the members of ``inflection_list`` that are keys
        of ``emdeddings``, in their original order.
    """
    # The earlier body yielded an undefined name (`infection`) and tested
    # membership against `embeddings` instead of the argument actually passed,
    # so it raised NameError for any non-trivial input.
    return [inflection for inflection in inflection_list if inflection in emdeddings]


In [18]:
def sigmoid(z: float, min_f: float, max_f: float) -> float:
    """Min-max scale ``z`` against ``[min_f, max_f]`` and squash it with a logistic curve.

    The scaled value ``(z - min_f) / (max_f - min_f)`` is multiplied by 8
    before the logistic function ``1 / (1 + exp(-t))`` is applied.

    Args:
        z: Value to transform.
        min_f: Lower bound used for the min-max scaling.
        max_f: Upper bound used for the min-max scaling.

    Returns:
        The logistic of the scaled value, or 0 when ``min_f`` equals ``max_f``.
    """
    span = max_f - min_f
    if span == 0:
        # Degenerate range: scaling would divide by zero.
        return 0
    scaled = (z - min_f) / span
    return 1 / (1 + np.exp(-8 * scaled))


In [22]:
def compute_cosine_similarity(x:str, y:str, embeddings:dict, Minf:float, Maxf:float)->float:
    """Score how similar the embeddings of two words are.

    The cosine of the angle between the two vectors is converted to an
    angular similarity, ``1 - arccos(cos) / pi``, and that value is passed
    through ``sigmoid`` together with ``Minf`` and ``Maxf``.

    Args:
        x: First word.
        y: Second word.
        embeddings: Mapping from word to its embedding vector.
        Minf: Lower bound forwarded to ``sigmoid``.
        Maxf: Upper bound forwarded to ``sigmoid``.

    Returns:
        The value returned by ``sigmoid`` for the angular similarity, or 0.0
        when either word has no embedding or either vector has zero norm.
    """
    if x not in embeddings or y not in embeddings:
        return 0.0
    x_vec, y_vec = embeddings[x], embeddings[y]

    norm_product = np.linalg.norm(x_vec) * np.linalg.norm(y_vec)
    if norm_product == 0:
        # An all-zero (or empty) vector has no direction; dividing by its norm
        # would yield NaN. Use the same fallback as for a missing word.
        return 0.0

    cosine_similarity = np.dot(x_vec, y_vec) / norm_product
    # Rounding can push the ratio marginally outside [-1, 1], where arccos is undefined.
    cosine_similarity = np.clip(cosine_similarity, -1.0, 1.0)
    angular_normalized = 1 - (1 / np.pi) * np.arccos(cosine_similarity)

    return sigmoid(angular_normalized, Minf, Maxf)

In [23]:
def compute_eculidean_distance(x:str, y:str, token_count:dict, doc_count:dict, Minf:float, Maxf:float)->float:
    """Score the frequency distance between two words.

    Each word is treated as the point ``(token_count[word], doc_count[word])``;
    the Euclidean distance between the two points is passed through
    ``sigmoid`` together with ``Minf`` and ``Maxf``.

    Args:
        x: First word.
        y: Second word.
        token_count: Mapping from word to a count; a missing word counts as 0.
        doc_count: Mapping from word to a count; a missing word counts as 0.
        Minf: Lower bound forwarded to ``sigmoid``.
        Maxf: Upper bound forwarded to ``sigmoid``.

    Returns:
        The value returned by ``sigmoid`` for the Euclidean distance.
    """
    tc_x, tc_y = token_count.get(x, 0), token_count.get(y, 0)
    td_x, td_y = doc_count.get(x, 0), doc_count.get(y, 0)

    # Both coordinates contribute a squared *difference*. The document term was
    # previously a sum, which made the distance from a word to itself non-zero.
    euclidean_distance = np.sqrt((tc_x - tc_y)**2 + (td_x - td_y)**2)
    return sigmoid(euclidean_distance, Minf, Maxf)


# Correctly spelled alias. The original (misspelled) name stays defined so
# existing callers of either spelling resolve to the same function.
compute_euclidean_distance = compute_eculidean_distance

In [28]:
def compute_similarity_features(x:str, stem:str, embeddigns:dict,token_count:dict,
                                doc_count:dict,Minf:float, Maxf:float, k:int)->dict:
    """Build similarity and frequency features for ``x`` against a stem's inflections.

    The inflections come from ``graph.get_inflection(stem)`` (``graph`` is a
    module-level object defined outside this cell) and are restricted to the
    ones present in the embedding table.

    Args:
        x: Word to score.
        stem: Stem whose inflections ``x`` is compared with.
        embeddigns: Mapping from word to embedding vector. The parameter name
            is misspelled, but it is kept so keyword callers do not break.
        token_count: Mapping from word to a count; a missing word counts as 0.
        doc_count: Mapping from word to a count; a missing word counts as 0.
        Minf: Lower bound forwarded to the scoring functions.
        Maxf: Upper bound forwarded to the scoring functions.
        k: Number of scores kept for the cosine and Euclidean aggregates.

    Returns:
        An empty dict when no inflection of ``stem`` has an embedding.
        Otherwise a dict with the keys ``cosine_mean``,
        ``cosine_harmonic_mean``, ``cosine_geometric_mean``,
        ``euclidean_mean``, ``freq_mean_inflections`` and ``freq_mean_x``.
        The harmonic and geometric means are 0.0 unless every kept similarity
        is strictly positive.
    """
    features = {}
    # The body used to read an undefined global `embeddings`; bind the
    # (misspelled) parameter to that name instead.
    embeddings = embeddigns

    valid_inflections = [inflection for inflection in graph.get_inflection(stem)
                         if inflection in embeddings]
    if not valid_inflections:
        return features

    # --- cosine-similarity aggregates ---
    similarities = [compute_cosine_similarity(x, inflection, embeddings, Minf, Maxf)
                    for inflection in valid_inflections]
    # "Top k" means the k highest similarities, so sort in descending order.
    top_k_similarities = sorted(similarities, reverse=True)[:k]

    features["cosine_mean"] = np.mean(top_k_similarities) if top_k_similarities else 0.0

    # Harmonic and geometric means are only defined for strictly positive
    # values. A zero score (e.g. `x` has no embedding) would otherwise divide
    # by zero or take log(0).
    all_positive = bool(top_k_similarities) and all(s > 0 for s in top_k_similarities)
    if all_positive:
        features["cosine_harmonic_mean"] = len(top_k_similarities) / sum(1.0 / s for s in top_k_similarities)
        features["cosine_geometric_mean"] = np.exp(np.mean(np.log(top_k_similarities)))
    else:
        features["cosine_harmonic_mean"] = 0.0
        features["cosine_geometric_mean"] = 0.0

    # --- Euclidean (frequency) distance aggregate ---
    # `compute_eculidean_distance` (sic) is the name the distance function is
    # defined under in this notebook.
    euclidean_score = [compute_eculidean_distance(x, inflection, token_count, doc_count, Minf, Maxf)
                       for inflection in valid_inflections]
    # Ascending order keeps the k smallest scores, unchanged from the original.
    top_euclidean_score = sorted(euclidean_score)[:k]
    features["euclidean_mean"] = np.mean(top_euclidean_score) if top_euclidean_score else 0.0

    # --- frequency-based features ---
    tc_x, td_x = token_count.get(x, 0), doc_count.get(x, 0)
    avg_freq_x = (tc_x + td_x) / 2
    avg_freq_inflections = [(token_count.get(infl, 0) + doc_count.get(infl, 0)) / 2
                            for infl in valid_inflections]
    # valid_inflections is non-empty here, so the mean is always defined.
    features["freq_mean_inflections"] = np.mean(avg_freq_inflections)
    features["freq_mean_x"] = avg_freq_x

    return features
    