In [40]:
import numpy as np
from w2v_utils import *

Using TensorFlow backend.


In [41]:
words, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [42]:
def cosine_similarity(u, v):
    """
    Cosine similarity reflects the degree of similariy between u and v
        
    Arguments:
        u -- a word vector of shape (n,)          
        v -- a word vector of shape (n,)

    Returns:
        cosine_similarity -- the cosine similarity between u and v defined by the formula above.
    """
    
    distance = 0.0
    
    ### START CODE HERE ###
    # Compute the dot product between u and v (≈1 line)
    dot = np.dot(u,v)
    # Compute the L2 norm of u (≈1 line)
    norm_u = np.sqrt(np.sum(u**2))
    
    # Compute the L2 norm of v (≈1 line)
    norm_v = np.sqrt(np.sum(v**2))
    # Compute the cosine similarity defined by formula (1) (≈1 line)
    cosine_similarity = dot / (norm_u * norm_v)
    ### END CODE HERE ###
    
    return cosine_similarity

In [43]:
def read_trigger_dic(file ):
    """
    read trigger dictionary 
    
    Arguments:
        file -str : address of trigger dictionary
    
    Returns:
        triggerDic -dicitonary : trigger dictionary with block name as key word 
        
    """

            
    
    triggerDic = {}
    with open(file , "r" , encoding = "utf-8")as f:
        lines = f.readlines()
        verbPattern =False
        blockWords = []
        blockHead = False
        for line in lines :
            if not verbPattern  :
                if line.startswith("####### VERB PATTERNS #######"):
                    verbPattern = True
                else : 
                    continue

            if line.startswith("---"):
                if blockHead and blockWords:
                    triggerDic[blockHead] = blockWords
                blockHead = line;
                blockWords = []
            elif line.startswith("-") or line.startswith("/n") or len(line)<=1 or "+" in line:
                continue
            else :
                if "{"  in line :
                    line = line.split("{")[0].strip()
                elif "#" in line : 
                    line = line.split("#")[0].strip()
                elif "[" in line :
                     line = line.split("[")[0].strip()
                else : 
                    line = line.strip()
                blockWords.append(line.lower())    
                
                
    return triggerDic
                
        

In [44]:
triggerDic = read_trigger_dic(".\CAMEO.txt")

In [45]:
def block_similarity(triggerDic,word_to_vec_map):
    """
    compute trigger block similarity
    
    Arguments:
        triggerDic -dic : trigger word info , key as the blockhead while value as the list of verb 
        word_to_vec_map -dic : 
    Return:
        blockSimDic - dic : {MaxSim:MaxListInfo , MinSim:MinListInfo , AverageSim:AverageInfo , Details : detailInfo}
        wordPairDic - dic : {MaxSim:MaxListInfo , MinSim:MinListInfo , AverageSim:AverageInfo , Details : detailInfo}
    """
    blockSimDic = {}
    wordPairDic = {}
    wordPairMax = -1
    wordPairMin = 1
    wordPairList = []
    blockMax =-1
    blockMin = 1 
    blockList = []
    for key in triggerDic.keys():
        simList = []
        verbList = triggerDic[key]
        verbList = list(set(verbList))
        if len(verbList) > 1 :
            for i in range(len(verbList)):
                    for j in range(i+1 , len(verbList)):
                        if verbList[i] in word_to_vec_map and verbList[j] in word_to_vec_map:
                            wordPairSim = cosine_similarity(word_to_vec_map[verbList[i]] , word_to_vec_map[verbList[j]]) 
                            wordPairList.append(wordPairSim)
                            simList.append(wordPairSim)
                            if wordPairSim > wordPairMax:
                                wordPairMax = wordPairSim
                                wordPairMaxInfo = {"blockkey" : key , "WordPair" : [verbList[i] , verbList[j]] , "sim" : wordPairMax}
                            if wordPairSim < wordPairMin:
                                wordPairMin = wordPairSim
                                wordPairMinInfo = {"blockkey" : key , "WordPair" : [verbList[i] , verbList[j]] ,"sim" : wordPairMin } 
                        else:
                            continue

        else:
            continue
        if simList:
            
            blockAve = np.average(np.array(simList)) 
            blockList.append(blockAve)
        
        if blockAve > blockMax:
            blockMax = blockAve
            blockMaxInfo =  {"blockkey" : key ,  "sim" : blockMax}
        if blockAve < blockMin:
            blockMin = blockAve
            blockMinInfo =  {"blockkey" : key ,  "sim" : blockMin}    

    wordPairSim = {"MaxSim" : wordPairMaxInfo , "MinSim" :wordPairMinInfo ,"AverageSim": np.average(np.array(wordPairList)) ,"Details" : wordPairList  }
    blockSim = {"MaxSim" : blockMaxInfo , "MinSim" :blockMinInfo ,"AverageSim": np.average(np.array(blockList)) ,"Details" : blockList  }
    
        
                    
    return   wordPairSim , blockSim         

In [46]:
wordPairSim , blockSim = block_similarity(triggerDic , word_to_vec_map) 
print(wordPairSim)
print(blockSim)

{'MaxSim': {'blockkey': '---  DELAY   [120]---  ---\n', 'WordPair': ['impede', 'hinder'], 'sim': 0.95998263672691753}, 'MinSim': {'blockkey': '---  DISTURB   [140]  ---\n', 'WordPair': ['discomfit', 'harry'], 'sim': -0.45673986571055397}, 'AverageSim': 0.35634237480152453, 'Details': [0.25315941811678566, 0.21560578392551882, 0.62307556887426863, -0.032573333618307797, 0.36715186237357583, -0.03514223765428641, -0.044439623417829817, 0.44414550179317452, 0.36971731260375185, 0.29821481210402151, 0.62979797961559947, 0.51443586604855462, 0.37761739456919824, 0.56153012623862009, 0.46087532443191648, 0.37788353181350948, 0.68694471381646716, 0.65107106397117465, 0.16310593872764928, 0.53300699997765522, 0.12944158674933481, 0.25548482089550234, 0.14008311357776734, 0.41493217355102174, 0.3500455482629094, 0.24307812896726, 0.17539833714300812, 0.68426655449257967, 0.57853526131374944, 0.70130509555730136, 0.57929444716753864, 0.35175241762001275, 0.70583245725045229, 0.53858745025656796,

In [50]:
import numpy as np
def getSimDistribution(simList):
    simList = np.array(simList)*10
    simMax = np.floor(np.max(simList))
    simMin = np.floor(np.min(simList))
    simList = np.sort(simList)
    Distribution = {}
    for i in range(int(simMin),int(simMax)+1 ):
        count = 0
        for sim in simList :
            if i<=sim and i+1 > sim :    
                count  = count +1   
        if count > 0:    
            Distribution[str(i/10)+"_"+str((i+1)/10)] = count        
                
    return Distribution
    
    
    

In [53]:
simList = [-0.746, 4.6, 9.4, 7.447, 10.455, 11.555]
simList = blockSim["Details"]
print(len(simList))
print(getSimDistribution(simList))

simList = wordPairSim["Details"]
print(len(simList))
print(getSimDistribution(simList))


262
{'-0.1_0.0': 2, '0.0_0.1': 4, '0.1_0.2': 10, '0.2_0.3': 42, '0.3_0.4': 70, '0.4_0.5': 57, '0.5_0.6': 35, '0.6_0.7': 16, '0.7_0.8': 15, '0.8_0.9': 9, '0.9_1.0': 2}
6605
{'-0.5_-0.4': 2, '-0.4_-0.3': 14, '-0.3_-0.2': 62, '-0.2_-0.1': 141, '-0.1_0.0': 300, '0.0_0.1': 449, '0.1_0.2': 660, '0.2_0.3': 934, '0.3_0.4': 1099, '0.4_0.5': 1017, '0.5_0.6': 925, '0.6_0.7': 600, '0.7_0.8': 282, '0.8_0.9': 109, '0.9_1.0': 11}
