In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [2]:
from nltk.corpus import brown
brown.categories()
news_corpus = brown.sents(categories=['news'])
  

In [3]:
news_corpus

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

## Load data

In [4]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(news_corpus))) #all the words we have in the system - <UNK>

In [5]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['dog']

11947

In [6]:
vocabs.append('<UNK>')
word2index['<UNK>'] = 6

In [7]:
index2word = {v:k for k, v in word2index.items()}
index2word[5]

'on-the-scene'

## Prepare train data for Skipgram

In [8]:
#create pairs of center word, and outside word

def random_batch(batch_size, news_corpus,windows_size=2):

    skipgrams = []

    #loop each corpus
    for doc in news_corpus:
        #look from the 2nd word until second last word
        for i in range(windows_size, len(doc)-windows_size):
            #center word
            center = word2index[doc[i]]
            #outside words = 2 words
            outside = []
            for j in range(windows_size):
                outside.append(word2index[doc[i-j-1]])
                outside.append(word2index[doc[i+j+1]]) 

            #for each of these two outside words, we gonna append to a list
            for _,each_out in enumerate(outside):
                skipgrams.append([center, each_out])
                    #center, outside1;   center, outside2

                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)
            
x, y = random_batch(2, news_corpus,2)

In [9]:
x.shape  #batch_size, 1

(2, 1)

In [10]:
x

array([[7281],
       [4554]])

In [11]:
y.shape  #batch_size 1

(2, 1)

In [12]:
y

array([[ 5059],
       [12593]])

## Negative Sampling

In [13]:
z = 0.001

#count
from collections import Counter

word_count = Counter(flatten(news_corpus))
word_count

#get the total number of words
num_total_words = sum([c for w, c in word_count.items()])
num_total_words

unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)
    
Counter(unigram_table)

Counter({'the': 114,
         ',': 108,
         '.': 89,
         'of': 69,
         'and': 55,
         'to': 55,
         'a': 52,
         'in': 50,
         'for': 30,
         'that': 26,
         'The': 26,
         'was': 24,
         '``': 24,
         'is': 24,
         "''": 24,
         'on': 22,
         'at': 21,
         'be': 19,
         'with': 19,
         'as': 18,
         'by': 18,
         'he': 17,
         'his': 15,
         'said': 15,
         'will': 15,
         'from': 14,
         'it': 14,
         ';': 13,
         'are': 13,
         'an': 12,
         '--': 12,
         'had': 12,
         'has': 12,
         'have': 11,
         'who': 11,
         'this': 11,
         'Mrs.': 11,
         'were': 11,
         'not': 11,
         'would': 10,
         'which': 10,
         'their': 10,
         'they': 9,
         'He': 9,
         'been': 9,
         '(': 8,
         'Mr.': 8,
         'one': 8,
         'last': 8,
         'but': 8,
         'I': 

## Co-occurence Matrix X

In [14]:
from collections import Counter

X_i = Counter(flatten(news_corpus))

skip_grams = []

windows_size = 2

for doc in news_corpus:
    for i in range(windows_size, len(doc)-windows_size):
        center = doc[i]
        outside = []
        for j in range(windows_size):
            outside.append(doc[i-j-1])
            outside.append(doc[i+j+1]) 
        for each_out in outside:
            skip_grams.append((center, each_out))
skip_grams


[('County', 'Fulton'),
 ('County', 'Grand'),
 ('County', 'The'),
 ('County', 'Jury'),
 ('Grand', 'County'),
 ('Grand', 'Jury'),
 ('Grand', 'Fulton'),
 ('Grand', 'said'),
 ('Jury', 'Grand'),
 ('Jury', 'said'),
 ('Jury', 'County'),
 ('Jury', 'Friday'),
 ('said', 'Jury'),
 ('said', 'Friday'),
 ('said', 'Grand'),
 ('said', 'an'),
 ('Friday', 'said'),
 ('Friday', 'an'),
 ('Friday', 'Jury'),
 ('Friday', 'investigation'),
 ('an', 'Friday'),
 ('an', 'investigation'),
 ('an', 'said'),
 ('an', 'of'),
 ('investigation', 'an'),
 ('investigation', 'of'),
 ('investigation', 'Friday'),
 ('investigation', "Atlanta's"),
 ('of', 'investigation'),
 ('of', "Atlanta's"),
 ('of', 'an'),
 ('of', 'recent'),
 ("Atlanta's", 'of'),
 ("Atlanta's", 'recent'),
 ("Atlanta's", 'investigation'),
 ("Atlanta's", 'primary'),
 ('recent', "Atlanta's"),
 ('recent', 'primary'),
 ('recent', 'of'),
 ('recent', 'election'),
 ('primary', 'recent'),
 ('primary', 'election'),
 ('primary', "Atlanta's"),
 ('primary', 'produced'),
 (

In [15]:
X_ik_skipgrams = Counter(skip_grams)
X_ik_skipgrams

Counter({('the', 'of'): 1460,
         ('of', 'the'): 1443,
         ('the', ','): 870,
         (',', 'the'): 855,
         (',', ','): 809,
         ('the', 'in'): 663,
         ('in', 'the'): 659,
         (',', 'and'): 624,
         ('and', ','): 624,
         ('the', 'to'): 594,
         ('to', 'the'): 590,
         ('of', ','): 380,
         (',', 'of'): 377,
         ('and', 'the'): 357,
         ('the', '.'): 354,
         ('the', 'and'): 353,
         ('of', 'a'): 351,
         ('a', 'of'): 346,
         (',', 'a'): 324,
         ('a', ','): 319,
         ('the', 'for'): 280,
         (',', 'in'): 280,
         ('in', ','): 280,
         ('for', 'the'): 279,
         ('the', 'on'): 277,
         ('on', 'the'): 274,
         (',', 'said'): 250,
         ('the', 'that'): 224,
         ('that', 'the'): 222,
         ('to', 'a'): 221,
         ('a', 'to'): 219,
         ('the', 'at'): 210,
         (',', "''"): 209,
         ('at', 'the'): 208,
         ("''", ','): 207,
         

## Weight Function

In [16]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

from itertools import combinations_with_replacement

X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocabs, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically apple, banana = banana, apple
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## Prepare Data for GloVe

In [17]:
import math

def random_batch_glove(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

In [18]:
skip_grams[0]

('County', 'Fulton')

In [19]:
batch_size = 2
x, y, cooc, weighting = random_batch_glove(batch_size, news_corpus, skip_grams, X_ik, weighting_dic)

## Skipgram Model


In [20]:
len(vocabs)

14395

In [21]:
embedding = nn.Embedding(7, 2)

In [22]:
# x_tensor = torch.LongTensor(x)
# embedding(x_tensor).shape  #(batch_size, 1, emb_size)

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

In [23]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss
        

In [24]:
#prepare all vocabs

batch_size = 2
voc_size   = len(vocabs)

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs

tensor([[    0,     1,     2,  ..., 14392, 14393,     6],
        [    0,     1,     2,  ..., 14392, 14393,     6]])

In [25]:
model = Skipgram(voc_size, 2)
model

Skipgram(
  (embedding_center): Embedding(14395, 2)
  (embedding_outside): Embedding(14395, 2)
)

In [26]:
input_tensor = torch.LongTensor(x)
label_tensor = torch.LongTensor(y)

In [27]:
loss = model(input_tensor, label_tensor, all_vocabs)

In [28]:
loss

tensor(12.3457, grad_fn=<NegBackward0>)

## Skipgram Model (Neg Sampling)

In [29]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index['<UNK>'], seq))
    return torch.LongTensor(idxs)


import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

batch_size = 2
x, y = random_batch(batch_size, news_corpus,2)
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)

k = 5
neg_samples = negative_sampling(y_tensor, unigram_table, k)

In [30]:
class SkipgramNeg(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)

In [31]:
#test your model
emb_size = 2
voc_size = len(vocabs)
model_neg = SkipgramNeg(voc_size, emb_size)

In [32]:
loss = model_neg(x_tensor, y_tensor, neg_samples)

## GloVe Model

In [33]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) #(batch_size, 1, emb_size)
        outside_embeds = self.outside_embedding(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

In [34]:
#test our system
voc_size = len(vocabs)
emb_size = 2
model = Glove(voc_size, emb_size)

In [35]:
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
cooc_tensor = torch.FloatTensor(cooc)
weighting_tensor = torch.FloatTensor(weighting)

In [36]:
loss_glove = model(x_tensor, y_tensor, cooc_tensor, weighting_tensor)
loss_glove

tensor(12.7733, grad_fn=<SumBackward0>)

## 4. Training

In [37]:
batch_size = 2
emb_size   = 2
model      = Skipgram(voc_size, emb_size)
optimizer1  = optim.Adam(model.parameters(), lr=0.001)

In [38]:
optimizer2 = optim.Adam(model_neg.parameters(), lr=0.001)

In [39]:
num_epochs = 100

for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, news_corpus)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k)
    loss2 = model_neg(x_tensor, y_tensor, neg_samples)
    loss1 = model(input_tensor, label_tensor, all_vocabs)
    
    #backprogate
    optimizer1.zero_grad()
    loss1.backward()
    optimizer2.zero_grad()
    loss2.backward()
    
    #update alpha
    optimizer1.step()
    optimizer2.step()
    
    #print the loss
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss1:2.6f}")

Epoch     10 | Loss: 9.933471
Epoch     20 | Loss: 11.134831
Epoch     30 | Loss: 9.236036
Epoch     40 | Loss: 9.072711
Epoch     50 | Loss: 10.297705
Epoch     60 | Loss: 10.350468
Epoch     70 | Loss: 13.657674
Epoch     80 | Loss: 10.163830
Epoch     90 | Loss: 9.146379
Epoch    100 | Loss: 9.173416


## 5. Plot the embeddings

Is fruit really near to banana?
Is fruit really far from cat?

In [40]:
vocabs

['employments',
 'kicks',
 'quarter',
 '87th',
 'mourn',
 'on-the-scene',
 'hijacked',
 'knowledge',
 'Fifth',
 'Cauffman',
 'Port',
 'predicted',
 "Skipjack's",
 'yardstick',
 'beer',
 'neatly',
 'onto',
 'Increase',
 'Show',
 'unmeritorious',
 'returned',
 'sounded',
 'Shoettle',
 "University's",
 'Beardens',
 'wrinkles',
 'Pampa',
 'Eaton',
 'shots',
 'monthly',
 'adoption',
 'Sullivan',
 'grammar',
 'barrel',
 'destroy',
 'saute',
 'Leary',
 'Epsilon',
 'La',
 '8,280',
 'fouled',
 'Kililngsworth',
 'one-half',
 'they',
 'junior-senior',
 'comforting',
 'Congolese',
 'Crump',
 'Generale',
 'Decries',
 'Davis',
 '380-foot',
 'Stay',
 'Hilton',
 'Karol',
 'family-community',
 'clustered',
 'letting',
 'embroidered',
 'Hagner',
 'heirs',
 'left-centerfield',
 'Brig.',
 'Developments',
 'executives',
 'Town',
 'humor',
 '65,000',
 'prosecutions',
 "Nischwitz'",
 'lumped',
 'Audrey',
 'baseball',
 'visited',
 'candid',
 'gambling',
 'offers',
 'decadence',
 'start',
 'une',
 'branch',
 '

In [41]:
tourists = torch.LongTensor([word2index['tourists']])
# banana = torch.LongTensor([word2index['banana']])
tourists

tensor([816])

In [42]:
tourists_embed_c = model.embedding_center(tourists)
tourists_embed_o = model.embedding_outside(tourists)
tourists_embed   = (tourists_embed_c + tourists_embed_o) / 2
tourists_embed

tensor([[1.1838, 0.6507]], grad_fn=<DivBackward0>)

In [43]:
tourists_embed_o

tensor([[ 1.2080, -0.3722]], grad_fn=<EmbeddingBackward0>)

In [44]:
def get_embed(word):
    try:
        index = word2index[word]
    except:
        index = word2index['<UNK>']
        
    word = torch.LongTensor([word2index[word]])
    
    embed_c = model.embedding_center(word)
    embed_o = model.embedding_outside(word)
    embed   = (embed_c + embed_o) / 2
    
    return embed[0][0].item(), embed[0][1].item()

In [45]:
# get_embed('fruit')
get_embed('tourists')

(1.18377685546875, 0.6506659984588623)

In [46]:
get_embed('jury')

(0.03496024012565613, -0.4117969870567322)

In [47]:
get_embed('bedroom')

(0.5659295916557312, -0.3466605246067047)

In [48]:
get_embed('Gin')

(1.8028640747070312, -0.5469164252281189)

In [49]:
# x, y = get_embed(word)

In [50]:
# plt.figure(figsize=(6, 3))
# for i, word in enumerate(vocabs):
#     x, y = get_embed(word)
#     plt.scatter(x, y)
#     plt.annotate(word, xy=(x, y), xytext=(5, 2), textcoords='offset points')
# plt.show()

## 6. Cosine similarity

In [51]:
bedroom = get_embed('bedroom')
bedroom

(0.5659295916557312, -0.3466605246067047)

In [52]:
tourists = get_embed('tourists')
tourists

(1.18377685546875, 0.6506659984588623)

In [53]:
jury = get_embed('jury')
jury

(0.03496024012565613, -0.4117969870567322)

In [54]:
np.array(bedroom) @ np.array(jury)

0.16253879398304605

In [55]:
#more formally is to divide by its norm
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

print(cosine_similarity(np.array(tourists), np.array(jury)))
print(cosine_similarity(np.array(tourists), np.array(bedroom)))


-0.4058262912611116
0.49568473649786354


In [56]:
def find_best_similarlity(word,all_vocabs):
    max = -1
    similar_word = ''
    for index,each in enumerate(all_vocabs):
        similar_value = cosine_similarity(np.array(get_embed(word)),np.array(get_embed(each)))
        if similar_value > max:
            max = similar_value
            similar_word = each
    return similar_word

print(find_best_similarlity('Thailand',vocabs))


Thailand


In [57]:
f = open("word_test_semantic.txt", "r")
count_all = 0
count_correct = 0

for line in f:
    words = line.strip().split()
    a, b, c, d = words
    try:
        count_all+=1
        a_emb = np.array(get_embed(a)).flatten()
        b_emb = np.array(get_embed(b)).flatten()
        c_emb = np.array(get_embed(c)).flatten()
        temp = np.subtract(b_emb,np.add(a_emb,c_emb)).flatten()
        max = -1 
        similar_word = ''
        for word in vocabs:
            word_temp = get_embed(word)
            similar_value = cosine_similarity(temp,np.array(word_temp))
            if similar_value > max:
                max = similar_value
                similar_word = word
        if similar_word == d:
            count_correct+=1

    except:
        pass

f.close()


In [58]:
(count_correct/count_all) * 100

0.0

In [59]:
f = open("word_test_syntactic.txt", "r")
count_all = 0
count_correct = 0

for line in f:
    words = line.strip().split()
    a, b, c, d = words
    try:
        count_all+=1
        a_emb = np.array(get_embed(a)).flatten()
        b_emb = np.array(get_embed(b)).flatten()
        c_emb = np.array(get_embed(c)).flatten()
        temp = np.subtract(b_emb,np.add(a_emb,c_emb)).flatten()
        max = -1 
        similar_word = ''
        for word in vocabs:
            word_temp = get_embed(word)
            similar_value = cosine_similarity(temp,np.array(word_temp))
            if similar_value > max:
                max = similar_value
                similar_word = word
        if similar_word == d:
            count_correct+=1

    except:
        pass

f.close()


In [60]:
(count_correct/count_all) * 100

0.0

In [61]:
f = open("wordsim_similarity_goldstandard.txt", "r")
human_mean = []
model_similar = []
for line in f:
    words = line.strip().split()
    a, b, c = words
    try:
        model_similar.append(cosine_similarity(np.array(get_embed(a)),np.array(get_embed(b))))
        human_mean.append(c)
    except:
        continue
f.close()




In [62]:
len(human_mean)
len(model_similar)

105

In [63]:
from scipy.stats import spearmanr

res = spearmanr(human_mean, model_similar)
res.statistic


-0.11858594736837548

In [64]:
res.pvalue

0.2282592164969438