Code adapted from https://github.com/Garrafao/TRIPY

In [1]:
import pickle
from scipy.sparse import csr_matrix, load_npz, save_npz, linalg
from scipy.spatial.distance import cosine as cosine_distance
#from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.word2vec import PathLineSentences
from collections import defaultdict
from scipy.sparse import dok_matrix
from sklearn.random_projection import sparse_random_matrix
import numpy as np
import pandas as pd
import logging
import json
import os
import re
#from utils_ import Space
#import vocab
#import count
#import random
#import multiply

In [2]:
class Space(object):
    """
    Sparse matrix with named rows and columns, plus load/save helpers.

    After construction an instance exposes:
      * ``matrix``  - the data converted to a scipy ``csr_matrix``
      * ``rows`` / ``columns`` - lists of row and column names
      * ``row2id`` / ``id2row`` / ``column2id`` / ``id2column`` - name <-> index lookups
    """

    def __init__(self, path=None, matrix=None, rows=None, columns=None, format='npz'):
        """
        Can be either initialized (i) by providing a path, (ii) by providing a matrix, rows and columns, or (iii) by providing neither, then an empty instance is created
        `path` should be path to a matrix in npz format, expects rows and columns in same folder at '[path]_rows' and '[path]_columns'
        `matrix` anything accepted by `csr_matrix`; defaults to an empty matrix
        `rows` list with row names; defaults to an empty list
        `columns` list with column names; defaults to an empty list
        `format` format of matrix, can be either of 'npz' or 'w2v'

        An unknown `format` is logged as an error and the instance is built from
        the `matrix`, `rows` and `columns` arguments instead.
        """

        # Fresh objects per call: mutable defaults in the signature would be
        # shared between every instance created without these arguments.
        if matrix is None:
            matrix = csr_matrix([])
        if rows is None:
            rows = []
        if columns is None:
            columns = []

        if path is not None:
            if format == 'npz':
                # Load matrix
                matrix = load_npz(path)
                # NOTE: pickle.load executes arbitrary code from the file;
                # only load row/column files from trusted sources.
                # Load rows
                with open(path + '_rows', 'rb') as f:
                    rows = pickle.load(f)
                # Load columns
                with open(path + '_columns', 'rb') as f:
                    columns = pickle.load(f)
            elif format == 'w2v':
                # First line is the "<rows> <dims>" header and is skipped;
                # ndmin=2 keeps a single-vector file two-dimensional.
                matrix_array = np.loadtxt(path, dtype=object, delimiter=' ', skiprows=1, encoding='utf-8', ndmin=2)
                # `np.float` no longer exists in NumPy; the builtin is equivalent.
                matrix = matrix_array[:, 1:].astype(float)
                rows = list(matrix_array[:, 0].flatten())
                columns = []
            else:
                message = "Matrix format {0} unknown."
                logging.error(message.format(format))

        self.matrix = csr_matrix(matrix)
        self.rows = rows
        self.columns = columns
        self.row2id = {r: i for i, r in enumerate(rows)}
        self.id2row = {i: r for i, r in enumerate(rows)}
        self.column2id = {c: i for i, c in enumerate(columns)}
        self.id2column = {i: c for i, c in enumerate(columns)}

    def save(self, path, format='npz'):
        """
        `path` saves matrix at path in npz format, saves rows and columns as pickled lists in same folder at '[path]_rows' and '[path]_columns'
        `format` format of matrix, can be either of 'npz' or 'w2v'

        For 'w2v' a text file is written: a "<rows> <dims>" header line followed
        by one "<row name> <values...>" line per row. An unknown `format` is
        logged as an error and nothing is written.
        """

        if format == 'npz':
            # Save matrix (writing through a file handle keeps `path` exactly as given)
            with open(path, 'wb') as f:
                save_npz(f, self.matrix)
            # Save rows
            with open(path + '_rows', 'wb') as f:
                pickle.dump(self.rows, f)
            # Save columns
            with open(path + '_columns', 'wb') as f:
                pickle.dump(self.columns, f)
        elif format == 'w2v':
            matrix = self.matrix.toarray().astype(object)
            rows = np.array(self.rows)
            r, d = matrix.shape
            rows = rows.reshape(-1, 1)
            # Row names become the first column of every output line
            matrix = np.concatenate((rows, matrix), axis=1)
            np.savetxt(path, matrix, fmt=["%s"] + ['%.16g',]*d, delimiter=' ', newline='\n', header='%d %d' %(r, d), comments='', encoding='utf-8')
        else:
            message = "Matrix format {0} unknown."
            logging.error(message.format(format))

    def l2_normalize(self):
        '''
        L2-normalize all vectors in the matrix.
        '''
        l2norm = linalg.norm(self.matrix, axis=1, ord=2)
        l2norm[l2norm==0.0] = 1.0 # Convert 0 values to 1 so all-zero rows are left unchanged
        self.matrix = csr_matrix(self.matrix/l2norm.reshape(len(l2norm),1))

    def mean_center(self):
        '''
        Mean center all columns in the matrix.
        '''
        avg = np.mean(self.matrix, axis = 0)
        self.matrix = csr_matrix(self.matrix - avg)

In [3]:
## Get vocabulary
sentences = PathLineSentences("../data_week_text/russia/covid_russia_week11_2020-4-3.txt")
# Collect the distinct tokens of every sentence longer than one word, in sorted order.
# One-word sentences are skipped to avoid zero-vectors.
vocabulary = sorted({word for sentence in sentences if len(sentence) > 1 for word in sentence})
       

In [3]:
## Count co-occurrences within a window
def get_count_matrix(sentences, vocabulary, windowSize = 7):
    """
    Build a word-by-word co-occurrence count matrix.

    `sentences` iterable of token lists
    `vocabulary` list of words; a word's position is its row and column index
    `windowSize` number of tokens considered on each side of the target word

    Returns a square float `csr_matrix` with one row and one column per
    vocabulary entry. Entry (i, j) is how often word j appeared within
    `windowSize` tokens of word i. Tokens missing from `vocabulary` are
    ignored, both as targets and as context words.
    """
    w2i = {w: i for i, w in enumerate(vocabulary)}

    # (target index, context index) -> count
    cooc_mat = defaultdict(int)

    for sentence in sentences:
        for i, word in enumerate(sentence):
            windex = w2i.get(word)
            if windex is None:
                continue
            lowerWindowSize = max(i-windowSize, 0)
            upperWindowSize = min(i+windowSize, len(sentence))
            # Up to `windowSize` tokens on either side, excluding the target itself;
            # one-word sentences give an empty window and contribute nothing.
            window = sentence[lowerWindowSize:i] + sentence[i+1:upperWindowSize+1]
            for contextWord in window:
                cindex = w2i.get(contextWord)
                if cindex is None:
                    continue
                cooc_mat[(windex, cindex)] += 1

    # Convert dictionary to sparse matrix via COO triplets; this avoids the
    # dict-style dok_matrix.update()/_update() calls, which are not available
    # in every SciPy version.
    size = len(vocabulary)
    n_entries = len(cooc_mat)
    row_idx = np.fromiter((key[0] for key in cooc_mat), dtype=np.intp, count=n_entries)
    col_idx = np.fromiter((key[1] for key in cooc_mat), dtype=np.intp, count=n_entries)
    counts = np.fromiter(cooc_mat.values(), dtype=float, count=n_entries)

    countMatrix = csr_matrix((counts, (row_idx, col_idx)), shape=(size, size), dtype=float)
    return(countMatrix)

In [4]:
## Generate random matrix
def get_random_matrix(vocabulary, dim = 100, random_state = None):
    """
    Draw a sparse random projection matrix with one row per vocabulary word.

    `vocabulary` list of words; only its length is used
    `dim` dimension of the word embedding (number of columns)
    `random_state` optional seed / RandomState forwarded to
        `sparse_random_matrix`; pass a fixed value for reproducible embeddings.
        The default (None) keeps the previous unseeded behaviour.

    Returns a `csr_matrix` of shape (len(vocabulary), dim).
    """
    # sparse_random_matrix yields (dim, n_words); transpose so that rows are words
    projection = sparse_random_matrix(dim, len(vocabulary), random_state=random_state)
    randomMatrix = csr_matrix(projection.toarray().T)
    return(randomMatrix)

In [22]:
# NOTE(review): the argument of this call appears to be missing; as written, len() raises TypeError.
len()

16322

In [23]:
# Number of distinct words in the vocabulary
len(vocabulary)

16322

In [9]:
## Get embedding
def get_embedding_space(countMatrix, randomMatrix, l2 = True, mc = True, rows = None):
    """
    Project a co-occurrence matrix into a lower-dimensional embedding space.

    `countMatrix` word-by-word co-occurrence counts
    `randomMatrix` projection matrix with one row per word
    `l2` if True, L2-normalize every embedding vector
    `mc` if True, mean-center every embedding column (applied after `l2`)
    `rows` row names for the resulting space; when omitted, the module-level
        `vocabulary` is used, as before

    Returns a `Space` whose matrix is countMatrix x randomMatrix.
    """
    if rows is None:
        # Backward-compatible fallback to the notebook-level variable
        rows = vocabulary
    # `@` is the supported matrix product for SciPy sparse matrices; np.dot is
    # not guaranteed to handle them correctly.
    embedding = countMatrix @ randomMatrix
    embeddingSpace = Space(matrix=embedding, rows=rows, columns=[])
    if l2:
        embeddingSpace.l2_normalize()
    if mc:
        embeddingSpace.mean_center()
    return(embeddingSpace)

In [14]:
## Get similarity
def get_simi(embeddingSpace, wd1, wd2, negative = False):
    """
    Cosine-based similarity between the embeddings of two words.

    `embeddingSpace` object providing `matrix` (sparse, one row per word) and `row2id`
    `wd1`, `wd2` the two words to compare
    `negative` if True return the plain cosine similarity (1 - cosine distance);
        otherwise return 1 - (cosine distance / 2)

    Returns the similarity score, or the sentinel value 9 when either word
    is not present in `row2id`.
    """
    ids = embeddingSpace.row2id
    vectors = embeddingSpace.matrix
    try:
        vec_a = vectors[ids[wd1]].toarray().ravel()
        vec_b = vectors[ids[wd2]].toarray().ravel()
    except KeyError:
        # At least one of the words has no embedding
        return(9)

    distance = cosine_distance(vec_a, vec_b)
    if negative:
        return(1-distance)
    return(1-(distance/2))

In [12]:

# Run the whole pipeline once on the vocabulary built above: counts -> random projection -> embedding
countMatrix = get_count_matrix(sentences, vocabulary, windowSize=7)
randomMatrix = get_random_matrix(vocabulary, dim=100)
embeddingSpace = get_embedding_space(countMatrix, randomMatrix, l2=True, mc=True)
# Compare a keyword against a nonsense token
simi_score = get_simi(embeddingSpace, "covid19", "sdifjsiodjfiosdj")
simi_score

9

In [11]:
## load all keywords pairs
# Read the whole JSON document, then parse it into the list of keyword pairs
with open('../data/all_keywords_pairs.json', 'r') as json_file:
    all_pairs = json.loads(json_file.read())

## Russian Data

In [16]:
# Weekly corpus files in name order
files = sorted(os.listdir('../data_week_text/russia'))
files

['covid_russia_week01_2020-01-24.txt',
 'covid_russia_week02_2020-01-31.txt',
 'covid_russia_week03_2020-02-07.txt',
 'covid_russia_week04_2020-02-14.txt',
 'covid_russia_week05_2020-02-21.txt',
 'covid_russia_week06_2020-02-28.txt',
 'covid_russia_week07_2020-03-06.txt',
 'covid_russia_week08_2020-03-13.txt',
 'covid_russia_week09_2020-03-20.txt',
 'covid_russia_week10_2020-03-27.txt',
 'covid_russia_week11_2020-04-03.txt',
 'covid_russia_week12_2020-04-10.txt',
 'covid_russia_week13_2020-04-17.txt',
 'covid_russia_week14_2020-04-24.txt',
 'covid_russia_week15_2020-05-01.txt',
 'covid_russia_week16_2020-05-08.txt',
 'covid_russia_week17_2020-05-15.txt',
 'covid_russia_week18_2020-05-22.txt',
 'covid_russia_week19_2020-05-29.txt',
 'covid_russia_week20_2020-06-05.txt',
 'covid_russia_week21_2020-06-12.txt',
 'covid_russia_week22_2020-06-19.txt',
 'covid_russia_week23_2020-06-26.txt',
 'covid_russia_week24_2020-07-03.txt',
 'covid_russia_week25_2020-07-10.txt',
 'covid_russia_week26_202

In [17]:
## generate embedding spaces with all data
# One list of pair similarities per weekly file, in the order of `files`
results = []
for file in files:
    print(file)
    ## RI embedding
    sentences = PathLineSentences('../data_week_text/russia/'+file)
    # Skip one-word sentences to avoid zero-vectors
    vocabulary = sorted({word for sentence in sentences if len(sentence)>1 for word in sentence})
    countMatrix = get_count_matrix(sentences = sentences, vocabulary = vocabulary, windowSize = 7)
    randomMatrix = get_random_matrix(dim = 100, vocabulary = vocabulary)
    embeddingSpace = get_embedding_space(countMatrix = countMatrix, randomMatrix = randomMatrix, l2 = True, mc = True)
    # Raw, end-anchored pattern: strips only a trailing ".txt" and avoids the
    # invalid "\." escape of a normal string literal.
    embeddingSpace.save('../embedding/RI/russia/'+re.sub(r"\.txt$", "", file)+'.npz') # save embedding
    ## Calculate similarity for every keyword pair
    sm = [get_simi(embeddingSpace = embeddingSpace, wd1 = pair[0], wd2 = pair[1]) for pair in all_pairs]
    results.append(sm)

covid_russia_week01_2020-01-24.txt
covid_russia_week02_2020-01-31.txt
covid_russia_week03_2020-02-07.txt
covid_russia_week04_2020-02-14.txt
covid_russia_week05_2020-02-21.txt
covid_russia_week06_2020-02-28.txt
covid_russia_week07_2020-03-06.txt
covid_russia_week08_2020-03-13.txt
covid_russia_week09_2020-03-20.txt
covid_russia_week10_2020-03-27.txt
covid_russia_week11_2020-04-03.txt
covid_russia_week12_2020-04-10.txt
covid_russia_week13_2020-04-17.txt
covid_russia_week14_2020-04-24.txt
covid_russia_week15_2020-05-01.txt
covid_russia_week16_2020-05-08.txt
covid_russia_week17_2020-05-15.txt
covid_russia_week18_2020-05-22.txt
covid_russia_week19_2020-05-29.txt
covid_russia_week20_2020-06-05.txt
covid_russia_week21_2020-06-12.txt
covid_russia_week22_2020-06-19.txt
covid_russia_week23_2020-06-26.txt
covid_russia_week24_2020-07-03.txt
covid_russia_week25_2020-07-10.txt
covid_russia_week26_2020-07-17.txt
covid_russia_week27_2020-07-24.txt
covid_russia_week28_2020-07-31.txt
covid_russia_week29_

In [18]:
# Put the keyword pairs next to their weekly scores (one column per week) and export
pair_names = pd.DataFrame(all_pairs)
rs_df = pd.DataFrame(results)
rs_df2 = pd.concat([pair_names, rs_df.transpose()], axis=1)
rs_df2.to_csv("../results/simi_russia01-52_RI_non-neg.csv", index=False)

## USA data

In [19]:
# Weekly corpus files in name order
files = sorted(os.listdir('../data_week_text/usa'))
files

['covid_usa_week01_2020-01-24.txt',
 'covid_usa_week02_2020-01-31.txt',
 'covid_usa_week03_2020-02-07.txt',
 'covid_usa_week04_2020-02-14.txt',
 'covid_usa_week05_2020-02-21.txt',
 'covid_usa_week06_2020-02-28.txt',
 'covid_usa_week07_2020-03-06.txt',
 'covid_usa_week08_2020-03-13.txt',
 'covid_usa_week09_2020-03-20.txt',
 'covid_usa_week10_2020-03-27.txt',
 'covid_usa_week11_2020-04-03.txt',
 'covid_usa_week12_2020-04-10.txt',
 'covid_usa_week13_2020-04-17.txt',
 'covid_usa_week14_2020-04-24.txt',
 'covid_usa_week15_2020-05-01.txt',
 'covid_usa_week16_2020-05-08.txt',
 'covid_usa_week17_2020-05-15.txt',
 'covid_usa_week18_2020-05-22.txt',
 'covid_usa_week19_2020-05-29.txt',
 'covid_usa_week20_2020-06-05.txt',
 'covid_usa_week21_2020-06-12.txt',
 'covid_usa_week22_2020-06-19.txt',
 'covid_usa_week23_2020-06-26.txt',
 'covid_usa_week24_2020-07-03.txt',
 'covid_usa_week25_2020-07-10.txt',
 'covid_usa_week26_2020-07-17.txt',
 'covid_usa_week27_2020-07-24.txt',
 'covid_usa_week28_2020-07-3

In [20]:
## generate embedding spaces with all data
results = []
for file in files:
    print(file)
    ## RI embedding
    sentences = PathLineSentences('../data_week_text/usa/'+file)
    vocabulary = sorted(list(set([word for sentence in sentences for word in sentence if len(sentence)>1]))) # Skip one-word sentences to avoid zero-vectors
    countMatrix = get_count_matrix(sentences = sentences, vocabulary = vocabulary, windowSize = 7)
    randomMatrix = get_random_matrix(dim = 100, vocabulary = vocabulary)
    embeddingSpace = get_embedding_space(countMatrix = countMatrix, randomMatrix = randomMatrix, l2 = True, mc = True)
    embeddingSpace.save('../embedding/RI/usa/'+re.sub("\.txt", "",file)+'.npz') # save embedding
    ## Calculate similarity
    sm = []
    for pair in all_pairs:
        #print(all_pairs[j])
        sm.append(get_simi(embeddingSpace = embeddingSpace, wd1 = pair[0], wd2 = pair[1]))
    results.append(sm) 

covid_usa_week01_2020-01-24.txt
covid_usa_week02_2020-01-31.txt
covid_usa_week03_2020-02-07.txt
covid_usa_week04_2020-02-14.txt
covid_usa_week05_2020-02-21.txt
covid_usa_week06_2020-02-28.txt
covid_usa_week07_2020-03-06.txt
covid_usa_week08_2020-03-13.txt
covid_usa_week09_2020-03-20.txt
covid_usa_week10_2020-03-27.txt
covid_usa_week11_2020-04-03.txt
covid_usa_week12_2020-04-10.txt
covid_usa_week13_2020-04-17.txt
covid_usa_week14_2020-04-24.txt
covid_usa_week15_2020-05-01.txt
covid_usa_week16_2020-05-08.txt
covid_usa_week17_2020-05-15.txt
covid_usa_week18_2020-05-22.txt
covid_usa_week19_2020-05-29.txt
covid_usa_week20_2020-06-05.txt
covid_usa_week21_2020-06-12.txt
covid_usa_week22_2020-06-19.txt
covid_usa_week23_2020-06-26.txt
covid_usa_week24_2020-07-03.txt
covid_usa_week25_2020-07-10.txt
covid_usa_week26_2020-07-17.txt
covid_usa_week27_2020-07-24.txt
covid_usa_week28_2020-07-31.txt
covid_usa_week29_2020-08-07.txt
covid_usa_week30_2020-08-14.txt
covid_usa_week31_2020-08-21.txt
covid_us

In [21]:
# Put the keyword pairs next to their weekly scores (one column per week) and export
pair_names = pd.DataFrame(all_pairs)
rs_df = pd.DataFrame(results)
rs_df2 = pd.concat([pair_names, rs_df.transpose()], axis=1)
rs_df2.to_csv("../results/simi_usa01-52_RI_non-neg.csv", index=False)