In [1]:
import os
import re
import pickle
import json
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.utils import lemmatize
from sklearn.decomposition import PCA
from matplotlib import pyplot
import numpy as np
import evaluate
import scipy

## Word2Vec-SkipGram

### Load Corpus And Preprocessing

In [2]:
def remove_punctuations(row):
    """
    Remove common punctuations from the row
    
    :param row: a string without '\n'
    :return: original input with common punctuations removed
    """
    return re.sub("""[,.!'":;`^~\-<>]""", '', row)

def load_data_tolist(lemma = False):
    """
    A generator return a preprocessed line of sentence
    the yielded result is a list contains the tokens in each sentence
    """
    with open("./data/brown.txt") as corpus:
        while corpus:
            line = corpus.readline()
            # end of file
            if not line:
                break
            # 1. get rid of punctuations
            line = remove_punctuations(line)
            # 2. lower
            line = line.lower()
            # 3. if lemmatize
            if lemma:
                line = [lem.decode('utf-8').split('/')[0] for lem in lemmatize(line, min_length=1)]
            else:
                line = line.split()
            yield line


In [3]:
"""
Write lines to local disk to boost performance
"""
def write_to_file(output_fname, row_iterator):
    with open(output_fname, "w", encoding ="utf-8") as output_file:
        for row_list in row_iterator:
            row = " ".join(row_list)
            output_file.write(f"""{row}\n""")

def write_preprocessed_corpus():
#     print("Writing lemmatized corpus....")
#     lemmatized_corpus = load_data_tolist(True)
#     write_to_file("./data/lemmatized_brown.txt", lemmatized_corpus)
    
    print("Writing unlemmatized corpus....")
    unlemmatized_corpus = load_data_tolist(False)
    write_to_file("./data/unlemmatized_brown.txt", unlemmatized_corpus)
    print("Done!")
    
# write_preprocessed_corpus()

Writing lemmatized corpus....
Writing unlemmatized corpus....
Done!


## Training Word2Vec Models

In [4]:
def write_word2vec_model_tofile(model_input_file, verbose = True):
    """
    Training word2vec models with all parameter combinations
    and write these result to files in 'model' folder
    
    """
    # use un lemmatized corpus to train the model
    

    windows = (2,5,10)
    dimensions = (100,300,1000)
    negatives = (1,5,15)
    for w in windows:
        for dim in dimensions:
            for n in negatives:
                model = Word2Vec(corpus_file = model_input_file, workers=6, sg = 1, min_count= 5,
                     window= w, size = dim, negative = n)
                model_output_filename = f"model/w{w}s{dim}n{n}-word2vec.bin"
                if verbose:
                    print(f"Writing {model_output_filename}...")
                model.wv.save_word2vec_format(model_output_filename, binary= True)
    if verbose:
        print("Done!")

In [5]:
model_input_file = "./data/unlemmatized_brown.txt"
write_word2vec_model_tofile(model_input_file)

Writing model/w2s100n1-word2vec.bin...
Writing model/w2s100n5-word2vec.bin...
Writing model/w2s100n15-word2vec.bin...
Writing model/w2s300n1-word2vec.bin...
Writing model/w2s300n5-word2vec.bin...
Writing model/w2s300n15-word2vec.bin...
Writing model/w2s1000n1-word2vec.bin...
Writing model/w2s1000n5-word2vec.bin...
Writing model/w2s1000n15-word2vec.bin...
Writing model/w5s100n1-word2vec.bin...
Writing model/w5s100n5-word2vec.bin...
Writing model/w5s100n15-word2vec.bin...
Writing model/w5s300n1-word2vec.bin...
Writing model/w5s300n5-word2vec.bin...
Writing model/w5s300n15-word2vec.bin...
Writing model/w5s1000n1-word2vec.bin...
Writing model/w5s1000n5-word2vec.bin...
Writing model/w5s1000n15-word2vec.bin...
Writing model/w10s100n1-word2vec.bin...
Writing model/w10s100n5-word2vec.bin...
Writing model/w10s100n15-word2vec.bin...
Writing model/w10s300n1-word2vec.bin...
Writing model/w10s300n5-word2vec.bin...
Writing model/w10s300n15-word2vec.bin...
Writing model/w10s1000n1-word2vec.bin...
Wri

## Word2Vec Model Evaluation

In [6]:
def eval_word2vec_models(verbose = True):
    """
    Evaluating word2vec models and return all results in a dict
    """
    windows = (2,5,10)
    dimensions = (100,300,1000)
    negatives = (1,5,15)
    evaluation_results = {}
    for w in windows:
        for d in dimensions:
            for n in negatives:
                key = f"w{w}d{d}n{n}"
                model_output_filename = f"model/w{w}s{d}n{n}-word2vec.bin"
                if verbose:
                    print(f"Evaluating model {key}")
                evaluation_results[key] = evaluate.evaluate_models([model_output_filename], verbose=False)
    if verbose:
        print("Done!")
    return evaluation_results

In [7]:
"""
Evaluate
"""
w2v_eval_results = eval_word2vec_models()

Evaluating model w2d100n1
Evaluating model w2d100n5
Evaluating model w2d100n15
Evaluating model w2d300n1
Evaluating model w2d300n5
Evaluating model w2d300n15
Evaluating model w2d1000n1
Evaluating model w2d1000n5
Evaluating model w2d1000n15
Evaluating model w5d100n1
Evaluating model w5d100n5
Evaluating model w5d100n15
Evaluating model w5d300n1
Evaluating model w5d300n5
Evaluating model w5d300n15
Evaluating model w5d1000n1
Evaluating model w5d1000n5
Evaluating model w5d1000n15
Evaluating model w10d100n1
Evaluating model w10d100n5
Evaluating model w10d100n15
Evaluating model w10d300n1
Evaluating model w10d300n5
Evaluating model w10d300n15
Evaluating model w10d1000n1
Evaluating model w10d1000n5
Evaluating model w10d1000n15
Done!


In [17]:
def print_eval_results(eval_results):
    """
    Print the evaluation result for each model.
    The metrics include
    1. correlation on WordSim
    2. BATS: male-female
    3. BATS: name-nationality
    4. BATS: things-color
    5. accuracy on MSR
    """
    for model in eval_results:
        result = eval_results[model][0]
        print(f"""
        Model:{model}
        WordSIM: {result['wordsim'].correlation}\t MSR: {result['msr']}
        BAT1: {result['bats']['E10 [male - female]']}\t BAT2: {result['bats']['E04 [name - nationality]']}\t, BAT3: {result['bats']['E09 [things - color]']}
        ==============================================
        """)


In [18]:
print_eval_results(w2v_eval_results)


        Model:w2d100n1
        WordSIM: 0.028208350592355845	 MSR: 0.672463768115942
        BAT1: 0.10526315789473684	 BAT2: 0.0	, BAT3: 0.15384615384615385
        

        Model:w2d100n5
        WordSIM: 0.2121214994403418	 MSR: 0.672463768115942
        BAT1: 0.10526315789473684	 BAT2: 0.0	, BAT3: 0.38461538461538464
        

        Model:w2d100n15
        WordSIM: 0.2380593677457007	 MSR: 0.6718840579710145
        BAT1: 0.2631578947368421	 BAT2: 0.05263157894736842	, BAT3: 0.15384615384615385
        

        Model:w2d300n1
        WordSIM: 0.023532043805981926	 MSR: 0.6666666666666666
        BAT1: 0.10526315789473684	 BAT2: 0.0	, BAT3: 0.15384615384615385
        

        Model:w2d300n5
        WordSIM: 0.21096234006103473	 MSR: 0.6718840579710145
        BAT1: 0.15789473684210525	 BAT2: 0.0	, BAT3: 0.19230769230769232
        

        Model:w2d300n15
        WordSIM: 0.23182137517251003	 MSR: 0.6759420289855073
        BAT1: 0.10526315789473684	 BAT2: 0.0	, BAT3: 0.2692

## SVD

### Creating Co-occurence Matrix

In [19]:
def create_co_occurence_matrix(corpus_filename, token_to_index, window):
    """
    Create co-occurence matrix using corpus from 'corpus_filename'
    
    :param corpus_filename: corpus file
    :param token_to_index: dict, mapping token to index, which is used to set the meaning of row/col
    :param window: window size of the co-occurence matrix
    
    :return: the created co-occurence matrix
    """
    word_len = len(token_to_index)
    co_occurence_mat = scipy.sparse.lil_matrix((word_len,word_len))
    
    with open(corpus_filename) as f:
        for line in f:
            line = line.split()
            for i, word in enumerate(line):
                for j in range(max(i-window,0),min(i + window + 1,len(line))):
                    # skip the word itself
                    if j == i: continue

                    co_occurence_mat[token_to_index[word],token_to_index[line[j]]]+=1

    return co_occurence_mat


In [20]:
"""
Get all tokens and build the transform dict
"""
model_input_file = "./data/unlemmatized_brown.txt"


with open(model_input_file) as f:
    corpus = f.read()
    tokens = set(corpus.split())

token_to_index = {}
for i, tok in enumerate(tokens):
    token_to_index[tok] = i
    
# save to file
token_to_index_file = "data/word_to_index.pkl"
with open(token_to_index_file, "wb") as f:
    pickle.dump(json.dumps(token_to_index), f)

In [21]:
"""
Load the token-index mapping back from disk
"""
with open(token_to_index_file, "rb") as f:
    token_to_index = json.loads(pickle.load(f))

In [23]:
"""
Calculate co-occurence matrix
"""
# w2coocurence_mat = create_co_occurence_matrix(model_input_file, token_to_index, 2)
# w5coocurence_mat = create_co_occurence_matrix(model_input_file, token_to_index, 5)
# w10coocurence_mat = create_co_occurence_matrix(model_input_file, token_to_index, 10)





In [24]:
"""
Save co-occurence matrix
"""
# scipy.sparse.save_npz("data/w2coocurence_mat.npz", scipy.sparse.coo_matrix(w2coocurence_mat) )
# scipy.sparse.save_npz("data/w5coocurence_mat.npz", scipy.sparse.coo_matrix(w5coocurence_mat) )
# scipy.sparse.save_npz("data/w10coocurence_mat.npz", scipy.sparse.coo_matrix(w10coocurence_mat) )





In [25]:
"""
Load back the co-occurence matrix
"""
w2coocurence_mat = scipy.sparse.load_npz("data/w2coocurence_mat.npz")
w5coocurence_mat = scipy.sparse.load_npz("data/w5coocurence_mat.npz")
w10coocurence_mat = scipy.sparse.load_npz("data/w10coocurence_mat.npz")

### Calculating PPMI Matrix

In [26]:
def calculate_ppmi_matrix(cooccurence_matrix, verbose = True):
    """
    Calculate PPMI matrix based on the input co-occurence matrix
    
    :return: PPMI matrix
    """
    ppmi_matrix = cooccurence_matrix.tolil()
    ppmi_row_matrix = cooccurence_matrix.tocsr()
    ppmi_col_matrix = cooccurence_matrix.tocsc()
    total = np.sum(ppmi_matrix)
    # memorizing col sum and row sum
    rowsum = []
    for i in range(ppmi_matrix.shape[0]):
        if verbose:
            print(f"\r Summing up row {i}  ", end = '')
        rowsum.append(ppmi_row_matrix[i,:].sum())
    colsum = []
    for j in range(ppmi_matrix.shape[1]):
        if verbose:
            print(f"\r Summing up col {j}  ", end = '')
        colsum.append(ppmi_col_matrix[:, j].sum())
    
    # update PPMI
    for row in range(ppmi_matrix.shape[0]):
        # print percentage
        if verbose:
            print(f"\rProcessing {row}/{ppmi_matrix.shape[0]} row", end = '')

        for col in range(ppmi_matrix.shape[1]):
            # skip 0 prob
            if ppmi_matrix[row,col] == 0: 
                continue
            PMI = np.log(ppmi_matrix[row, col] * total / rowsum[row] / colsum[col])
            # update on the ppmi_matrix
            ppmi_matrix[row, col] = max(0, PMI)
    if verbose:
        print("\r                                            ")
    return ppmi_matrix


In [27]:
# print("Calculating PPMI maxtrix for window size", 2)
# w2_ppmi_mat = calculate_ppmi_matrix(w2coocurence_mat)
# print("Calculating PPMI maxtrix for window size", 5)
# w5_ppmi_mat = calculate_ppmi_matrix(w5coocurence_mat)
# print("Calculating PPMI maxtrix for window size", 10)
# w10_ppmi_mat = calculate_ppmi_matrix(w10coocurence_mat)
# print("Done!")

Calculating PPMI maxtrix for window size 2
                                            
Calculating PPMI maxtrix for window size 5
                                            
Calculating PPMI maxtrix for window size 10
                                            
Done!


In [28]:
"""
Save PPMI matrix
"""
# scipy.sparse.save_npz("data/w2_ppmi_mat.npz", scipy.sparse.coo_matrix(w2_ppmi_mat) )
# scipy.sparse.save_npz("data/w5_ppmi_mat.npz", scipy.sparse.coo_matrix(w5_ppmi_mat) )
# scipy.sparse.save_npz("data/w10_ppmi_mat.npz", scipy.sparse.coo_matrix(w10_ppmi_mat) )

In [14]:
"""
Load PPMI matrix back
"""
# w2_ppmi_mat = scipy.sparse.load_npz("data/w2_ppmi_mat.npz")
# w5_ppmi_mat = scipy.sparse.load_npz("data/w5_ppmi_mat.npz")
# w10_ppmi_mat = scipy.sparse.load_npz("data/w10_ppmi_mat.npz")

### SVD Decomposition

In [32]:
def svd_decomposite(matrix, dimension):
    """
    Decomposite the input matrix and form the word matrix and context matrix
    
    :param matrix: matrix to be decomposed
    :param dimension: specify the number of dimension for word matrix
    
    :return: W, C. The word matrix and context matrix with each row representing a vector
    """
    U, S, Vt = scipy.sparse.linalg.svds(matrix, k = 100)
    sqrtS = np.sqrt(np.diag(S))
    W = np.dot(U, sqrtS)
    C = np.dot(Vt.T, sqrtS)
    return W, C

In [33]:
def write_svd_model_tofile(word_matrix, token_to_index, filename):
    """
    Write the word matrix to file in order to evaluate
    
    :param word_matrix: the word matrix to be written to disk
    :param token_to_index: dict, mapping token to index
    :param filename: a string that specify the file to be saved
    """
    
    with open(filename, "w") as f:
        for k in token_to_index:
            vector = word_matrix[token_to_index[k], :]
            # write word first
            f.write(f"{k} ")
            # write values
            for val in vector:
                f.write(f"{val} ")
            f.write("\n")
            

In [35]:
def write_all_svd_model_tofile(verbose = True):
    """
    Calculating the the 9 SVD models and write them to txt file
    """
    windows = (2,5,10)
    dimensions = (100,300,1000)
    for w in windows:
        ppmi_mat = scipy.sparse.load_npz(f"data/w{w}_ppmi_mat.npz")
        for d in dimensions:
            filename = f"model/w{w}s{d}-svd.txt"
            W, C = svd_decomposite(ppmi_mat, d)
            if verbose:
                print(f"Writing {filename}...")
            write_svd_model_tofile(W, token_to_index, filename)
    if verbose:
        print("Done")

In [36]:
write_all_svd_model_tofile()

Writing model/w2s100-svd.txt...
Writing model/w2s300-svd.txt...
Writing model/w2s1000-svd.txt...
Writing model/w5s100-svd.txt...
Writing model/w5s300-svd.txt...
Writing model/w5s1000-svd.txt...
Writing model/w10s100-svd.txt...
Writing model/w10s300-svd.txt...
Writing model/w10s1000-svd.txt...
Done


### Evaluate SVD Model

In [37]:
def eval_svd_models(verbose = True):
    """
    Evaluating svd models and return all results in a dict
    """
    windows = (2,5,10)
    dimensions = (100,300,1000)
    
    evaluation_results = {}
    for w in windows:
        for d in dimensions:
                key = f"svd-w{w}d{d}"
                model_output_filename = f"model/w{w}s{d}-svd.txt"
                if verbose:
                    print(f"Evaluating model {key}")
                evaluation_results[key] = evaluate.evaluate_models([model_output_filename], verbose=False)
    if verbose:
        print("Done!")
    return evaluation_results

In [38]:
# evaluate.evaluate_models(["model/w2s100-svd.txt"])
svd_eval_results = eval_svd_models()

Evaluating model svd-w2d100
Evaluating model svd-w2d300
Evaluating model svd-w2d1000
Evaluating model svd-w5d100
Evaluating model svd-w5d300
Evaluating model svd-w5d1000
Evaluating model svd-w10d100
Evaluating model svd-w10d300
Evaluating model svd-w10d1000
Done!


In [39]:
print_eval_results(svd_eval_results)


        Model:svd-w2d100
        WordSIM: -0.0623046117493807	 MSR: 0.6382608695652174
        BAT1: 0.0	 BAT2: 0.0	, BAT3: 0.0
        

        Model:svd-w2d300
        WordSIM: -0.0623046117493807	 MSR: 0.6382608695652174
        BAT1: 0.0	 BAT2: 0.0	, BAT3: 0.0
        

        Model:svd-w2d1000
        WordSIM: -0.0623046117493807	 MSR: 0.6382608695652174
        BAT1: 0.0	 BAT2: 0.0	, BAT3: 0.0
        

        Model:svd-w5d100
        WordSIM: -0.0035977035455661897	 MSR: 0.6353623188405797
        BAT1: 0.0	 BAT2: 0.0	, BAT3: 0.0
        

        Model:svd-w5d300
        WordSIM: -0.0035977035455661897	 MSR: 0.6353623188405797
        BAT1: 0.0	 BAT2: 0.0	, BAT3: 0.0
        

        Model:svd-w5d1000
        WordSIM: -0.0035977035455661897	 MSR: 0.6353623188405797
        BAT1: 0.0	 BAT2: 0.0	, BAT3: 0.0
        

        Model:svd-w10d100
        WordSIM: -0.006722224186856704	 MSR: 0.6342028985507246
        BAT1: 0.0	 BAT2: 0.0	, BAT3: 0.0
        

        Model:svd-w