In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os


In [None]:
class Affine:
    """Affine transformation layer computing ``np.dot(x, W) + b``.

    The weight and the bias are kept together in ``self.params`` as ``[w, b]``.
    """

    def __init__(self, w, b):
        """Store the weight ``w`` and the bias ``b`` as the layer parameters."""
        self.params = [w, b]

    def forward(self, x):
        """Return ``np.dot(x, W) + b`` using the stored weight and bias."""
        weight, bias = self.params
        projected = np.dot(x, weight)
        return projected + bias

class Sigmoid:
    """Element-wise logistic sigmoid activation."""

    def forward(self, x):
        """Return ``1 / (1 + exp(-x))`` evaluated with NumPy."""
        denominator = 1 + np.exp(-x)
        return 1 / denominator
    
def print_result(current, total):
    """Redraw a 20-cell text progress bar on the current output line.

    Writes a carriage return, the bar (``[``, 20 cells, ``]``) and the value
    ``current * 100 / total`` formatted with two decimals followed by ``%`` to
    stdout, without a trailing newline, and flushes the stream.

    Args:
        current: Amount of work done so far.
        total: Amount of work overall; must be non-zero.

    Raises:
        ZeroDivisionError: If ``total`` is 0 (for built-in numeric types).
    """
    bar_width = 20
    # Clamp to the bar: progress beyond ``total`` shows a full bar and negative
    # progress an empty one (it used to be rendered as a full bar).
    filled = max(0, min(bar_width, int(current / total * bar_width)))
    bar = '[' + '\u25A0' * filled + ' ' * (bar_width - filled) + ']'
    percentage = format(current * 100 / total, '.2f')
    print(f'\r{bar} {percentage}%', end='', flush=True)


In [None]:
class Preprocess:
    """Tokenise a text and derive word ids and co-occurrence counts from it.

    ``self.text`` holds the lower-cased tokens and ``self.repeated`` the
    positions in ``self.text`` whose token had already occurred earlier (it is
    filled by :meth:`get_word_id`).
    """

    def __init__(self, text: str, *args):
        """Lower-case ``text``, detach the given marks and split on spaces.

        Args:
            text: Raw text to tokenise.
            *args: Substrings (for example ``','`` and ``'.'``); every
                occurrence gets a space inserted in front of it so that it is
                split off from the preceding word.
        """
        text = text.lower()
        # dict.fromkeys de-duplicates the marks while keeping their order, so
        # a mark that is passed twice is still padded only once.
        for mark in dict.fromkeys(args):
            text = text.replace(mark, f' {mark}')
        # Only a single space separates tokens: runs of spaces yield empty
        # tokens and newlines stay inside tokens.
        self.text = text.split(' ')
        self.repeated = []

    def get_word_id(self):
        """Assign consecutive ids to the distinct tokens in order of first appearance.

        Also rebuilds ``self.repeated`` with the positions of tokens that had
        already been seen, so calling this method again does not accumulate
        duplicate entries.

        Returns:
            Tuple ``(word_to_id, id_to_word, corpus)``: the two dict mappings
            and the list of ids in the order they were assigned.  ``corpus``
            holds each id exactly once (it is not the id sequence of the whole
            text), so ``len(corpus)`` equals the vocabulary size.
            NOTE(review): confirm that a per-token id sequence was not intended.
        """
        word_to_id = {}
        id_to_word = {}
        corpus = []
        self.repeated = []
        for position, token in enumerate(self.text):
            if token in word_to_id:
                self.repeated.append(position)
                continue
            new_id = len(word_to_id)
            word_to_id[token] = new_id
            id_to_word[new_id] = token
            corpus.append(new_id)
        return word_to_id, id_to_word, corpus

    def _window_positions(self, position: int, window: int):
        """Yield the valid text positions at distance 1..``window`` from ``position``."""
        text_length = len(self.text)
        for offset in range(1, window + 1):
            if position - offset >= 0:
                yield position - offset
            if position + offset < text_length:
                yield position + offset

    def get_single_context(self,id_word:dict, word_id:dict, corpus: list, word: str,window: int):
        """Count which words occur within ``window`` tokens of ``word``.

        Every occurrence of ``word`` in the text contributes one count for each
        neighbour at distance 1..``window`` on either side; neighbours that
        would fall outside the text are skipped.

        Args:
            id_word: id -> word mapping (unused, kept for interface compatibility).
            word_id: word -> id mapping covering every token of the text.
            corpus: Id list as returned by :meth:`get_word_id`; only its length
                is used, as the length of the count vector.
            word: Word to look up (lower-cased before use).
            window: Maximum distance of a neighbour from ``word``.

        Returns:
            1-D ``np.ndarray`` of length ``len(corpus)`` indexed by word id, or
            ``None`` when ``word`` does not occur in the text.
        """
        word = word.lower()
        if word not in self.text:
            return None
        target_id = word_id[word]
        counts = [0] * len(corpus)
        for position, token in enumerate(self.text):
            if word_id[token] != target_id:
                continue
            for neighbour in self._window_positions(position, window):
                counts[word_id[self.text[neighbour]]] += 1
        return np.array(counts)

    def get_coocurrenceMatrix(self,corpus: list,id_word: dict, word_id: dict, window:int):
        """Build the co-occurrence counts of every word in a single pass over the text.

        Row ``k`` equals ``get_single_context`` for the word with id ``k``; this
        assumes ``word_id`` numbers the words ``0..len(word_id) - 1`` as
        :meth:`get_word_id` does.

        Args:
            corpus: Id list as returned by :meth:`get_word_id`; only its length
                is used, as the number of columns.
            id_word: id -> word mapping (unused, kept for interface compatibility).
            word_id: word -> id mapping covering every token of the text.
            window: Maximum distance of a neighbour from the centre word.

        Returns:
            Tuple ``(matrix, rows)``: the 2-D count array and a list holding an
            independent copy of each of its rows.
        """
        matrix = np.zeros((len(word_id), len(corpus)), dtype=int)
        # Translate the tokens once instead of rescanning the text per word.
        token_ids = [word_id[token] for token in self.text]
        for position, centre_id in enumerate(token_ids):
            for neighbour in self._window_positions(position, window):
                matrix[centre_id, token_ids[neighbour]] += 1
        return matrix, [row.copy() for row in matrix]
    
def most_similar(matrix:list, word:str,word_id:dict, top:int):
    """Rank the other words by cosine similarity to ``word``.

    Args:
        matrix: Sequence of word vectors; row ``k`` belongs to the word whose
            id is ``k``.
        word: Query word (lower-cased before the lookup).
        word_id: word -> id mapping with one entry per row of ``matrix``.
        top: Maximum number of results to return.

    Returns:
        List of ``(word, similarity)`` tuples sorted by descending similarity
        and truncated to ``top`` entries, or ``None`` when ``word`` is not in
        ``word_id``.  The query word itself is never part of the result.
    """
    word = word.lower()
    if word not in word_id:
        return None
    query_id = word_id[word]
    query_vector = matrix[query_id]
    # Build the reverse lookup here rather than reading a notebook-global
    # ``id_word`` that happens to exist when the cells ran in order.
    id_to_word = {idx: token for token, idx in word_id.items()}
    scores = {
        id_to_word[idx]: similarity(query_vector, row)
        for idx, row in enumerate(matrix)
        # Compare by value: ``is not`` only behaves for small cached ints and
        # let the query word through for larger ids.
        if idx != query_id
    }
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)[:top]


def similarity(vect1, vect2):
    """Return the cosine similarity of two vectors.

    Each vector is divided by its Euclidean norm plus ``1e-8`` (which keeps an
    all-zero vector from dividing by zero) before the dot product is taken.
    """
    x = vect1/(np.sqrt(np.sum(vect1**2)) + 1e-8)
    y = vect2/(np.sqrt(np.sum(vect2**2)) + 1e-8)
    return np.dot(x,y)

def PPMI(co_matrix, corpus):
    """Convert a co-occurrence matrix to positive pointwise mutual information.

    Each entry becomes ``max(0, log2(C[i, j] * N / (S[i] * S[j]) + 1e-8))``
    where ``N`` is the sum of all counts and ``S`` the column sums of
    ``co_matrix``.  Entries whose denominator is zero yield 0 in the 0/0 case,
    and the division and invalid-value warnings are suppressed.

    A progress bar is redrawn through ``print_result`` once per finished row
    (rather than once per cell) to keep the output overhead negligible.

    Args:
        co_matrix: 2-D array of co-occurrence counts; the row index is also
            used to index the column sums, so it is expected to be square.
        corpus: Unused; kept for interface compatibility.

    Returns:
        ``np.float32`` array with the same shape as ``co_matrix``.
    """
    co_matrix = np.asarray(co_matrix)
    ppmi_matrix = np.zeros_like(co_matrix, dtype=np.float32)
    N = np.sum(co_matrix)
    word_totals = np.sum(co_matrix, axis=0)
    rows, cols = co_matrix.shape
    total = rows * cols
    for i in range(rows):
        # errstate silences the divide/invalid warnings for zero totals; fmax
        # maps the resulting NaN to 0, like the scalar max(0, nan) did.
        with np.errstate(divide='ignore', invalid='ignore'):
            denominator = word_totals[i] * word_totals
            pmi = np.log2(co_matrix[i] * N / denominator + 1e-8)
            ppmi_matrix[i] = np.fmax(pmi, 0)
        if total:
            print_result((i + 1) * cols, total)
    return ppmi_matrix
    

In [None]:
# Read the training text; the encoding is given explicitly so the result does
# not depend on the platform's default locale.
with open('ptb.train.txt', mode="r", encoding="utf-8") as fp:
    string = fp.read()

# Tokenise (splitting ',' and '.' off the preceding word) and build the id tables.
preprocessed = Preprocess(string, ',', '.')
word_id, id_word, corpus = preprocessed.get_word_id()
# Spot check for a single word with window 1; the return value is not used.
preprocessed.get_single_context(id_word, word_id, corpus, 'say', 1)
# Co-occurrence counts with window 1, then their PPMI transform.
cooccurrence_matrix, matrix_list = preprocessed.get_coocurrenceMatrix(
    corpus, id_word, word_id, 1)
ppmi = PPMI(cooccurrence_matrix, corpus)
print(cooccurrence_matrix)

In [None]:
# Singular value decomposition of the PPMI matrix. np.linalg.svd returns the
# third factor already transposed, so ``V`` here holds V^H rather than V.
U,S,V = np.linalg.svd(ppmi)
# Bare expression that is not the last one in the cell, so a notebook does not
# display it.
U[0, :2]
# Last expression of the cell: the full U matrix is what gets displayed.
U

In [None]:
# Label every word id at the point given by the first two columns of its row
# in U, then draw the points themselves.
for i in id_word:
    plt.annotate(id_word[i],(U[i][0], U[i][1]))
plt.scatter(U[:, 0],U[:, 1])