## Support lib

In [None]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib

# Load the `sql` notebook extension and open the connection used by the `%sql` cells.
# NOTE(review): the connection string below embeds a database user and password in
# plain text; consider reading them from an environment variable or an untracked
# config file before sharing this notebook.
%load_ext sql
%sql mysql+pymysql://root:Wqs977107@localhost/HW3_GOT_Raw
from sqlalchemy import create_engine

## Non-negative Matrix Factorization

In [None]:
class Matrix_Factor:
    """Topic extraction from a bag-of-words corpus by non-negative matrix factorization.

    Attributes
    ----------
    vocab : np.ndarray
        One string per line of the vocabulary file.
    document : np.ndarray
        One string per line of the document file; each line is a comma-separated
        list of ``index:count`` pairs, where ``index`` is a 1-based position in
        ``vocab``.
    vlength : int
        Number of vocabulary entries.
    dlength : int
        Number of documents.
    X : np.ndarray
        Frequency matrix of shape ``(vlength, dlength)`` built by ``generate_X``.
    k : int
        Number of top words reported per topic by ``top_k``.
    """

    # Separators of the "index:count,index:count,..." document format.
    _PAIR_SPLIT = re.compile(',|:')

    def __init__(self, file_name, k):
        """Read the vocabulary and document files and build the frequency matrix.

        Parameters
        ----------
        file_name : sequence of str
            ``file_name[0]`` is the vocabulary file path, ``file_name[1]`` the
            document file path.
        k : int
            Number of top words to report for each topic.
        """
        self.vocab = self.read_file(file_name[0], seq=0)
        self.document = self.read_file(file_name[1], seq=1)
        self.vlength = len(self.vocab)
        self.dlength = len(self.document)
        self.X = self.generate_X()
        self.k = k

    def read_file(self, file_name, seq):
        """Read a UTF-8 text file and return its lines as a numpy array.

        Every ``\\ufeff`` (byte-order mark) is removed. A single trailing newline
        is treated as a line terminator, so it never produces an empty last
        entry and a file without one never loses its last line.

        Parameters
        ----------
        file_name : str
            Path of the file to read.
        seq : int
            0 for the vocabulary file, 1 for the document file. Kept for
            backward compatibility; both kinds of file are read the same way.

        Returns
        -------
        np.ndarray
            One element per line of the file.
        """
        with open(file_name, 'r', encoding='utf-8') as text:
            words = text.read()

        lines = words.replace('\ufeff', '').split('\n')
        # Only the empty string that follows a final '\n' is dropped; interior
        # empty lines are kept so line positions are preserved.
        if lines and lines[-1] == '':
            lines.pop()
        return np.array(lines)

    def generate_X(self):
        """Build the ``(vlength, dlength)`` frequency matrix from ``self.document``.

        Returns
        -------
        np.ndarray
            ``X[i, j]`` is the total count of vocabulary entry ``i`` (0-based) in
            document ``j``. Blank document lines yield an all-zero column.

        Raises
        ------
        ValueError
            If a line does not consist of complete ``index:count`` pairs or a
            field is not an integer.
        IndexError
            If a word index lies outside ``1..vlength``.
        """
        X = np.zeros((self.vlength, self.dlength))
        for doc, line in enumerate(self.document):
            if not line.strip():
                continue

            pair = self._PAIR_SPLIT.split(line)
            if len(pair) % 2:
                raise ValueError(
                    'document {} has an incomplete index:count pair'.format(doc))

            # Indices in the file are 1-based; rows of X are 0-based.
            index = np.array([int(i) - 1 for i in pair[::2]])
            value = [int(i) for i in pair[1::2]]

            # Without this check an index of 0 would silently wrap to the last row.
            if index.min() < 0 or index.max() >= self.vlength:
                raise IndexError(
                    'document {} references a word index outside 1..{}'.format(
                        doc, self.vlength))

            # np.add.at accumulates repeated indices; ``X[index, doc] += value``
            # would keep only the last occurrence of a repeated index.
            np.add.at(X[:, doc], index, value)

        return X

    def NMF(self, rank=25, iteration=100, eps=10 ** (-16)):
        """Factorize ``self.X`` into ``W @ H`` with multiplicative updates.

        Parameters
        ----------
        rank : int
            Number of topics (columns of ``W`` / rows of ``H``).
        iteration : int
            Number of update rounds.
        eps : float
            Small constant added to ``W @ H`` to avoid division by zero.

        Returns
        -------
        W : np.ndarray
            Factor of shape ``(vlength, rank)``.
        H : np.ndarray
            Factor of shape ``(rank, dlength)``.
        obj : list
            Objective value ``sum(X * log(1 / (WH + eps)) + WH)`` after each
            round (the divergence between X and WH up to terms that do not
            depend on W or H).
        """
        N, M = self.X.shape

        # W and H are initialized from uniform(1, 2) using numpy's global
        # random state, so results are reproducible via np.random.seed.
        W = np.random.uniform(1, 2, [N, rank])
        H = np.random.uniform(1, 2, [rank, M])
        obj = []

        for _ in range(iteration):
            # update H: each row of Wt is a column of W scaled to sum to 1
            mid = self.X / (W @ H + eps)
            Wt = W.T / np.sum(W.T, axis=1).reshape(-1, 1)
            H *= (Wt @ mid)

            # update W: each column of Ht is a row of H scaled to sum to 1
            mid = self.X / (W @ H + eps)
            Ht = H.T / np.sum(H.T, axis=0).reshape(1, -1)
            W *= (mid @ Ht)

            WH = W @ H
            obj.append(np.sum(self.X * np.log(1 / (WH + eps)) + WH))

        return W, H, obj

    def top_k(self, W):
        """Return the ``self.k`` highest-weighted words of every topic.

        Parameters
        ----------
        W : np.ndarray
            Matrix of shape ``(vlength, rank)``; row ``i`` holds the weights of
            vocabulary entry ``i`` in each topic (column).

        Returns
        -------
        list of list of str
            One list per topic with entries ``'word: weight'``, ordered by
            decreasing weight (ties: higher vocabulary index first). Weights are
            column-normalized and shown in fixed-point notation cut to six
            characters.
        """
        W_norm = W / np.sum(W, axis=0)
        row_ids = np.arange(W_norm.shape[0])

        top = []
        for score in W_norm.T:
            # lexsort orders by score, then by row index; reversing gives
            # descending weight with ties resolved by the higher index.
            order = np.lexsort((row_ids, score))[::-1][:self.k]
            # Positional formatting avoids cutting a scientific-notation string
            # such as '9.99e-07' down to a misleading '9.99e-'.
            top.append([
                word + ': ' + np.format_float_positional(weight, trim='0')[:6]
                for word, weight in zip(self.vocab[order], score[order])
            ])

        return top

    def myplot(self, x, title='', xlabel='x', ylabel='y', figname='1.png'):
        """Plot ``x`` as a line, save the figure and show it.

        Note that this sets ``matplotlib.rcParams['ytick.labelsize']``, which
        stays in effect for later plots.

        Parameters
        ----------
        x : sequence
            Data to plot.
        title : str
            Figure title; omitted when empty.
        xlabel : str
            Label of the x axis; omitted when empty.
        ylabel : str
            Label of the y axis; omitted when empty.
        figname : str
            Path the figure is saved to.
        """
        plt.figure(figsize=(30, 30))
        matplotlib.rcParams['ytick.labelsize'] = 50
        plt.plot(x, linewidth=3)
        if title:
            plt.title(title, fontsize=50)
        if xlabel:
            plt.xlabel(xlabel, fontsize=50)
        if ylabel:
            plt.ylabel(ylabel, fontsize=50)
        plt.tick_params(labelsize=50)
        plt.savefig(figname, dpi=500)
        plt.show()
        plt.close()

    def run_sql(self, result, dbuser='dbuser', password='dbuser', schema_name='schema_name', table_name='table_name'):
        """Write ``result`` to a table of a MySQL database on localhost.

        An existing table with the same name is replaced.

        Parameters
        ----------
        result : array-like
            Top-k words and weights of each topic; converted to a DataFrame.
        dbuser : str
            Database user name (given unescaped).
        password : str
            Database password (given unescaped).
        schema_name : str
            Schema the table is written to.
        table_name : str
            Name of the table to create.
        """
        from urllib.parse import quote

        data = pd.DataFrame(result)
        # Characters such as '@', '/' or ':' in the credentials would otherwise
        # be parsed as URL delimiters.
        url = "mysql+pymysql://{}:{}@localhost/{}".format(
            quote(dbuser, safe=''), quote(password, safe=''), schema_name)
        con = create_engine(url)
        try:
            data.to_sql(table_name, con=con, index=True, if_exists="replace")
        finally:
            # Release the pooled connections once the table is written.
            con.dispose()

    @staticmethod
    def _layout_topics(topics, columns=5):
        """Arrange topic strings in rows of ``columns``; fall back to one column
        when the number of topics is not a multiple of ``columns``."""
        if len(topics) % columns:
            columns = 1
        return np.array(topics).reshape((-1, columns))

    def obj(self, rank, iteration, eps):
        """Run NMF, plot the objective per iteration and store the top words.

        The objective curve is saved as ``obj.png``; the top-k words of every
        topic are joined into one string per topic and written to the database
        through ``run_sql`` with its default connection arguments. With 25
        topics the table is 5 x 5; other topic counts are laid out in rows of
        five when possible and in a single column otherwise.

        Parameters
        ----------
        rank : int
            Number of topics computed by NMF.
        iteration : int
            Number of update rounds.
        eps : float
            Small constant used to avoid division by zero.
        """
        W, _, objective = self.NMF(rank=rank, iteration=iteration, eps=eps)
        self.myplot(objective, title='Value of Objective Function with Iteration', xlabel='Iteration Times',
                    ylabel='Objective Function Value', figname='obj.png')

        topics = [', '.join(words) for words in self.top_k(W)]
        self.run_sql(self._layout_topics(topics))

## Main Function

In [None]:
def main():
    # entry point: build a Matrix_Factor from the data files and run the factorization
    # user defined settings
    # -------------------------------------
    # vocab_path / document_path: data files to import, passed as a list with the vocabulary file first
    #                             and the document file second
    # k: number of top vocabularies reported within each topic
    # rank: number of topics computed by NMF
    # iteration: number of update rounds
    # eps: small constant used to avoid 0/0

    vocab_path = './nyt_vocab.dat'
    document_path = './nyt_data.txt'
    factorizer = Matrix_Factor([vocab_path, document_path], k=10)

    nmf_settings = {'rank': 25, 'iteration': 100, 'eps': 10 ** (-16)}
    factorizer.obj(**nmf_settings)

# Run the pipeline only when this code is executed as the top-level program,
# not when it is imported from another module.
if __name__ == '__main__':
    main()

## SQL Result Below

`table_name` below is the table that was created above in the local database.

In [None]:
%sql select * from table_name