In [None]:
def word_wash(text, api_data_class, stop_word='default'):
    """Tokenize ``text`` and drop every token found in a stop-word file.

    Args:
        text: The text to tokenize.
        api_data_class: Object providing ``get_lang(text)`` and
            ``get_token(text)``; ``get_token`` must return an iterable of
            tokens (they are used as hashable items here).
        stop_word: Path of a UTF-8 stop-word file with one word per line.
            With the value ``'default'`` the file is chosen from the
            language code returned by ``api_data_class.get_lang``.

    Returns:
        A list of the tokens that are not stop words, in original order.
    """
    # determine language
    if stop_word == 'default':
        lang = api_data_class.get_lang(text)
        if lang == 'zho':
            stop_word = 'pacakge_data/cn_stopwords.txt'
        elif lang == 'eng':
            stop_word = 'pacakge_data/en_stopwords.txt'
        else:
            stop_word = 'pacakge_data/other_stopwords.txt'

    try:
        with open(stop_word, 'r', encoding='utf-8') as stop_file:
            # A set gives O(1) membership tests in the filter below.
            stopwords = set(stop_file.read().split("\n"))
    except OSError:
        print('{} is not exist, please check the file!'.format(stop_word))
        # Fall back to "no stop words" so the filter below still works
        # instead of failing on an undefined name.
        stopwords = set()

    # check the characters limit: the text is split into chunks first
    text_token = []
    for chunk in check_limit(text):
        text_token.extend(api_data_class.get_token(chunk))

    return [token for token in text_token if token not in stopwords]

In [None]:
def cal_frequency(text, api_data_class, stop_word='default'):
    """Count how often each washed token occurs in ``text``.

    The text is passed through ``word_wash`` with the same arguments.

    Returns:
        A list of ``(word, count)`` tuples ordered by descending count;
        words with equal counts keep their first-seen order.
    """
    tally = {}
    for token in word_wash(text, api_data_class, stop_word):
        if token in tally:
            tally[token] += 1
        else:
            tally[token] = 1

    # Sorting on the negated count is a stable descending sort.
    return sorted(tally.items(), key=lambda pair: -pair[1])

In [None]:
def check_limit(text):
    """Split ``text`` into a list of chunks based on a 5000-character limit.

    Newlines are replaced by spaces first. Spaces are not counted towards
    the limit. Text within the limit is returned as a single-item list;
    longer text is handed to ``check_limit_tool`` with the first separator
    found in it ('.' then '。'), or with '' when neither occurs.
    """
    text = text.replace('\n', ' ')
    if len(text.replace(' ', '')) <= 5000:
        return [text]

    # Prefer the ASCII full stop, then the CJK full stop.
    for mark in ('.', '。'):
        if mark in text:
            return check_limit_tool(text, mark)
    return check_limit_tool(text, '')


def check_limit_tool(text, symbol, limit=5000):
    """Greedily pack pieces of ``text`` into chunks of at most ``limit`` characters.

    Args:
        text: The text to split.
        symbol: Separator to split on. The separator is re-attached to the
            piece it followed. An empty string means "split per character".
        limit: Maximum number of non-space characters per chunk.

    Returns:
        A list of non-empty chunks whose concatenation equals ``text``.
        A single piece that is longer than ``limit`` on its own is kept
        whole as one (oversized) chunk.
    """
    pieces = text.split(symbol) if symbol != '' else list(text)
    last_index = len(pieces) - 1

    chunks = []
    current = ''
    current_len = 0  # non-space characters in ``current``
    for index, piece in enumerate(pieces):
        # Re-attach the separator removed by split(), except after the final
        # piece. The position (not the value) identifies the final piece, so
        # repeated sentences/characters are handled correctly.
        segment = piece + symbol if index < last_index else piece
        segment_len = len(segment.replace(' ', ''))

        if current and current_len + segment_len > limit:
            chunks.append(current)
            current = segment
            current_len = segment_len
        else:
            current += segment
            current_len += segment_len

    if current:
        chunks.append(current)
    return chunks

In [None]:
#file_loc: where the wordcloud pic and chart pic should be saved
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import altair as alt
from altair_saver import save


def word_viz(text,
             file_loc,
             api_data_class,
             top_num,
             stop_word='default',
             cloud_set=None):
    """Save a word-cloud image and a word-frequency chart for ``text``.

    Args:
        text: The text to visualize.
        file_loc: Path prefix for the output files; the file names
            ``cloud.png`` and ``chart.html`` are appended to it directly
            (include a trailing path separator for a directory).
        api_data_class: Passed through to ``word_wash``.
        top_num: Number of most frequent words shown in the chart.
        stop_word: Passed through to ``word_wash``.
        cloud_set: A configured ``WordCloud`` instance. When omitted, a new
            one using the font ``pacakge_data/STKAITI.TTF`` is created for
            each call.
    """
    # Build the default per call rather than once in the signature, so calls
    # do not share one WordCloud object created at definition time.
    if cloud_set is None:
        cloud_set = WordCloud(font_path='pacakge_data/STKAITI.TTF')

    washed_token = word_wash(text, api_data_class, stop_word)

    cloud_set.generate(' '.join(washed_token))
    cloud_set.to_file(file_loc + 'cloud.png')

    # Count from the tokens already obtained instead of washing the text a
    # second time; the ordering matches cal_frequency (descending, stable).
    tally = {}
    for word in washed_token:
        tally[word] = tally.get(word, 0) + 1
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)

    df = pd.DataFrame({
        'words': [word for word, _ in ranked],
        'count': [number for _, number in ranked],
    })
    chart = (alt.Chart(df[:top_num]).mark_line().encode(
        x='words', y='count').properties(height=400, width=400))
    # Save next to the word cloud rather than in the working directory.
    save(chart, file_loc + "chart.html")

In [None]:
#input a document list
import numpy as np
import math

#text: the text whose tf-idf scores are calculated
#document: list of document texts used to compute idf
#topK: number of top keywords to keep (not implemented; scores for every word are returned)
def key_extra_tfidf(text, document, api_data_class, stop_word='default'):
    """Compute a tf-idf score for every word of ``text`` and ``document``.

    Term frequencies come from ``text``; document frequencies come from the
    texts in ``document``. All texts are tokenized with ``word_wash``.

    Args:
        text: The text to score.
        document: Iterable of texts used to compute idf. It must contain at
            least one text.
        api_data_class: Passed through to ``word_wash``.
        stop_word: Passed through to ``word_wash``.

    Returns:
        A dict mapping each vocabulary word to its tf-idf value.
    """
    doc_tokens = [word_wash(doc, api_data_class, stop_word) for doc in document]
    text_tokens = word_wash(text, api_data_class, stop_word)

    # The vocabulary must also cover the words of ``text`` itself; otherwise
    # a word absent from every document raises KeyError while counting.
    vocabulary = set(text_tokens)
    for tokens in doc_tokens:
        vocabulary.update(tokens)

    # Per-document word counts over the shared vocabulary.
    wordDict_lst = []
    for tokens in doc_tokens:
        doc_counts = dict.fromkeys(vocabulary, 0)
        for word in tokens:
            doc_counts[word] += 1
        wordDict_lst.append(doc_counts)

    wordDict = dict.fromkeys(vocabulary, 0)
    for word in text_tokens:
        wordDict[word] += 1

    tfDict = cal_TF(wordDict, text_tokens)
    idfDict = cal_IDF(wordDict_lst)

    return cal_TFIDF(tfDict, idfDict)

def cal_TF(wordDict, wash_token):
    """Return the term frequency of each word in ``wordDict``.

    Args:
        wordDict: Mapping of word -> occurrence count.
        wash_token: The token list the counts were taken from; its length
            is the denominator.

    Returns:
        A dict mapping each word to ``count / len(wash_token)``. When
        ``wash_token`` is empty every frequency is 0.0.
    """
    token_count = len(wash_token)
    if token_count == 0:
        # No tokens: avoid dividing by zero.
        return dict.fromkeys(wordDict, 0.0)
    return {word: count / token_count for word, count in wordDict.items()}


def cal_IDF(wordDict_lst):
    """Return the inverse document frequency of each word.

    Args:
        wordDict_lst: List of per-document dicts mapping word -> count.

    Returns:
        A dict mapping each word to ``log10(N / (ni + 1))``, where ``N`` is
        the number of documents and ``ni`` the number of documents in which
        the word's count is positive. An empty input yields an empty dict.
    """
    if not wordDict_lst:
        return {}

    N = len(wordDict_lst)
    doc_freq = dict.fromkeys(wordDict_lst[0], 0)
    for wordDict in wordDict_lst:
        for word, count in wordDict.items():
            if count > 0:
                # .get() also accepts words missing from the first dict.
                doc_freq[word] = doc_freq.get(word, 0) + 1

    return {word: math.log10(N / (ni + 1)) for word, ni in doc_freq.items()}


def cal_TFIDF(tf, idf):
    """Multiply each term frequency by the matching idf value.

    Every word in ``tf`` must also be a key of ``idf``.

    Returns:
        A dict mapping each word of ``tf`` to ``tf[word] * idf[word]``.
    """
    return {term: frequency * idf[term] for term, frequency in tf.items()}

In [None]:
import numpy as np
from scipy.spatial.distance import pdist
def cal_simi(text1, text2, api_data_class, size='sen', method='euc'):
    """Score two texts against each other with the chosen measure.

    Args:
        text1: First text.
        text2: Second text.
        api_data_class: Object providing ``get_token(text)`` and
            ``get_vec(text)``; ``get_vec`` must return a mapping with the
            keys ``'documentEmbedding'`` and ``'tokenEmbeddings'``.
        size: ``'sen'`` compares the document embeddings; ``'word'``
            compares the token embeddings, zero-padding the shorter text
            and flattening both to one vector. Ignored for ``'jac'``.
        method: ``'euc'``, ``'cos'``, ``'cheb'`` or ``'mah'`` (computed on
            vectors by the matching ``simi_cal_*`` function) or ``'jac'``
            (computed on tokens).

    Returns:
        The float returned by the selected ``simi_cal_*`` function.

    Raises:
        ValueError: If ``method`` is unknown, or ``size`` is unknown for a
            vector-based method.
    """
    if method == 'jac':
        # Token based: no embeddings are needed.
        token_1 = api_data_class.get_token(text1)
        token_2 = api_data_class.get_token(text2)
        return simi_cal_jac(token_1, token_2, size)

    vector_measures = {
        'euc': simi_cal_euc,
        'cos': simi_cal_cos,
        'cheb': simi_cal_cheb,
        'mah': simi_cal_mah,
    }
    if method not in vector_measures:
        raise ValueError('There is no method {}'.format(method))

    if size == 'sen':
        vec_1 = api_data_class.get_vec(text1)['documentEmbedding']
        vec_2 = api_data_class.get_vec(text2)['documentEmbedding']
    elif size == 'word':
        # Copy so the padding below does not modify the API's own lists.
        rows_1 = list(api_data_class.get_vec(text1)['tokenEmbeddings'])
        rows_2 = list(api_data_class.get_vec(text2)['tokenEmbeddings'])
        # padding: give both texts the same number of rows, using the
        # embedding width of the data instead of a fixed constant.
        row_count = max(len(rows_1), len(rows_2))
        width = len((rows_1 or rows_2)[0]) if row_count else 0
        zero_row = [0.0] * width
        vec_1 = np.array(rows_1 + [zero_row] * (row_count - len(rows_1))).flatten()
        vec_2 = np.array(rows_2 + [zero_row] * (row_count - len(rows_2))).flatten()
    else:
        raise ValueError('There is no size {}'.format(size))

    return vector_measures[method](vec_1, vec_2, size)
        
def simi_cal_euc(vec1, vec2, size):
    """Return the Euclidean distance between ``vec1`` and ``vec2`` as a float.

    Unless ``size`` is ``'word'`` the inputs are converted with ``np.array``
    first; for ``'word'`` they are used as given.
    """
    if size != 'word':
        vec1, vec2 = np.array(vec1), np.array(vec2)
    delta = vec1 - vec2
    squared_total = np.sum(np.square(delta))
    return float(np.sqrt(squared_total))
    
def simi_cal_cos(vec1, vec2, size):
    """Return the cosine of the angle between ``vec1`` and ``vec2`` as a float.

    Unless ``size`` is ``'word'`` the inputs are converted with ``np.array``
    first; for ``'word'`` they are used as given.
    """
    if size != 'word':
        vec1, vec2 = np.array(vec1), np.array(vec2)
    numerator = np.dot(vec1, vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return float(numerator / denominator)
    
def simi_cal_mah(vec1, vec2, size):
    """Return the sum of absolute element-wise differences as a float.

    Unless ``size`` is ``'word'`` the inputs are converted with ``np.array``
    first; for ``'word'`` they are used as given.
    """
    if size != 'word':
        vec1, vec2 = np.array(vec1), np.array(vec2)
    absolute_gaps = np.abs(vec1 - vec2)
    return float(np.sum(absolute_gaps))
    
def simi_cal_jac(token_1, token_2, size):
    """Return the Jaccard similarity of two token collections.

    The value is ``|A ∩ B| / |A ∪ B|`` over the sets of distinct tokens, so
    repeated tokens count once.

    Args:
        token_1: Iterable of hashable tokens.
        token_2: Iterable of hashable tokens.
        size: Unused; kept so all ``simi_cal_*`` functions share one
            signature.

    Returns:
        A float in ``[0, 1]``; 0.0 when both inputs are empty.
    """
    set_1 = set(token_1)
    set_2 = set(token_2)
    union = len(set_1 | set_2)
    if union == 0:
        # Nothing to compare: avoid dividing by zero.
        return 0.0
    return len(set_1 & set_2) / union

def simi_cal_cheb(vec1, vec2, size):
    """Return the largest absolute element-wise difference as a float.

    Unless ``size`` is ``'word'`` the inputs are converted to arrays first;
    for ``'word'`` they are used as given.
    """
    if size != 'word':
        # np.asarray instead of np.mat: the np.mat alias is not available
        # in NumPy 2.x, and the other simi_cal_* functions use plain arrays.
        vec1 = np.asarray(vec1)
        vec2 = np.asarray(vec2)
    return float(np.max(np.abs(vec1 - vec2)))