In [50]:
import re
import pandas as pd
class Ngram:
    """Turn raw text into lower-cased, punctuation-free word n-grams."""

    def chick(self, text):
        """
        Check the input data type.

        Parameters
        ---------
        text: object
            Value to validate.

        Returns
        ---------
        text: string
            The input, unchanged.

        Raises
        ---------
        ValueError
            If ``text`` is not a ``str``.
        """
        if not isinstance(text, str):
            raise ValueError("input data must be string type")
        return text

    def filer_text(self, text):
        """
        Lower-case the text and replace every character that is not an
        ASCII letter, a digit or whitespace with a space.

        Line breaks are turned into spaces as well, so words on
        neighbouring lines stay separate words.

        Parameters
        ---------
        text: string

        Returns
        ---------
        clean_text: string
        """
        clean_text = text.lower()
        # '\n' must become ' ' rather than '': deleting it would glue the
        # last word of one line to the first word of the next one.
        clean_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', clean_text).replace('\n', ' ')
        return clean_text

    @classmethod
    def get_ngrams(cls, text, n):
        """
        Get n-grams word group
        Parameters
        ---------
        text: string
        n: int
            Number of words per n-gram. When the text has fewer than ``n``
            words, a single group holding all of its words is produced.
            Nothing is produced for empty text or ``n < 1``.

        Returns
        ---------
        ngrams: generator
            Yields each n-gram as one string of space-separated words.

        Raises
        ---------
        ValueError
            If ``text`` is not a string. Because this is a generator, the
            error surfaces when iteration starts, not at call time.
        """
        helper = cls()
        text = helper.chick(text)
        text = helper.filer_text(text)
        # split() without arguments separates on any whitespace run (spaces,
        # tabs, line breaks) and never yields empty tokens.
        tokens = text.split()
        if len(tokens) < n:
            n = len(tokens)
        # zip over n shifted views of the token list: the i-th view starts at
        # token i, so each zipped tuple is n consecutive tokens.
        ngrams = zip(*[tokens[i:] for i in range(n)])
        for ngram in ngrams:
            yield " ".join(ngram)


# Demo: extract bigrams from a sample Markdown sentence. Punctuation is
# stripped first, so the pieces of the link URLs come out as ordinary words.
text = "If a user set their own [setting to auto delete cloud recordings](https://support.zoom.us/hc/en-us/articles/115005756143), it would will take effect, not the [group-level setting](https://support.zoom.us/hc/en-us/articles/204519819).\n\n"
list(Ngram.get_ngrams(text,2))

['if a',
 'a user',
 'user set',
 'set their',
 'their own',
 'own setting',
 'setting to',
 'to auto',
 'auto delete',
 'delete cloud',
 'cloud recordings',
 'recordings https',
 'https support',
 'support zoom',
 'zoom us',
 'us hc',
 'hc en',
 'en us',
 'us articles',
 'articles 115005756143',
 '115005756143 it',
 'it would',
 'would will',
 'will take',
 'take effect',
 'effect not',
 'not the',
 'the group',
 'group level',
 'level setting',
 'setting https',
 'https support',
 'support zoom',
 'zoom us',
 'us hc',
 'hc en',
 'en us',
 'us articles',
 'articles 204519819']

In [51]:
from collections import Counter
import math
from pprint import  pprint
import numpy as np

class Tfidf:
    """
    Corpus-level n-gram statistics: raw counts, term frequency, document
    frequency, IDF and TF-IDF.

    Term frequency is pooled over the whole corpus (one count per n-gram),
    not computed per document.

    Parameters
    ---------
    corpus: collection of string
        Documents to analyse. It is iterated several times and ``len()`` is
        taken on it, so it must be a sized, re-iterable collection (list,
        pandas Series, ...), not a one-shot generator.
    ngram: int
        Number of words per n-gram, passed on to ``Ngram.get_ngrams``.
    """

    def __init__(self, corpus, ngram):
        self.corpus = corpus
        self.ngram = ngram

    def corpus_ngram(self):
        """Yield every n-gram of every document, in corpus order, repeats included."""
        for text in self.corpus:
            for ngram in Ngram.get_ngrams(text, self.ngram):
                yield ngram

    def doc_frequency(self):
        """
        Count, for each n-gram, the number of documents that contain it.

        Returns
        ---------
        doc_frequency_dic: dict
            Maps n-gram -> number of documents containing it (always >= 1).
        """
        doc_counts = Counter()
        for text in self.corpus:
            # One pass per document. set() makes a document count once for
            # an n-gram no matter how often the n-gram repeats inside it.
            doc_counts.update(set(Ngram.get_ngrams(text, self.ngram)))
        return dict(doc_counts)

    def frequency_tf(self):
        """
        Count n-gram occurrences over the whole corpus.

        Returns
        ---------
        token_frequency: dict
            Maps n-gram -> raw occurrence count.
        tf_dic: dict
            Maps n-gram -> count divided by the total number of n-gram
            occurrences in the corpus.
        """
        token_dic = Counter(self.corpus_ngram())
        # Computed once; it is the same denominator for every n-gram.
        total = sum(token_dic.values())
        token_frequency = dict(token_dic)
        tf_dic = {token: count / total for token, count in token_dic.items()}
        return token_frequency, tf_dic

    def _idf_from(self, doc_frequency_dic):
        """Derive IDF values from an already computed document-frequency dict."""
        n_docs = len(self.corpus)
        return {
            token: math.log(n_docs / (1 + doc_count))
            for token, doc_count in doc_frequency_dic.items()
        }

    @staticmethod
    def _tfidf_from(tf_dic, idf_dic):
        """Multiply matching TF and IDF entries."""
        return {token: tf_dic[token] * idf_value for token, idf_value in idf_dic.items()}

    def idf(self):
        """
        Inverse document frequency, ``log(N / (1 + df))`` with N the number
        of documents and df the document frequency of the n-gram.

        With this formula the value is 0 for an n-gram found in all but one
        document and negative for an n-gram found in every document.

        Returns
        ---------
        idf_dic: dict
            Maps n-gram -> IDF value.
        """
        return self._idf_from(self.doc_frequency())

    def tfidf(self):
        """
        TF-IDF per n-gram: corpus-level term frequency times IDF.

        The values are not normalised.

        Returns
        ---------
        tfidf: dict
            Maps n-gram -> TF-IDF value.
        """
        tf_dic = self.frequency_tf()[1]
        return self._tfidf_from(tf_dic, self.idf())

    def global_vocab(self):
        """
        Build the vocabulary table and write it to 'global_vocab.json'.

        Returns
        ---------
        df: pandas.DataFrame
            One row per n-gram with the columns 'token', 'ngram',
            'doc_frequency', 'frequency_tf' (the raw occurrence count),
            'idf' and 'tfidf', sorted by 'doc_frequency' in descending order.
        """
        # Every statistic is computed exactly once and reused; going through
        # idf() and tfidf() here would re-tokenise the corpus several times.
        doc_frequency_dic = self.doc_frequency()
        token_frequency, tf_dic = self.frequency_tf()
        idf_dic = self._idf_from(doc_frequency_dic)
        tfidf_dic = self._tfidf_from(tf_dic, idf_dic)

        # One dict per statistic -> one row each; transposing gives one row
        # per n-gram and one column per statistic.
        df = pd.DataFrame([doc_frequency_dic, token_frequency, idf_dic, tfidf_dic]).T
        df.columns = ['doc_frequency', 'frequency_tf', 'idf', 'tfidf']
        df = df.reset_index().rename(columns={"index": 'token'})
        df.insert(1, 'ngram', self.ngram)
        df = df.sort_values(by=['doc_frequency'], ascending=False)
        df.to_json(path_or_buf='global_vocab.json', orient='records', force_ascii=True)
        return df

# Load the corpus and keep only its first two rows.
data = pd.read_json('corpus.json')[:2]
# Each entry of the 'body' column is treated as one document.
corpus = data['body']
# n-gram size (2 = bigrams).
ngram = 2

tfidf = Tfidf(corpus, ngram)
# pprint(f"frequency:{tfidf.frequency_tf()[0]},  tf: {tfidf.frequency_tf()[1]}")
# pprint(f"doc_frequency:{tfidf.doc_frequency()}")
# pprint(f"tfidf: {tfidf.tfidf()}")
# Builds the vocabulary table; as a side effect it is also written to
# 'global_vocab.json'.
global_vocab = tfidf.global_vocab()
global_vocab

Unnamed: 0,token,ngram,doc_frequency,frequency_tf,idf,tfidf
33,us hc,2,2.0,3.0,-0.405465,-0.016005
6,us articles,2,2.0,3.0,-0.405465,-0.016005
32,support zoom,2,2.0,3.0,-0.405465,-0.016005
24,en us,2,2.0,3.0,-0.405465,-0.016005
8,zoom us,2,2.0,4.0,-0.405465,-0.02134
9,https support,2,2.0,3.0,-0.405465,-0.016005
17,hc en,2,2.0,3.0,-0.405465,-0.016005
37,would will,2,1.0,1.0,0.0,0.0
38,delete cloud,2,1.0,1.0,0.0,0.0
36,their own,2,1.0,1.0,0.0,0.0


In [57]:

class CorpusTokens:
    """
    Annotate each document of a corpus with the vocabulary records of the
    n-grams it contains.

    Parameters
    ---------
    data: pandas.DataFrame
        Corpus table; must have a 'body' column holding the document text.
    global_vocab: pandas.DataFrame
        Vocabulary table with one row per n-gram and at least a 'token'
        column.
    ngram: int, optional
        n-gram size used to tokenise the documents. When omitted, it is read
        from the 'ngram' column of ``global_vocab``, which must then hold a
        single distinct value.
    """

    def __init__(self, data, global_vocab, ngram=None):
        self.data = data
        self.global_vocab = global_vocab
        self.ngram = ngram

    def _ngram_size(self):
        """Return the explicit n-gram size, else the one recorded in the vocabulary."""
        if self.ngram is not None:
            return self.ngram
        if 'ngram' in self.global_vocab.columns:
            sizes = self.global_vocab['ngram'].unique()
            if len(sizes) == 1:
                return int(sizes[0])
        raise ValueError(
            "cannot infer the n-gram size from global_vocab['ngram']; "
            "pass ngram explicitly"
        )

    def text_with_tokens(self, text):
        """
        Look up the vocabulary record of every distinct n-gram in ``text``.

        Parameters
        ---------
        text: string

        Returns
        ---------
        tokens: list of dict
            One dict per distinct n-gram (column name -> value), in no
            particular order.

        Raises
        ---------
        IndexError
            If an n-gram of ``text`` has no row in the vocabulary.
        ValueError
            If no n-gram size was given and none can be inferred.
        """
        vocab = self.global_vocab
        size = self._ngram_size()
        tokens = []
        for unique_ngram in set(Ngram.get_ngrams(text, size)):
            matches = vocab[vocab['token'] == unique_ngram]
            # 'records' must be spelled out: the short form 'record' is
            # deprecated and rejected with a ValueError by pandas 2.x.
            tokens.append(matches.to_dict(orient='records')[0])
        return tokens

    def __call__(self):
        """
        Return a copy of ``data`` with an added 'token' column (the result of
        ``text_with_tokens`` for each 'body') and write that copy to
        'corpus_with_tokens.json'.
        """
        corpus_tokens = self.data.copy()
        corpus_tokens['token'] = corpus_tokens['body'].apply(self.text_with_tokens)
        corpus_tokens.to_json(path_or_buf='corpus_with_tokens.json', orient='records', force_ascii=True)
        return corpus_tokens

# Attach to every document the vocabulary records of the n-grams it contains.
# Calling the instance also writes the result to 'corpus_with_tokens.json'.
corpus_tokens = CorpusTokens(data, global_vocab)
corpus_with_tokens = corpus_tokens()
corpus_with_tokens

Unnamed: 0,id,pageTitle,subTitles,body,token
0,c2403ca8e7137ab026ce2f849a4efc85,Frequently asked questions,"[My Zoom account, Where can I find my account ...",You can find your account owner on your [Accou...,"[{'token': 'you can', 'ngram': 2, 'doc_frequen..."
1,61a8a773134071a2604a6788990f778c,Frequently asked questions about local and clo...,"[Cloud recording admin, Why is the auto delete...",If a user set their own [setting to auto delet...,"[{'token': 'take effect', 'ngram': 2, 'doc_fre..."


## n-gram TF-IDF with scikit-learn

In [None]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_tf(corpus, ngram):
    """
    Total count of every n-gram over the corpus, using scikit-learn.

    Parameters
    ---------
    corpus: iterable of string
        Documents to vectorise.
    ngram: int
        n-gram size; used as both bounds of ``ngram_range``.

    Returns
    ---------
    word_tf_sum: pandas.Series
        Indexed by n-gram; the count summed over all documents, sorted in
        descending order.
    """
    vectorizer = CountVectorizer(ngram_range=(ngram, ngram))
    X = vectorizer.fit_transform(corpus)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older releases.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    words = get_names()
    df_word_tf = pd.DataFrame(X.toarray(), columns=words)
    word_tf_sum = df_word_tf.sum().sort_values(ascending=False)
    return word_tf_sum

def calculate_tfidf(corpus, ngram, verbose=True):
    """
    TF-IDF weight of every n-gram, summed over the corpus, computed with
    ``CountVectorizer`` followed by ``TfidfTransformer``.

    Parameters
    ---------
    corpus: iterable of string
        Documents to vectorise.
    ngram: int
        n-gram size; used as both bounds of ``ngram_range``.
    verbose: bool, default True
        When true, print the per-document TF-IDF matrix before returning.

    Returns
    ---------
    wordFreqSum: pandas.Series
        Indexed by n-gram; the TF-IDF weight summed over all documents,
        sorted in descending order.
    """
    vectorizer = CountVectorizer(ngram_range=(ngram, ngram))
    X = vectorizer.fit_transform(corpus)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older releases.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    words = get_names()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    df_wordFreq = pd.DataFrame(tfidf.toarray(), columns=words)
    if verbose:
        print(df_wordFreq)
    # Sum each feature's TF-IDF weight over all documents, then sort in
    # descending order.
    wordFreqSum = df_wordFreq.sum().sort_values(ascending=False)
    return wordFreqSum

def calculate_tfidf2(corpus, ngram):
    """
    TF-IDF weight of every n-gram, summed over the corpus, computed in one
    step with ``TfidfVectorizer``.

    Parameters
    ---------
    corpus: iterable of string
        Documents to vectorise.
    ngram: int
        n-gram size; used as both bounds of ``ngram_range``.

    Returns
    ---------
    wordFreqSum: pandas.Series
        Indexed by n-gram; the TF-IDF weight summed over all documents,
        sorted in descending order.
    """
    vectorizer = TfidfVectorizer(ngram_range=(ngram, ngram))
    X = vectorizer.fit_transform(corpus)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older releases.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    words = get_names()
    df_wordFreq = pd.DataFrame(X.toarray(), columns=words)
    # Sum each feature's TF-IDF weight over all documents, then sort in
    # descending order.
    wordFreqSum = df_wordFreq.sum().sort_values(ascending=False)
    return wordFreqSum

# A notebook cell displays only the value of its last expression, so the
# Series returned by calculate_tf(...) is computed here but not shown.
calculate_tf(corpus, ngram)
calculate_tfidf(corpus, ngram)
# calculate_tfidf2(corpus, ngram)

   115005731743 account  115005756143 it  about the  account owner  \
0              0.154189          0.00000   0.154189       0.154189   
1              0.000000          0.16185   0.000000       0.000000   

   account page  account profile  articles 115005731743  \
0      0.154189         0.462566               0.154189   
1      0.000000         0.000000               0.000000   

   articles 115005756143  articles 204519819  auto delete  ...  us articles  \
0                0.00000             0.00000      0.00000  ...     0.109707   
1                0.16185             0.16185      0.16185  ...     0.230315   

      us hc  user set  web portal  will take  would will   you can  \
0  0.109707   0.00000    0.154189    0.00000     0.00000  0.154189   
1  0.230315   0.16185    0.000000    0.16185     0.16185  0.000000   

   your account   zoom us  zoom web  
0      0.308378  0.219413  0.154189  
1      0.000000  0.230315  0.000000  

[2 rows x 57 columns]


account profile          0.462566
zoom us                  0.449728
us hc                    0.340021
https support            0.340021
hc en                    0.340021
support zoom             0.340021
us articles              0.340021
en us                    0.340021
your account             0.308378
to auto                  0.161850
level setting            0.161850
not the                  0.161850
own setting              0.161850
115005756143 it          0.161850
it would                 0.161850
the group                0.161850
take effect              0.161850
recordings https         0.161850
set their                0.161850
their own                0.161850
user set                 0.161850
if user                  0.161850
cloud recordings         0.161850
would will               0.161850
articles 115005756143    0.161850
articles 204519819       0.161850
auto delete              0.161850
setting to               0.161850
will take                0.161850
delete cloud  