In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem import WordNetLemmatizer 
from tqdm import tqdm
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Danila\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Danila\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Danila\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
lemmatizer = WordNetLemmatizer()

for text in newsgroups_train.data:
    tmp  = nltk.word_tokenize(text)
    text = " ".join([lemmatizer.lemmatize(w) for w in tmp])


In [3]:
features   = 4500
components = 20
top_words  = 10

vectorizer = CountVectorizer(lowercase    = True, 
                             stop_words   = _stop_words.ENGLISH_STOP_WORDS,
                             analyzer     = 'word', 
                             binary       = True,
                             max_df       = 0.95, 
                             min_df       = 2,
                             max_features = features
)
# одновременно создали словарь и преобразовали строку в вектор
train = vectorizer.fit_transform(newsgroups_train.data).toarray()

In [4]:
len(vectorizer.vocabulary_)

4500

In [9]:
class customLDA:
    def __init__(self, components=10, alpha=None, beta=None, max_iter=50):
        self._components = components
        self._alpha      = alpha
        self._beta       = beta
        self._max_iter   = max_iter
                
        self._cnt_k = None    # колво слов в теге k по всем документам
        self._cnt_w = None    # колво раз сколько слово w было в теге k
        self._cnt_d = None    # количество вхождений тега k в документе d

        self._fit_Is = False

    def fit(self, main_matrix):
        self._cnt_k  = np.zeros(self._components)                                  
        self._cnt_w = np.zeros((self._components, main_matrix.shape[1]))          
        self._cnt_d = np.zeros((main_matrix.shape[0], self._components))         

        if self._alpha == None:  self._alpha = np.ones(self._components)
        if self._beta  == None:  self._beta  = np.ones(main_matrix.shape[1])

        documn_, word_ =  main_matrix.nonzero()
        z = np.random.choice(self._components, len(documn_))

        for i,j,k in zip(documn_, word_, z):
            self._cnt_k[k]    += 1
            self._cnt_w[k, j] += 1
            self._cnt_d[i, k] += 1
        
        for i in tqdm(range(self._max_iter)):
            for j in range(len(documn_)):
                current_word = word_[j]
                current_dc   = documn_[j]
                current_tag  = z[j]
                self._cnt_d[current_dc, current_tag] -= 1
                self._cnt_w[current_tag, current_word] -= 1
                self._cnt_k[current_tag] -= 1
                p = (self._cnt_d[current_dc, :] + self._alpha) * (self._cnt_w[:, current_word] + self._beta[current_word]) / (self._cnt_k + self._beta.sum())
                z[j] = np.random.choice(self._components, p = p / p.sum())
                self._cnt_d[current_dc, z[j]] += 1
                self._cnt_w[z[j], current_word] += 1
                self._cnt_k[z[j]] += 1
        
        self._fit_Is = True
        return self
    
    def get_table_tags_and_word(self):
        if self._fit_Is:
            return self._cnt_w

        
        

In [6]:
lda = customLDA(components, max_iter=50)
lda.fit(train)

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [16:12<00:00, 19.45s/it]


<__main__.customLDA at 0x2593c40a790>

In [8]:
result = np.argsort(lda.get_table_tags_and_word(), axis=1)[:, -top_words:]

for i in range(components):
    matrix = np.zeros((1, train.shape[1]))
    for j in result[i]:
        matrix[0, j] = 1
    print('Tag {} \t{}'.format(i + 1, '\t'.join(vectorizer.inverse_transform(matrix)[0])))

Tag 1 	does	don	edu	good	just	know	like	soon	ve	work
Tag 2 	believe	bible	christian	does	fact	god	jesus	people	say	true
Tag 3 	better	don	going	good	like	make	think	time	ve	want
Tag 4 	card	disk	drive	hard	new	pc	sale	software	use	video
Tag 5 	government	history	law	military	people	rights	state	states	war	world
Tag 6 	does	help	hi	know	problem	program	thanks	use	using	windows
Tag 7 	does	don	just	know	like	people	right	say	think	way
Tag 8 	current	high	large	low	power	use	used	using	way	work
Tag 9 	came	day	did	got	old	said	saw	started	told	went
Tag 10 	bike	car	cars	engine	good	just	like	little	new	road
Tag 11 	1993	april	earth	information	nasa	national	research	science	space	university
Tag 12 	chip	clipper	encryption	government	key	keys	phone	public	use	using
Tag 13 	don	good	just	know	like	really	said	thing	think	ve
Tag 14 	ago	just	look	make	money	new	pay	time	year	years
Tag 15 	case	cause	certain	common	effect	people	question	similar	use	usually
Tag 16 	10	11	12	13	14	15	16	20	24	