# Тематическое моделирование. BigARTM

Необходимо установить библиотеку BigARTM: http://docs.bigartm.org/en/latest/installation/linux.html

In [1]:
import artm

# Prepare data
# Case 1: data in CountVectorizer format
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from numpy import array

## Загрузка данных

In [2]:
categories = [
        'talk.politics.misc',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
        'rec.autos',
    ]
remove = ('headers', 'footers', 'quotes')

print("Loading 20 newsgroups dataset for categories:")
print(categories if categories else "all")

dataset = fetch_20newsgroups(subset='all', categories=categories,
                                shuffle=True, random_state=42, remove=remove)

print('data loaded')

Loading 20 newsgroups dataset for categories:
['talk.politics.misc', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.autos']
data loaded


## Предобработка данных

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

def preprocess_text(text):
    tokens = word_tokenize(text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    # stem the words
    # porter = PorterStemmer()
    # words = [porter.stem(word) for word in words]
    # lemmatize
    # lemmatizer = WordNetLemmatizer()
    # words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [4]:
print("Text preprocessing...")
for i in range(len(dataset.data)):
    dataset.data[i] = preprocess_text(dataset.data[i])
print(f'{i+1} texts were preprocessed')

Text preprocessing...
4353 texts were preprocessed


## Извлечение признаков

In [5]:
cv = CountVectorizer(max_features=1000, stop_words='english')
n_wd = array(cv.fit_transform(dataset.data).todense()).T
vocabulary = cv.get_feature_names()

In [6]:
bv = artm.BatchVectorizer(data_format='bow_n_wd',
                          n_wd=n_wd,
                          vocabulary=vocabulary)

## Обучение модели

In [7]:
# Learn simple LDA model (or you can use advanced artm.ARTM)
model = artm.LDA(num_topics=5, dictionary=bv.dictionary)
model.fit_offline(bv, num_collection_passes=20)

# Print results
model.get_top_tokens()

[['image',
  'jpeg',
  'file',
  'images',
  'available',
  'graphics',
  'software',
  'data',
  'files',
  'format'],
 ['nt',
  'like',
  'think',
  'good',
  'problem',
  'time',
  'people',
  'make',
  'way',
  'really'],
 ['nt',
  'president',
  'mr',
  'think',
  'going',
  'know',
  'stephanopoulos',
  'people',
  'said',
  'ms'],
 ['space',
  'car',
  'new',
  'earth',
  'launch',
  'nasa',
  'cars',
  'shuttle',
  'orbit',
  'mission'],
 ['people',
  'nt',
  'god',
  'jesus',
  'say',
  'know',
  'said',
  'life',
  'believe',
  'children']]