In [151]:
!pip install pyLDAvis



In [152]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [153]:
!python3 -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [154]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [7]:
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [8]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [9]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])



['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [10]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [11]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['where', 'thing', 'car', 'nntp_poste', 'host', 'park', 'line', 'wonder', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'name', 'engine', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


In [12]:
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 5), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1)]]


In [None]:
#Этот блок выдает ошибку "CalledProcessError"
mallet_path = r'C:\Users\Иришка\Downloads\mallet-2.0.8' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

In [13]:
def best_group():
  maxim = 0
  number = 0
  for i in range(10,30):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=i, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=5,
                                           alpha='auto',
                                           per_word_topics=True)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    if coherence_lda >= maxim:
      maxim = coherence_lda
      number = i
    return number

In [14]:
best_group()

KeyboardInterrupt: ignored

In [15]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=5,
                                           alpha='auto',
                                           per_word_topics=True)

In [16]:
topics = lda_model.show_topics(10, formatted=False)
#print(topics[0][1][1])

In [17]:
from collections import Counter

In [19]:
text_in_topic = {}
n=0
for text in texts:
  n+=1
  count = Counter()
  for word in text:
    for topic in topics:
      id = topic[0]
      for i in range(len(topic[1])):
        if topic[1][i][0] == word:
          count[id]+= topic[1][i][1]
  d = dict(count.most_common())
  max = 0
  i_cor = 0
  for i in d:
    if d.get(i) > max:
      max = d.get(i)
      i_cor = i
    text_in_topic.update({n : i_cor})
#print(text_in_topic[0])

In [20]:
groups = {}
for i in range(0,10):
  lis = []
  for key, value in text_in_topic.items():
    if value == i:
      lis.append(key)
    groups.update({i:lis})

In [21]:
def computeTF(t):
  numOfWords = dict.fromkeys(t, 0)
  for word in t:
    numOfWords[word] += 1
  tfDict = {}
  textCount = len(t)
  for word, count in numOfWords.items():
    tfDict[word] = count / float(textCount)
  return tfDict

In [22]:
tfs = {}
for i in range (0,10):
  num_text = groups[i]
  for n in num_text:
    need_text = texts[n-1]
    get_tf = computeTF(need_text) #список словарей (для каждого текста словарь со значениями)
    tfs.update({i: [n , get_tf]})
print(tfs[0])

[11288, {'wing': 0.036885245901639344, 'win': 0.01639344262295082, 'nntp_poste': 0.004098360655737705, 'host': 0.004098360655737705, 'ca': 0.004098360655737705, 'reply': 0.004098360655737705, 'line': 0.004098360655737705, 'article': 0.00819672131147541, 'write': 0.012295081967213115, 'also': 0.012295081967213115, 'think': 0.00819672131147541, 'hard': 0.012295081967213115, 'time': 0.020491803278688523, 'sportswriter': 0.004098360655737705, 'predict': 0.004098360655737705, 'probably': 0.00819672131147541, 'best': 0.004098360655737705, 'ever': 0.004098360655737705, 'see': 0.004098360655737705, 'figure': 0.004098360655737705, 'take': 0.004098360655737705, 'pretty': 0.012295081967213115, 'good': 0.028688524590163935, 'team': 0.05327868852459016, 'contend': 0.00819672131147541, 'look': 0.00819672131147541, 'unlikely': 0.00819672131147541, 'truly': 0.004098360655737705, 'great': 0.012295081967213115, 'would': 0.012295081967213115, 'able': 0.004098360655737705, 'make': 0.020491803278688523, 'm

In [23]:
import math

In [24]:
def computeIDF(documents):
    N = len(documents[0])
    idfDict = dict.fromkeys(documents[0], 0)
    for document in documents: 
        for word, val in document.items():
          if val > 0:
            if word in idfDict:
                idfDict[word] += 1
            else:
              idfDict.update({word:1})
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

In [25]:
idfs = {}
for i in range(0,10):
  num_tex = groups[i]
  for n in num_tex:
    ls = []
    new = []
    need_tex = texts[n-1]
    numOfWords_ = dict.fromkeys(need_tex, 0)
    for word in need_tex:
      numOfWords_[word] += 1
    ls.append(numOfWords_)
    #print(len(ls[0]))
  com_idf = computeIDF(ls)
  idfs.update({i: com_idf})
print(idfs)

{0: {'wing': 5.017279836814924, 'win': 5.017279836814924, 'nntp_poste': 5.017279836814924, 'host': 5.017279836814924, 'ca': 5.017279836814924, 'reply': 5.017279836814924, 'line': 5.017279836814924, 'article': 5.017279836814924, 'write': 5.017279836814924, 'also': 5.017279836814924, 'think': 5.017279836814924, 'hard': 5.017279836814924, 'time': 5.017279836814924, 'sportswriter': 5.017279836814924, 'predict': 5.017279836814924, 'probably': 5.017279836814924, 'best': 5.017279836814924, 'ever': 5.017279836814924, 'see': 5.017279836814924, 'figure': 5.017279836814924, 'take': 5.017279836814924, 'pretty': 5.017279836814924, 'good': 5.017279836814924, 'team': 5.017279836814924, 'contend': 5.017279836814924, 'look': 5.017279836814924, 'unlikely': 5.017279836814924, 'truly': 5.017279836814924, 'great': 5.017279836814924, 'would': 5.017279836814924, 'able': 5.017279836814924, 'make': 5.017279836814924, 'move': 5.017279836814924, 'push': 5.017279836814924, 'maybe': 5.017279836814924, 'maintain': 

In [None]:
from collections import Counter

In [149]:
all_tf_idf = []
for i in range (0,10):
  num = groups[i]
  ttfidf = []
  for n in num:
    need = texts[n-1]
    tf_ = tfs[i][1]
    idf_ = idfs[i]
    sor_words = []
    words = Counter()
    for word in need:
      if word in tf_ and word in idf_:
        tf = tf_[word]
        idf = idfs[i][word]
        tfidf = tf*idf
        #print(i,n,word,tfidf)
        words.update({word:tfidf})
    d = dict(words.most_common(5))
    sor_words.append(d)
    #print(sor_words[0])
    for w in sor_words:
      #print(w,n)
      for key, value in w.items():
        charact = [key , value, n, i]
        all_tf_idf.append(charact)
#print(all_tf_idf)

In [150]:
df = pd.DataFrame(all_tf_idf)
df.columns = ['word','tf_idf', 'text_number', 'topic']
df

Unnamed: 0,word,tf_idf,text_number,topic
0,year,1.028131,18,0
1,make,0.411252,18,0
2,get,0.370127,18,0
3,would,0.370127,18,0
4,good,0.287877,18,0
...,...,...,...,...
42884,enviroleague,1.337965,11277,9
42885,youth,1.057157,11277,9
42886,program,1.057157,11277,9
42887,organization,0.809386,11277,9


Coherence score - это показатель, измеряющий оценку одной темы (topic) путем оценки степени семантического сходства между словами в этой теме, обладающими наибольшими весами.