# Екатерина Кострыкина БКЛ181
#### Homework 3

#### Imports

In [1]:
import re
import numpy as np
import pandas as pd
import random
from string import punctuation
from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Loading data

In [2]:
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


#### Preprocessing data

In [3]:
def clean(text):
    text = re.sub('\S*@\S*\s?', '', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub("\'", "", text)    
    text = ' '.join([i for i in text.split() if i not in punctuation])
    return text

In [4]:
df['text'] = [clean(i) for i in df['content']]
df.head()

Unnamed: 0,content,target,target_names,text
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos,From: (wheres my thing) Subject: WHAT car is t...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware,From: (Guy Kuo) Subject: SI Clock Poll Final C...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware,From: (Thomas E Willis) Subject: PB questions....
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics,From: (Joe Green) Subject: Re: Weitek P9000 Or...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space,From: (Jonathan McDowell) Subject: Re: Shuttle...


In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(df.text.tolist()))

In [6]:
bigrams = [b for l in data_words for b in zip(l[:-1], l[1:])]
freq = nltk.FreqDist(bigrams) #computes freq of occurrence
fdist = freq.keys() # sorted according to freq

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [7]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [8]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [9]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

mallet никак не работает, ни в колабе ни в джупитере

In [10]:
mallet_path = '/mallet-2.0.8/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

CalledProcessError: Command '/mallet-2.0.8/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input C:\Users\1\AppData\Local\Temp\b53c8_corpus.txt --output C:\Users\1\AppData\Local\Temp\b53c8_corpus.mallet' returned non-zero exit status 1.

Let's find the optimal number of topics for this dataset calculating the quality using coherence score

In [11]:
def get_number_of_topics(number_of_topics, corpus, id2word, data_lemmatized):
    coherence_scores = []
    for number in number_of_topics:
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=number, 
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=100,
                                                    passes=10,
                                                    alpha='auto',
                                                    per_word_topics=True)
        
        coherence_model_lda = CoherenceModel(model=lda_model, 
                                             texts=data_lemmatized, 
                                             dictionary=id2word, 
                                             coherence='c_v')
        
        coherence_lda = coherence_model_lda.get_coherence()
        print(f'number_of_topics = {number}')
        print(f'coherence score: {coherence_lda}')
        print('##################')
        coherence_scores.append((coherence_lda, number))
        
    coherence_scores = sorted(coherence_scores, reverse=True)
    print(f'optimal number of topics = {coherence_scores[0][1]}')
    print(f'best coherence score: {coherence_scores[0][0]}')
    return coherence_scores[0][1]

In [12]:
best_number_of_topics = get_number_of_topics([5, 8, 10, 15, 20], corpus, id2word, data_lemmatized)

number_of_topics = 5
coherence score: 0.45891486844074675
##################
number_of_topics = 8
coherence score: 0.482693972259848
##################
number_of_topics = 10
coherence score: 0.489854593921089
##################
number_of_topics = 15
coherence score: 0.44662937295144844
##################
number_of_topics = 20
coherence score: 0.4349993277965744
##################
optimal number of topics = 10
best coherence score: 0.489854593921089


The optimal number of topics for this dataset is **10**

In [13]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=best_number_of_topics, 
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=100,
                                                    passes=10,
                                                    alpha='auto',
                                                    per_word_topics=True)

In [14]:
topics = lda_model.show_topics(best_number_of_topics, num_words=50, formatted=False) 
doc_lda = lda_model[corpus]

In [15]:
topics[0][1][:10]

[('year', 0.029839259),
 ('team', 0.025505481),
 ('game', 0.023295127),
 ('play', 0.017700817),
 ('win', 0.016438685),
 ('player', 0.0128361285),
 ('last', 0.010361701),
 ('choose', 0.009891362),
 ('next', 0.009800661),
 ('run', 0.009388538)]

Here we are going to find the main topics for each text from dataset

In [28]:
def get_topic_dic(topics):
    topic_dic = {}
    for idx, topic in enumerate(topics):
        dic = {}
        for word in topic[1]:
            dic[word[0]] = word[1]
        topic_dic[f'{idx}'] = dic
    return topic_dic


def get_main_topics(data, number_of_topics):
    main_topics = {}
    topic_dic = get_topic_dic(topics)
    for idx, text in enumerate(data):
        main_topic = {}
        for word in text:
            for num_topic in topic_dic.keys():
                if word in topic_dic[num_topic]:
                    main_topic[num_topic] = float(topic_dic[num_topic][word])
        if main_topic:
            main_topics[idx] = sorted(main_topic.items(), key=lambda x: -x[1])[0][0]
        else:
            main_topics[idx] = str(random.randint(0, number_of_topics-1))
    return main_topics

In [31]:
main_topics = get_main_topics(data_lemmatized, best_number_of_topics)

In [32]:
df_text = pd.DataFrame()
df_text['text'] = df['text']
df_text['lemmas'] = [' '.join(i) for i in data_lemmatized]
df_text['main_topic'] = pd.DataFrame(main_topics, index=[0]).T

In [33]:
df_text

Unnamed: 0,text,lemmas,main_topic
0,From: (wheres my thing) Subject: WHAT car is t...,where thing car nntp_poste host park line wond...,3
1,From: (Guy Kuo) Subject: SI Clock Poll Final C...,poll final call summary final call clock repor...,6
2,From: (Thomas E Willis) Subject: PB questions....,engineering computer network distribution_usa ...,1
3,From: (Joe Green) Subject: Re: Weitek P9000 Or...,division line host write write article know ch...,9
4,From: (Jonathan McDowell) Subject: Re: Shuttle...,question distribution article write clear caut...,4
...,...,...,...
11309,From: (Jim Zisfein) Subject: Re: Migraines and...,scan city reply line consultation cheap also w...,9
11310,From: Subject: Screen Death: Mac Plus/512 Line...,screen medford old problem screen blank someti...,6
11311,From: (Will Estes) Subject: Mounting CPU Coole...,este mount mail group line instal try mount co...,7
11312,From: (Steven Collins) Subject: Re: Sphere fro...,line article write boy embarasse trivial faq g...,1


just checking that everything is ok

In [34]:
df_text.main_topic.unique()

array(['3', '6', '1', '9', '4', '8', '0', '2', '5', '7'], dtype=object)

Computing TF-IDF for each topic  group and getting 5 words with the highest TF-IDF for every text in topic group

In [35]:
def get_tfidf_top5(text):
    feature_names = vectorizer.get_feature_names()
    feature_index = text[0,:].nonzero()[1]
    tfidf_scores = zip(feature_index, [text[0, x] for x in feature_index])
    tfidf_scores = [(feature_names[i], s) for (i, s) in tfidf_scores]
    top5 = ', '.join([i[0] for i in sorted(tfidf_scores, key=lambda x: -x[1])[:5]])
    return top5

In [36]:
top5_tfidf = {}
for topic in df_text.main_topic.unique():
    data = df_text['lemmas'][df_text['main_topic'] == f'{topic}']
    idxs = df_text.index[df_text['main_topic'] == f'{topic}'].tolist()
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit(data)
    for idx in idxs:
        text = vectorizer.transform([data.loc[idx]])
        top5 = get_tfidf_top5(text)
        top5_tfidf[f'{idx}'] = top5

In [38]:
df_text['top5_tfidf'] = pd.DataFrame(sorted(top5_tfidf.items(), key=lambda x: (int(x[0]), x[1])))[1]
df_text.head()

Unnamed: 0,text,lemmas,main_topic,top5_tfidf
0,From: (wheres my thing) Subject: WHAT car is t...,where thing car nntp_poste host park line wond...,3,"car, door, tellme, lerxst, funky"
1,From: (Guy Kuo) Subject: SI Clock Poll Final C...,poll final call summary final call clock repor...,6,"poll, clock, upgrade, final, speed"
2,From: (Thomas E Willis) Subject: PB questions....,engineering computer network distribution_usa ...,1,"powerbook, display, machine, bunch, hear"
3,From: (Joe Green) Subject: Re: Weitek P9000 Or...,division line host write write article know ch...,9,"division, chip, weitek, quadrilateral, winter"
4,From: (Jonathan McDowell) Subject: Re: Shuttle...,question distribution article write clear caut...,4,"error, warn, bug, memory, expect"


In [41]:
df_text.to_csv('homework_3.csv', index=False)

### Coherence score

* Это мера, используемая для оценки тематических моделей, то есть методов, которые автоматически генерируют темы из коллекции документов.
* Определяется как среднее или медианное значение всех косинусных близостей. 
* Чем выше coherence score, тем более связными получились темы.