In [1]:
import numpy as np
import pandas as pd
import pickle
import gensim
import nltk
from nltk.corpus import stopwords as n_stp
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models, similarities
import pyLDAvis.gensim
import re
from nltk.stem import WordNetLemmatizer

In [2]:
# Fetch the NLTK corpora this notebook relies on: WordNet (lemmatizer)
# and the stop-word lists.
for resource in ('wordnet', 'stopwords'):
    nltk.download(resource)

# English stop words as a set for fast membership tests;
# n_stp is the nltk.corpus.stopwords reader imported at the top.
en_stop = set(n_stp.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bounouamustapha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bounouamustapha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Constants

In [3]:

# Location of the CSV file to load (remote URL).
FILE = "https://raw.githubusercontent.com/susanli2016/Machine-Learning-with-Python/master/dataset.csv"
# Stop-word set used when filtering tokens; reuses the NLTK English list built earlier.
STOP_WORDS = en_stop
# Number of topics the LDA model is asked to find.
NB_TOPIC = 5

## Load text 

In [4]:

# Load the file without a header row and rename the resulting column 0 to "text".
# NOTE(review): delimiter='\n' presumably aims at "one record per line"; recent
# pandas releases are reported to reject a newline separator — confirm against
# the installed pandas version before relying on this cell.
df = pd.read_csv(FILE,delimiter='\n',header=None ).rename(columns={0: "text"})

In [5]:
# Display the loaded frame to eyeball its content.
df

Unnamed: 0,text
0,Innovation in Database Management: Computer Sc...
1,High performance prime field multiplication fo...
2,enchanted scissors: a scissor interface for su...
3,Detection of channel degradation attack by Int...
4,Pinning a Complex Network through the Betweenn...
...,...
2502,A new QR-decomposition based recursive frequen...
2503,CNN Implementation of Spin Filters for Electro...
2504,FaceKit: A Database Interface Design Toolkit.
2505,On the trade-off between the number of scrolls...


## Preprocess

In [6]:
## Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item.
## It is similar to stemming, but it brings context to the words: forms that share the same meaning are linked to one word (the lemma).

def get_lemma(word):
    """Return the lemma of ``word`` as computed by NLTK's WordNetLemmatizer.

    A new lemmatizer is created on each call and ``lemmatize`` is invoked
    with its default arguments.

    Args:
        word: the token to lemmatize (str).

    Returns:
        The lemmatized token (str).
    """
    return WordNetLemmatizer().lemmatize(word)
## Example: lemmatize the plural form "rocks" and print the result
print(get_lemma("rocks"))


rock


In [7]:
def remove_punctuation(words):
    """Split a string on whitespace and strip punctuation from every piece.

    Any character that is neither a word character (letters, digits,
    underscore) nor whitespace is removed. Pieces that end up empty are
    dropped.

    Args:
        words: the text to tokenize (str).

    Returns:
        list of str: the non-empty, punctuation-free tokens in input order.
    """
    stripped = (re.sub(r'[^\w\s]', '', chunk) for chunk in words.split())
    return [piece for piece in stripped if piece != '']

In [8]:
### clean a sentence 

def clean_sentence(text):
    """Turn one raw sentence into a list of cleaned, lemmatized tokens.

    Processing order: remove @mentions, remove URLs, remove digits,
    lowercase, strip punctuation and split on whitespace, then drop stop
    words and tokens of two characters or fewer before lemmatizing the rest.

    Args:
        text: the raw sentence (str).

    Returns:
        list of str: the surviving tokens, lemmatized, in input order.
    """
    # Raw string literals keep "\s" and "\d" as regex escapes; in a plain
    # literal "\s" is an invalid Python escape and triggers a warning.
    text = re.sub(r'@[^\s]+', '', text)  # remove @mentions

    # The dot after "www" is escaped so only a literal "www." prefix matches,
    # not "www" followed by an arbitrary character.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', text)  # remove urls

    text = re.sub(r'\d+', '', text)  # remove digits

    text = text.lower()  # lowercase

    tokens = remove_punctuation(text)

    # Stop words are filtered on the raw token, i.e. before lemmatization.
    tokens = [
        get_lemma(token)
        for token in tokens
        if token not in STOP_WORDS and len(token) > 2
    ]

    return tokens


# Quick check on a sample containing a number, stop words, a @mention and punctuation.
clean_sentence("Computer is 98 and for a Hello world @adel ?..")

  text = re.sub('@[^\s]+', '', text) ## remove @@ mentions
  text = re.sub('((www.[^\s]+)|(https?://[^\s]+))', '', text) ## remove urls


['computer', 'hello', 'world']

In [9]:
## return a list with one list of cleaned tokens per document
def prepare_for_lda(df):
    """Clean every entry of the "text" column of ``df``.

    Args:
        df: a DataFrame with a "text" column holding the raw sentences.

    Returns:
        list: one element per row, each being the token list produced by
        ``clean_sentence`` for that row (documents are kept separate, not
        flattened into a single sequence).
    """
    return [clean_sentence(sentence) for sentence in df["text"].to_numpy()]


In [10]:
# Clean and tokenize every row of df; the result holds one token list per row.
dataset = prepare_for_lda(df)

In [11]:
# Number of processed documents.
len(dataset)

2507

In [15]:
## Lda
def ldaTopTopics(texts, total_topics, nbwords, random_state=None, output_dir='data'):
    """Fit an LDA topic model on tokenized documents and save the artefacts.

    The function builds a gensim dictionary and bag-of-words corpus from
    ``texts``, writes both to ``output_dir``, weights the corpus with TF-IDF,
    trains an ``LdaModel`` on the weighted corpus, and saves the model.

    Files written (relative to ``output_dir``):
        corpus.pkl         -- pickled bag-of-words corpus (not TF-IDF weighted)
        dictionary.gensim  -- token <-> id mapping
        model.gensim       -- trained LDA model

    Args:
        texts: list of documents, each a list of str tokens.
        total_topics: number of topics to fit.
        nbwords: accepted for backward compatibility; it does not influence
            training. Pass it to ``show_topics`` on the returned model to
            choose how many terms are listed per topic.
        random_state: optional seed forwarded to ``LdaModel`` so runs can be
            reproduced. ``None`` (default) keeps gensim's unseeded behaviour.
        output_dir: directory receiving the saved artefacts (default 'data').
            It is created when missing.

    Returns:
        The trained ``gensim.models.LdaModel``.
    """
    # Local import keeps this cell self-contained; the notebook's import
    # cell does not bring in ``os``.
    import os

    # A fresh checkout has no output directory yet.
    os.makedirs(output_dir, exist_ok=True)

    # Create the dictionary: every distinct token is assigned an integer id.
    dictionary = corpora.Dictionary(texts)

    # Create the corpus: each document becomes a bag of (token_id, count) pairs.
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Save the corpus; the context manager closes the file even on error.
    with open(os.path.join(output_dir, 'corpus.pkl'), 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    # Save the dictionary.
    dictionary.save(os.path.join(output_dir, 'dictionary.gensim'))

    # Use TF-IDF to re-weight the corpus before topic modelling.
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # Build the model with the requested number of topics (the argument,
    # not the notebook-level NB_TOPIC constant).
    lda = models.LdaModel(
        corpus_tfidf,
        id2word=dictionary,
        num_topics=total_topics,
        random_state=random_state,
    )

    # Save the model.
    lda.save(os.path.join(output_dir, 'model.gensim'))

    return lda




In [16]:
# Train the model, then display the 10 highest-weighted terms of each topic.
lda_model = ldaTopTopics(dataset,NB_TOPIC,nbwords=10)
lda_model.show_topics(NB_TOPIC, 10)


[(0,
  '0.003*"algorithm" + 0.003*"system" + 0.003*"using" + 0.003*"admission" + 0.002*"translation" + 0.002*"cost" + 0.002*"general" + 0.002*"spectrum" + 0.002*"display" + 0.002*"interactive"'),
 (1,
  '0.007*"query" + 0.005*"method" + 0.005*"web" + 0.004*"system" + 0.004*"database" + 0.004*"data" + 0.003*"level" + 0.003*"fast" + 0.003*"video" + 0.003*"detection"'),
 (2,
  '0.008*"network" + 0.006*"wireless" + 0.005*"information" + 0.005*"data" + 0.004*"semantic" + 0.004*"design" + 0.004*"mobile" + 0.004*"hoc" + 0.004*"sensor" + 0.003*"system"'),
 (3,
  '0.005*"data" + 0.005*"network" + 0.004*"system" + 0.004*"application" + 0.004*"wireless" + 0.003*"ray" + 0.003*"database" + 0.003*"design" + 0.003*"web" + 0.003*"engine"'),
 (4,
  '0.006*"web" + 0.005*"data" + 0.004*"network" + 0.004*"new" + 0.004*"sensor" + 0.004*"internet" + 0.003*"search" + 0.003*"service" + 0.003*"reduction" + 0.003*"view"')]

### VISUALIZATION 

In [17]:
# Reload the dictionary and bag-of-words corpus saved during training.
dictionary = gensim.corpora.Dictionary.load('data/dictionary.gensim')
# pickle.load can execute arbitrary code: only open a corpus file this
# notebook wrote itself. The context manager closes the file afterwards.
with open('data/corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
# NOTE(review): newer pyLDAvis releases reportedly expose this module as
# pyLDAvis.gensim_models — confirm against the installed version.
lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
## Saliency: a measure of how much the term tells you about the topic. 