In [30]:
import os
import pickle
import re

import gensim
import nltk
import numpy as np
import pandas as pd
import pyLDAvis.gensim
from gensim import corpora, models, similarities
from nltk.corpus import stopwords as n_stp
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.stem.wordnet import WordNetLemmatizer

In [31]:
# Make sure the NLTK resources used by the preprocessing cells are available
# locally, then build the English stop-word set used for token filtering.
for resource in ('wordnet', 'stopwords'):
    nltk.download(resource)
en_stop = set(n_stp.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bounouamustapha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bounouamustapha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Constants

In [32]:

# Location of the raw dataset; read with pandas in the "Load text" section.
FILE = "https://raw.githubusercontent.com/susanli2016/Machine-Learning-with-Python/master/dataset.csv"
# English stop-word set built from NLTK above; clean_sentence drops these tokens.
STOP_WORDS = en_stop
# Number of LDA topics requested when the model is trained and displayed.
NB_TOPIC = 5

## Load text 

In [33]:

# Read the file as a single headerless column, one record per line, and name
# that column "text".
# NOTE(review): newer pandas releases are reported to reject '\n' as a
# delimiter -- confirm against the installed pandas version.
df = (
    pd.read_csv(FILE, delimiter='\n', header=None)
    .rename(columns={0: "text"})
)

In [34]:
# Display the loaded frame as this cell's output.
df

Unnamed: 0,text
0,Innovation in Database Management: Computer Sc...
1,High performance prime field multiplication fo...
2,enchanted scissors: a scissor interface for su...
3,Detection of channel degradation attack by Int...
4,Pinning a Complex Network through the Betweenn...
...,...
2502,A new QR-decomposition based recursive frequen...
2503,CNN Implementation of Spin Filters for Electro...
2504,FaceKit: A Database Interface Design Toolkit.
2505,On the trade-off between the number of scrolls...


## Preprocess

In [35]:
## Lemmatization groups the different inflected forms of a word so that they can be analysed as a single item. It is similar to stemming, but
## it takes the meaning of the word into account, so related forms of a word are mapped to one base form (the lemma).

def get_lemma(word):
    """Return the WordNet lemma of ``word``.

    The lemmatizer is called without a part-of-speech argument, so its
    default tagging is used.

    Args:
        word: The token to lemmatize, as a string.

    Returns:
        The lemmatized form of ``word``.
    """
    return WordNetLemmatizer().lemmatize(word)


## example
print(get_lemma("rocks"))


rock


In [36]:
def remove_punctuation(words):
    """Split a string on whitespace and strip punctuation from each piece.

    Every character that is neither a word character nor whitespace is
    removed from each whitespace-separated piece; pieces that end up empty
    (pure punctuation) are dropped.

    Args:
        words: The text to tokenize, as a single string.

    Returns:
        list[str]: The non-empty, punctuation-free tokens in original order.
    """
    stripped = (re.sub(r'[^\w\s]', '', piece) for piece in words.split())
    return [token for token in stripped if token != '']

In [37]:
### clean a sentence 

def clean_sentence(text):
    """Turn one raw sentence into a list of cleaned, lemmatized tokens.

    Steps, in order: strip @mentions, strip URLs, strip digits, lowercase,
    split into punctuation-free tokens (``remove_punctuation``), drop stop
    words (``STOP_WORDS``) and tokens of two characters or fewer, then
    lemmatize what is left (``get_lemma``).

    Args:
        text: The sentence to clean, as a string.

    Returns:
        list[str]: The lemmatized tokens that survived filtering.
    """
    # Raw strings keep the regex escapes (\S, \d, \.) away from Python's own
    # string-escape handling, which warns about sequences such as "\s".
    text = re.sub(r'@\S+', '', text)  # remove @mentions

    # The dot after "www" is escaped so only a literal "www." prefix counts as
    # a URL; unescaped, it matched any character and could delete ordinary
    # words that merely start with "www".
    text = re.sub(r'(www\.\S+)|(https?://\S+)', '', text)  # remove URLs

    text = re.sub(r'\d+', '', text)  # remove digits

    text = text.lower()

    tokens = remove_punctuation(text)

    # Filter before lemmatizing so stop words and very short tokens never
    # reach the lemmatizer.
    return [
        get_lemma(token)
        for token in tokens
        if token not in STOP_WORDS and len(token) > 2
    ]


clean_sentence("Computer is 98 and for a Hello world @adel ?..")

  text = re.sub('@[^\s]+', '', text) ## remove @@ mentions
  text = re.sub('((www.[^\s]+)|(https?://[^\s]+))', '', text) ## remove urls


['computer', 'hello', 'world']

In [38]:
## return a list of token lists (one list of cleaned words per row of the frame)
def prepare_for_lda(df):
    """Tokenize every row of ``df["text"]`` for topic modelling.

    Args:
        df: DataFrame holding the raw sentences in its "text" column.

    Returns:
        list: One entry per row, each being the result of ``clean_sentence``
        applied to that row's text.
    """
    sentences = df["text"].to_numpy()
    return list(map(clean_sentence, sentences))


In [39]:
# Clean and tokenize every row of df["text"]; the result feeds the LDA step.
dataset = prepare_for_lda(df)

In [40]:
# Number of tokenized documents (one per row of df).
len(dataset)

2507

In [41]:
## LDA
def ldaTopTopics(texts, total_topics, nbwords, random_state=None):
    """Train an LDA topic model on tokenized documents and save its artifacts.

    A gensim dictionary and a bag-of-words corpus are built from ``texts`` and
    written to ``data/dictionary.gensim`` and ``data/corpus.pkl``. An
    ``LdaModel`` is then fitted on the bag-of-words corpus, saved as
    ``data/model.gensim`` and returned.

    Args:
        texts: Sequence of documents, each a list of string tokens.
        total_topics: Number of topics the model is fitted with.
        nbwords: Accepted for backward compatibility; the returned model does
            not depend on it. Pass it to ``show_topics`` on the result to
            control how many words are listed per topic.
        random_state: Optional seed forwarded to ``LdaModel`` so that repeated
            runs can produce the same topics. ``None`` (the default) leaves
            the model unseeded, as before.

    Returns:
        The trained ``gensim.models.LdaModel``.
    """
    # Assign an integer id to every distinct token.
    dictionary = corpora.Dictionary(texts)

    # Encode each document as a bag of words using those ids.
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Artifacts go to a relative "data" directory; create it so the saves do
    # not fail on a fresh checkout.
    os.makedirs('data', exist_ok=True)

    # Save the corpus; the context manager closes the file even on error.
    with open('data/corpus.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    # Save the dictionary.
    dictionary.save('data/dictionary.gensim')

    # Fit the model with the requested number of topics (not the notebook-level
    # constant), so the parameter actually controls the result.
    lda = models.LdaModel(
        corpus,
        id2word=dictionary,
        num_topics=total_topics,
        random_state=random_state,
    )

    # Save the model.
    lda.save('data/model.gensim')

    return lda




In [42]:
# Fit the model with NB_TOPIC topics, then list 10 weighted words for each topic.
lda_model = ldaTopTopics(dataset,NB_TOPIC,nbwords=10)
lda_model.show_topics(NB_TOPIC, 10)


[(0,
  '0.020*"data" + 0.013*"system" + 0.010*"based" + 0.008*"information" + 0.007*"architecture" + 0.006*"high" + 0.006*"base" + 0.006*"web" + 0.005*"power" + 0.005*"fast"'),
 (1,
  '0.014*"web" + 0.010*"model" + 0.009*"new" + 0.008*"query" + 0.008*"design" + 0.007*"application" + 0.007*"service" + 0.007*"using" + 0.006*"based" + 0.006*"linear"'),
 (2,
  '0.026*"network" + 0.022*"system" + 0.017*"wireless" + 0.009*"sensor" + 0.008*"control" + 0.007*"database" + 0.007*"using" + 0.006*"routing" + 0.006*"design" + 0.006*"coding"'),
 (3,
  '0.022*"network" + 0.012*"algorithm" + 0.010*"internet" + 0.008*"analysis" + 0.007*"application" + 0.007*"query" + 0.007*"search" + 0.007*"data" + 0.006*"design" + 0.005*"low"'),
 (4,
  '0.016*"using" + 0.014*"web" + 0.013*"data" + 0.012*"network" + 0.008*"mobile" + 0.008*"system" + 0.007*"sensor" + 0.005*"control" + 0.005*"approach" + 0.005*"wireless"')]

### VISUALIZATION 

In [43]:
# Reload the artifacts that the training step saved under data/.
dictionary = gensim.corpora.Dictionary.load('data/dictionary.gensim')
# pickle.load can execute arbitrary code from the file it reads; this is only
# acceptable because data/corpus.pkl is written by this notebook itself.
# The context manager closes the file handle once the corpus is loaded.
with open('data/corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
# Build the interactive topic view; sort_topics=False is passed so the topic
# order is not rearranged by the visualization.
lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
## Saliency: a measure of how much the term tells you about the topic. 