In [38]:
import re
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis


  and should_run_async(code)


##### Text Cleaning

In [39]:
def remove_punctuation(sentence):
    """Return ``sentence`` without any character that is neither a word character nor whitespace."""
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', sentence)

# Stop-word sets already loaded, keyed by file path. pre_processing() calls
# remove_stopwords() once per corpus line, so the file must not be re-read
# every time. Call _STOPWORDS_CACHE.clear() after editing a stop-word file.
_STOPWORDS_CACHE = {}


def _load_stopwords(path):
    """Return the stop words listed in ``path`` (one per line) as a cached frozenset."""
    if path not in _STOPWORDS_CACHE:
        # ``with`` guarantees the handle is closed even if reading fails;
        # explicit UTF-8 matches read_corpus() instead of the platform default.
        with open(path, "r", encoding="utf-8") as stopwords_file:
            _STOPWORDS_CACHE[path] = frozenset(
                # strip() also removes '\r' and stray spaces; entries are
                # lower-cased because candidates are lower-cased before lookup.
                line.strip().lower() for line in stopwords_file if line.strip()
            )
    return _STOPWORDS_CACHE[path]


def remove_stopwords(words_list, stopwords_path="stop_words_FULL.txt"):
    """Lower-case the given words and drop those that are stop words.

    Args:
        words_list: iterable of word strings.
        stopwords_path: text file with one stop word per line. Defaults to
            ``stop_words_FULL.txt`` in the current working directory.

    Returns:
        A list with the lower-cased words, in their original order, that are
        not in the stop-word file.

    Raises:
        OSError: if the stop-word file cannot be opened.
    """
    stopwords = _load_stopwords(stopwords_path)
    lowered_words = (value.lower() for value in words_list)
    return [word for word in lowered_words if word not in stopwords]

def tokenize_sentence(sentence):
    """Tokenize and POS-tag ``sentence`` and return the lemmas of selected tokens.

    Only tokens whose tag starts with ``NN``, ``VB``, ``RB`` or ``JJ`` are kept;
    each is lemmatized with the matching WordNet part of speech (noun, verb,
    adverb, adjective). Tokens with any other tag are dropped.

    Returns:
        The list of lemmas, in sentence order.
    """
    # Tag prefix -> name of the WordNet POS constant. The constant itself is
    # looked up on ``wn`` only when a token actually needs it.
    pos_constant_by_prefix = {"NN": "NOUN", "VB": "VERB", "RB": "ADV", "JJ": "ADJ"}

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, pos_tag in nltk.pos_tag(word_tokenize(sentence)):
        constant_name = pos_constant_by_prefix.get(pos_tag[:2])
        if constant_name is None:
            continue  # not a noun, verb, adverb or adjective tag
        lemmas.append(lemmatizer.lemmatize(token, pos=getattr(wn, constant_name)))
    return lemmas

def pre_processing(sentence):
    """Run the full cleaning pipeline on one sentence and return its remaining words.

    Steps, in order: remove_punctuation, tokenize_sentence, remove_stopwords.
    """
    without_punctuation = remove_punctuation(sentence)
    lemmas = tokenize_sentence(without_punctuation)
    return remove_stopwords(lemmas)

  and should_run_async(code)


In [40]:
def read_corpus(txt_file, preprocess=None):
    """Read a ``<doc> ... </doc>`` tagged text file into per-document word lists.

    A document starts on a line containing ``<doc`` and ends on the next line
    containing ``</doc>``; those two lines themselves contribute no words.
    Every line in between has its ``<p> `` / ``</p>`` markup removed and is
    passed to ``preprocess``; the returned words are appended to the document.
    Lines outside any document are ignored.

    Args:
        txt_file: path of the UTF-8 encoded corpus file.
        preprocess: callable mapping one line of text to a list of words.
            Defaults to ``pre_processing``.

    Returns:
        A list with one list of words per document, in file order. If the file
        ends before a closing ``</doc>``, the words collected so far for that
        last document are still returned (previously this case looped forever,
        because ``readline()`` keeps returning ``''`` at end of file).
    """
    if preprocess is None:
        preprocess = pre_processing

    documents = []
    with open(txt_file, encoding='utf-8') as file:
        document_words = None  # None while we are outside a <doc> section

        for line in file:
            if document_words is None:
                if "<doc" in line:  # tag for new doc
                    document_words = []
                continue

            if "</doc>" in line:  # end of the current doc
                documents.append(document_words)
                document_words = None
                continue

            # remove markup tags that carry no content
            cleaned_line = line.replace("<p> ", "").replace("</p>\n", "").replace("/p", "")
            document_words.extend(preprocess(cleaned_line))

        # File ended inside a document: keep what was read instead of hanging.
        if document_words is not None:
            documents.append(document_words)

    print("Documents number: ", len(documents))

    return documents

  and should_run_async(code)


##### Topic Modelling

In [41]:
num_topics = 10  # default number of LDA topics


def topic_modelling(doc, n_topics=None, random_state=None):
    """Train an LDA topic model on tokenized documents and print every topic.

    Args:
        doc: list of documents, each a list of word strings.
        n_topics: number of topics to fit. Defaults to the module-level
            ``num_topics``.
        random_state: forwarded to ``LdaModel``; pass an int to make the run
            repeatable. ``None`` keeps gensim's default behavior.

    Returns:
        A tuple ``(lda_model, corpus_freq)`` where ``corpus_freq`` is the
        bag-of-words representation of each document.
    """
    if n_topics is None:
        n_topics = num_topics

    dictionary_LDA = corpora.Dictionary(doc)

    # delete all terms that do NOT appear in at least 3 documents.
    # delete all terms that appear in more than 60% of documents (see filter_extremes official doc).
    dictionary_LDA.filter_extremes(no_below=3, no_above=0.6)

    corpus_freq = [dictionary_LDA.doc2bow(document) for document in doc]

    lda_model = models.LdaModel(
        corpus_freq,
        num_topics=n_topics,
        id2word=dictionary_LDA,
        passes=3,
        alpha='auto',
        chunksize=2000,
        random_state=random_state,
    )

    # range(num_topics) covers every topic; the former "num_topics - 1" bound
    # silently skipped the last one.
    for i in range(lda_model.num_topics):
        print("Topic ", i, ": \n", [lda_model.print_topic(i)])

    return lda_model, corpus_freq

  and should_run_async(code)


In [42]:
# Load the corpus as one list of pre-processed words per document
doc = read_corpus("Travel.txt")

# Call topic_modelling, which also prints the topics
model, corpus_freq = topic_modelling(doc)

  and should_run_async(code)


Documents number:  100
Topic  0 : 
 ['0.063*"clause" + 0.027*"result" + 0.026*"conditionals" + 0.024*"third" + 0.021*"example" + 0.019*"perfect" + 0.016*"happen" + 0.015*"continuous" + 0.015*"situation" + 0.014*"mixed"']
Topic  1 : 
 ['0.018*"travel" + 0.016*"book" + 0.012*"holiday" + 0.012*"article" + 0.011*"hotel" + 0.009*"journey" + 0.008*"word" + 0.008*"beach" + 0.007*"clause" + 0.007*"place"']
Topic  2 : 
 ['0.019*"word" + 0.009*"help" + 0.009*"example" + 0.009*"happen" + 0.009*"learn" + 0.009*"good" + 0.009*"language" + 0.008*"clause" + 0.008*"thing" + 0.007*"hotel"']
Topic  3 : 
 ['0.010*"late" + 0.009*"tomorrow" + 0.009*"travel" + 0.008*"happen" + 0.008*"ill" + 0.008*"video" + 0.008*"work" + 0.008*"eat" + 0.007*"holiday" + 0.007*"early"']
Topic  4 : 
 ['0.019*"video" + 0.018*"clause" + 0.018*"level" + 0.016*"esl" + 0.015*"grammar" + 0.014*"learn" + 0.012*"tense" + 0.011*"example" + 0.009*"conditionals" + 0.009*"teach"']
Topic  5 : 
 ['0.021*"travel" + 0.016*"holiday" + 0.012*"v

In [43]:
# Show the topics the model assigns to each document
print("Documents topic list")
for doc_index, bag_of_words in enumerate(corpus_freq):
    print("Doc", doc_index, ":", model[bag_of_words])
    

Documents topic list
Doc 0 : [(5, 0.9878122)]
Doc 1 : [(1, 0.8631471), (7, 0.1347647)]
Doc 2 : [(1, 0.12854713), (2, 0.5736394), (8, 0.2861763)]
Doc 3 : [(9, 0.9957798)]
Doc 4 : [(2, 0.26327273), (8, 0.7355012)]
Doc 5 : [(2, 0.7392884), (8, 0.26029447)]
Doc 6 : [(4, 0.9920392)]
Doc 7 : [(2, 0.4964141), (5, 0.36238417), (8, 0.13399187)]
Doc 8 : [(1, 0.18589719), (5, 0.5219294), (9, 0.29164323)]
Doc 9 : [(5, 0.997503)]
Doc 10 : [(0, 0.67480475), (2, 0.32435063)]
Doc 11 : [(0, 0.15071198), (3, 0.8472919)]
Doc 12 : [(1, 0.8569436), (6, 0.1369561)]
Doc 13 : [(4, 0.2995933), (8, 0.6980324)]
Doc 14 : [(1, 0.21767865), (7, 0.42973882), (9, 0.3493616)]
Doc 15 : [(2, 0.10276475), (8, 0.8885409)]
Doc 16 : [(0, 0.36017317), (4, 0.6347861)]
Doc 17 : [(5, 0.9979459)]
Doc 18 : [(0, 0.25965118), (1, 0.6043306), (2, 0.11283725), (6, 0.021578042)]
Doc 19 : [(0, 0.7434608), (6, 0.2539717)]
Doc 20 : [(0, 0.020177035), (2, 0.9786355)]
Doc 21 : [(0, 0.9971991)]
Doc 22 : [(0, 0.9972531)]
Doc 23 : [(0, 0.1706

  and should_run_async(code)


#### Visualization of the Most Relevant Terms per Topic

In [44]:
# Switch pyLDAvis to notebook mode before preparing the visualization
pyLDAvis.enable_notebook()
# Prepare the visualization data from the trained model, the bag-of-words
# corpus and the model's own id -> word dictionary
vis = gensimvis.prepare(model, corpus_freq, dictionary=model.id2word)

# Bare last expression so the notebook displays the prepared visualization
vis

  and should_run_async(code)
  default_term_info = default_term_info.sort_values(
