<a href="https://colab.research.google.com/github/LizzyZhang-tutu/NLP_Learning/blob/master/topic_modeler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from nltk.tokenize import RegexpTokenizer  
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim import models, corpora
import nltk
nltk.download('stopwords')

# Load input data
def load_data(input_file):
    """Read a text file and return its lines as a list of strings.

    Args:
        input_file: Path of the text file to read.

    Returns:
        A list with one entry per line of the file, in file order, with
        the trailing newline removed from each entry. An empty file
        yields an empty list.
    """
    data = []
    with open(input_file, 'r') as f:
        # Iterate the file object directly; there is no need to build the
        # intermediate list that readlines() would create.
        for line in f:
            # Strip only a newline that is actually there. The last line
            # of a file frequently has none, and slicing with [:-1] would
            # silently drop its final character.
            data.append(line.rstrip('\n'))

    return data

# Processor function for tokenizing, removing stop 
# words, and stemming
def process(input_text):
    """Tokenize, stop-word filter and stem a piece of English text.

    Args:
        input_text: The string to process.

    Returns:
        A list of stemmed tokens, in order of appearance, built from the
        lowercased tokens of input_text that are not English stop words.
    """
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer
    stemmer = SnowballStemmer('english')

    # Get the stop words as a set: every token is tested for membership,
    # and a set lookup avoids rescanning the whole list each time.
    stop_words = set(stopwords.words('english'))

    # Tokenize the lowercased input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Drop the stop words and stem what remains in a single pass
    return [stemmer.stem(x) for x in tokens if x not in stop_words]
    
if __name__ == '__main__':
    # Read the input file; each line is treated as one document
    data = load_data('data.txt')

    # Convert every document into its list of processed tokens
    tokens = [process(document) for document in data]

    # Build the token dictionary from the processed documents
    dict_tokens = corpora.Dictionary(tokens)

    # Express each document as a bag of words over that dictionary
    doc_term_mat = [dict_tokens.doc2bow(doc_tokens) for doc_tokens in tokens]

    # Number of topics the LDA model should find
    num_topics = 2

    # Train the LDA model on the bag-of-words corpus
    ldamodel = models.ldamodel.LdaModel(
        doc_term_mat,
        num_topics=num_topics,
        id2word=dict_tokens,
        passes=25,
    )

    # Number of top words to report for each topic
    num_words = 5
    print('\nTop ' + str(num_words) + ' contributing words to each topic:')

    topics = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
    for topic_id, topic_description in topics:
        print('\nTopic', topic_id)

        # The description is a ' + '-separated list of weight*"word" terms
        list_of_strings = topic_description.split(' + ')
        print(list_of_strings)

        # Report each word with its weight expressed as a percentage
        for term in list_of_strings:
            pieces = term.split('*')
            weight = pieces[0]
            word = pieces[1]
            percentage = round(float(weight) * 100, 2)
            print(word, '==>', str(percentage) + '%')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.

Top 5 contributing words to each topic:

Topic 0
['0.053*"empir"', '0.038*"time"', '0.038*"histor"', '0.038*"peopl"', '0.038*"expand"']
"empir" ==> 5.3%
"time" ==> 3.8%
"histor" ==> 3.8%
"peopl" ==> 3.8%
"expand" ==> 3.8%

Topic 1
['0.034*"mathemat"', '0.024*"europ"', '0.024*"cultur"', '0.024*"structur"', '0.024*"set"']
"mathemat" ==> 3.4%
"europ" ==> 2.4%
"cultur" ==> 2.4%
"structur" ==> 2.4%
"set" ==> 2.4%
