# Import

In [62]:

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer 
from gensim import models, corpora
from nltk.corpus import stopwords

# Loading the input data

In [63]:

def load_data(input_file):
    data = []
    with open(input_file, 'r') as f:
        for line in f.readlines():
            data.append(line[:-1])

    return data

# Function 'process' processess the input text

In [64]:

def process(input_text):
    # Regular expression tokenizer # Snowball stemmer # Getting the list of stop words 
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer('english')
    stop_words = stopwords.words('english')
    
    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words 
    tokens = [x for x in tokens if not x in stop_words]
    
    # Perform stemming on the tokenized words 
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed
    

# Main

In [66]:
if __name__=='__main__':
    # Load input data
    data = load_data('text.txt')
    tokens = [process(x) for x in data]

    # A dictionary based on the sentence tokens 
    dict_tokens = corpora.Dictionary(tokens)
        
    # document-term matrix
    doc_term_mat = [dict_tokens.doc2bow(token) for token in tokens]

    # Define the number of topics for the LDA model
    num_topics = 4

    # Generate the LDA model 
    ldamodel = models.ldamodel.LdaModel(doc_term_mat, 
            num_topics=num_topics, id2word=dict_tokens, passes=25)

    num_words = 5
    print('\nTop ' + str(num_words) + ' contributing words to each topic:')
    for item in ldamodel.print_topics(num_topics=num_topics, num_words=num_words):
        print('\nTopic', item[0])

        # Print the contributing words along with their relative contributions 
        list_of_strings = item[1].split(' + ')
        for text in list_of_strings:
            weight = text.split('*')[0]
            word = text.split('*')[1]
            print(word, '==>', str(round(float(weight) * 100, 2)) + '%')




Top 5 contributing words to each topic:

Topic 0
"â" ==> 2.7%
"nlp" ==> 2.3%
"text" ==> 1.4%
"word" ==> 1.4%
"use" ==> 1.4%

Topic 1
"generat" ==> 4.2%
"parser" ==> 3.2%
"lexer" ==> 3.2%
"token" ==> 2.2%
"lex" ==> 2.2%

Topic 2
"languag" ==> 2.5%
"lalr" ==> 2.5%
"right" ==> 1.8%
"bottom" ==> 1.8%
"process" ==> 1.8%

Topic 3
"parser" ==> 3.4%
"implement" ==> 2.4%
"top" ==> 2.4%
"origin" ==> 1.3%
"grammar" ==> 1.3%
