Skip to content
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
106 lines (81 sloc) 3.83 KB
import os
import sys
import time
import re
import nltk
# Report when the script ran, followed by the platform and library versions
print(time.strftime('%Y/%m/%d %H:%M'))

# (label, value) pairs, printed in order as "label value"
environment_report = (
    ('OS: ', sys.platform),
    ('CPU Cores:', os.cpu_count()),
    ('Python: ', sys.version),
    ('NLTK: ', nltk.__version__),
)
for label, value in environment_report:
    print(label, value)
### Tokenizing
# The pattern \w+ describes the tokens themselves: each run of word characters
# (letters, digits, underscore) is one token, so whitespace and punctuation
# only act as separators and never appear in the output
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+') # Splits on anything that is not a word character
# Adds a 'tokenized_corpus' column holding tokenizer.tokenize(...) of each entry in the 'corpus' column
df = df.assign(tokenized_corpus=df['corpus'].map(tokenizer.tokenize)) # If the corpus is a column in a dataframe
### Removing stop words
# Fetch the English stop words once and keep them in a set, so each membership
# test is a constant-time lookup instead of re-reading and scanning the list
# for every single word
stop_words = set(nltk.corpus.stopwords.words('english'))

# One list per document, holding the lower-cased words that are not stop words
# NOTE(review): the statement that collected the words was missing from this
# snippet; one list per document is assumed here - confirm this is the
# intended shape (rather than a single flat list)
# assumes each document in corpus is an iterable of word strings - TODO confirm
filtered_words = [
    [word.lower() for word in document if word.lower() not in stop_words]
    for document in corpus
]
### Lemmatizing
# Setting the Lemmatization object
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# Fetch the English stop words once; a set makes each lookup constant-time
# instead of re-reading and scanning the list for every word
stop_words = set(nltk.corpus.stopwords.words('english'))

# Looping through the words and appending the lemmatized version to a list
# (one list of lemmas per row of df['tokens'])
stemmed_words = []
for row in df['tokens']:
    row_lemmas = []
    for word in row:
        token = word.lower()
        if token in stop_words:
            continue
        # The lemmatizer only reduces a word for the part of speech it is
        # given, so it is applied three times, feeding each result into the next
        token = lemmatizer.lemmatize(token)       # Nouns (the default part of speech)
        token = lemmatizer.lemmatize(token, 'a')  # Adjectives
        token = lemmatizer.lemmatize(token, 'v')  # Verbs
        row_lemmas.append(token)
    stemmed_words.append(row_lemmas)
### TF-IDF
# Creating the sklearn object
from sklearn.feature_extraction import text as sktext
tfidf = sktext.TfidfVectorizer(smooth_idf=False)

# Transforming the corpus into a TF-IDF matrix and then a data frame with one
# column per vocabulary term. The term names are needed as column labels
# because the digit filter below matches its regex against the column names.
# NOTE(review): get_feature_names_out() needs scikit-learn >= 1.0; older
# releases only provide get_feature_names() - confirm the installed version
# presumably corpus is an iterable of raw text strings (the vectorizer's
# default input) - verify against the caller
tfidf_df = pd.DataFrame(tfidf.fit_transform(corpus).toarray(),
                        columns=tfidf.get_feature_names_out())

# Removing sparse columns: keep only terms whose TF-IDF scores sum to more
# than 2.5 across all documents
tfidf_df = tfidf_df[tfidf_df.columns[tfidf_df.sum() > 2.5]]

# Removing digits: the negative lookahead rejects any column name that
# contains a digit anywhere in it
tfidf_df = tfidf_df.filter(regex=r'^((?!\d).)*$')
### Topic modeling
def topic_model_lda(processed_corpus, num_topics=5, num_words=4):
    """
    Uses Latent Dirichlet Allocation for topic modeling

    Borrowed from an external example (the attribution link is missing from
    this file).

    Inputs:
        - processed_corpus: The corpus of text that has already been
          tokenized/stemmed/etc. It must be a pandas Series (``.apply`` is
          called on it) whose elements are lists of tokens.
        - num_topics: The number of topics to discover in the text
        - num_words: The number of words per topic to print in the output

    Outputs:
        - A pretty printed list of words in each topic and the probability
          associated with them
        - A dataframe with the topics assigned to each sample (returned)

    TODO: Print interpretation of the numbers
    """
    # Imported locally so the function can be used on its own; the top of
    # this file does not import pandas
    from operator import itemgetter

    import gensim
    import pandas as pd

    # Additional processing before modeling
    dictionary = gensim.corpora.Dictionary(processed_corpus)
    corpus = [dictionary.doc2bow(text) for text in processed_corpus]

    # Performing LDA
    lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

    def _dominant_topic(tokens):
        """Return the id of the most probable topic for one tokenized document."""
        # minimum_probability=0.0 asks for every topic; by default gensim
        # drops low-probability topics from the result
        topic_probabilities = lda_model.get_document_topics(
            dictionary.doc2bow(tokens), minimum_probability=0.0)
        # Read the topic id out of the winning (topic_id, probability) pair.
        # Taking the argmax *position* instead would be wrong whenever the
        # returned list does not contain every topic.
        return max(topic_probabilities, key=itemgetter(1))[0]

    ## Assigning the topics to individual comments
    results = pd.DataFrame(processed_corpus)  # Putting the series into a dataframe
    results['Topic'] = processed_corpus.apply(_dominant_topic)

    # Calculating the total number of phrases in each topic
    number_phrases_per_topic = results['Topic'].value_counts()

    # Reporting the topics
    # formatted=False yields (topic_id, [(word, probability), ...]) directly,
    # so no string parsing is needed; num_topics is passed explicitly because
    # the reporting methods otherwise show a fixed default number of topics
    topics = lda_model.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)
    for topic_id, topic_terms in topics:
        # .get(..., 0) covers topics that no sample was assigned to
        print('Topic {0} ({1} samples):'.format(topic_id, number_phrases_per_topic.get(topic_id, 0)))  # Printing the topic number
        for word, probability in topic_terms:
            print(' {0}: {1:.3f}'.format(word, probability))  # Printing the results as word: probability
        print()  # New line

    return results
You can’t perform that action at this time.