-
Notifications
You must be signed in to change notification settings - Fork 6
/
NLP.py
105 lines (81 loc) · 3.83 KB
/
NLP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
import sys
import time
import re
import nltk
# Log when the script ran and which environment it ran in
print(time.strftime('%Y/%m/%d %H:%M'))
_environment_report = (
    ('OS: ', sys.platform),
    ('CPU Cores:', os.cpu_count()),
    ('Python: ', sys.version),
    ('NLTK: ', nltk.__version__),
)
# print() joins the label and the value with a single space
for _label, _value in _environment_report:
    print(_label, _value)
### Tokenizing
# Every maximal run of word characters (r'\w+') becomes one token, so both
# whitespace and punctuation are discarded
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
# If the corpus is a column in a dataframe: tokenize each document, then attach
# the resulting token lists as a new 'tokenized_corpus' column
tokenized_documents = df['corpus'].map(tokenizer.tokenize)
df = df.assign(tokenized_corpus=tokenized_documents)
### Removing stop words
# Load the English stop words a single time instead of once per word, and keep
# them in a set so each membership test is a constant-time lookup
english_stop_words = set(nltk.corpus.stopwords.words('english'))
# NOTE(review): assumes `corpus` is an iterable of already-tokenized documents
# (each one a sequence of word strings) - confirm against the caller
filtered_words = []
for document in corpus:
    # Lowercase each word once; the lowercased form is both tested and kept
    lowered_document = (word.lower() for word in document)
    filtered_words.append([
        word for word in lowered_document
        if word not in english_stop_words
    ])
### Lemmatizing
# Setting the Lemmatization object
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
# Load the English stop words a single time instead of once per word, and keep
# them in a set so each membership test is a constant-time lookup
english_stop_words = set(nltk.corpus.stopwords.words('english'))
# Looping through the words and appending the lemmatized version to a list.
# The list keeps its historical name `stemmed_words` even though it holds lemmas.
stemmed_words = []
for row in df['tokens']:
    stemmed_words.append([
        # Verbs (outermost pass)
        lemmatizer.lemmatize(
            # Adjectives (middle pass)
            lemmatizer.lemmatize(
                # Nouns (innermost pass, no part of speech given)
                lemmatizer.lemmatize(word), 'a'), 'v')
        # Lowercase each word once; stop words are dropped before lemmatizing
        for word in (token.lower() for token in row)
        if word not in english_stop_words])
### TF-IDF
# pandas is needed for the data frame below and is not imported at the top of the file
import pandas as pd
from sklearn.feature_extraction import text as sktext
# Creating the sklearn object
tfidf = sktext.TfidfVectorizer(smooth_idf=False)
# Fitting the vectorizer and transforming the corpus into a sparse TF-IDF matrix
# NOTE(review): with default settings TfidfVectorizer expects raw text documents,
# so `corpus` is presumably an iterable of strings here - confirm
tfidf_matrix = tfidf.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new name and fall back for old releases
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
# Densifying the matrix into a data frame with one column per vocabulary term
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
# Removing sparse columns: keep only terms whose summed TF-IDF weight exceeds 2.5
tfidf_df = tfidf_df[tfidf_df.columns[tfidf_df.sum() > 2.5]]
# Removing digits: keep only columns whose name contains no digit character
tfidf_df = tfidf_df.filter(regex=r'^((?!\d).)*$')
### Topic modeling
def topic_model_lda(processed_corpus, num_topics=5, num_words=4, passes=15, random_state=None):
    '''
    Uses Latent Dirichlet Allocation for topic modeling

    Borrowed from https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

    Inputs:
        - processed_corpus: A pandas Series in which every element is one document's list of
                            tokens (text that has already been tokenized/stemmed/etc.)
        - num_topics: The number of topics to discover in the text
        - num_words: The number of words per topic to print in the output
        - passes: The number of training passes over the corpus (forwarded to gensim)
        - random_state: Optional seed forwarded to gensim for reproducible topics;
                        None keeps gensim's default behavior

    Outputs:
        - A pretty printed list of words in each topic and the probability associated with them
        - A dataframe with the topics assigned to each sample (returned); the assigned
          topic id is stored in the 'Topic' column

    TODO: Print interpretation of the numbers
    '''
    # Imported locally so the function is self-contained; the module itself never imports pandas
    import gensim
    import pandas as pd

    # Additional processing before modeling: map tokens to ids and build bag-of-words vectors
    dictionary = gensim.corpora.Dictionary(processed_corpus)
    bags_of_words = [dictionary.doc2bow(text) for text in processed_corpus]

    # Performing LDA
    lda_model = gensim.models.ldamodel.LdaModel(
        bags_of_words, num_topics=num_topics, id2word=dictionary,
        passes=passes, random_state=random_state)

    def dominant_topic(bag_of_words):
        # Request every topic (no probability cut-off) and return the *topic id* of the most
        # probable one. Taking the arg-max position of the filtered list instead would give
        # the wrong id whenever low-probability topics are dropped from the result.
        topic_probabilities = lda_model.get_document_topics(bag_of_words, minimum_probability=0.0)
        return max(topic_probabilities, key=lambda pair: pair[1])[0]

    ## Assigning the topics to individual comments
    results = pd.DataFrame(processed_corpus)  # Putting the series into a dataframe
    # The bag-of-words list is in the same order as the series, so positional assignment lines up
    results['Topic'] = [dominant_topic(bag_of_words) for bag_of_words in bags_of_words]

    # Calculating the total number of phrases in each topic
    number_phrases_per_topic = results['Topic'].value_counts()

    # Reporting the topics. Asking for num_topics topics lists all of them, and
    # formatted=False yields (word, probability) pairs directly so no string parsing is needed.
    topics = lda_model.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)
    for topic_id, word_probabilities in topics:
        # A topic that no sample was assigned to is absent from value_counts(); report it as 0
        sample_count = number_phrases_per_topic.get(topic_id, 0)
        print('Topic {0} ({1} samples):'.format(topic_id, sample_count))  # Printing the topic number
        for word, probability in word_probabilities:
            # Printing the results as word: probability (three decimals, as gensim formats them)
            print(' {0}: {1:.3f}'.format(word, float(probability)))
        print()  # New line

    return results