In [2]:
import nltk
import gensim



In [5]:
# Read the data
data = [line.strip() for line in open('reddit_ques.txt', 'r')]

In [8]:
data[:10]

['What are life’s toughest mini games?',
 'What are some slang terms a 50 year old dad can say to his daughter to embarrass her?',
 'Redditors who were in attendance at a wedding that was called off mid-ceremony, what was the story?',
 'What are your best “first date tips” for somebody starting the dating game late in life (late 20’s +)?',
 'Chefs of Reddit, what are the biggest ripoffs that your restaurants sell?',
 'The year is 2050. How do you think you would complete the sentence: "Back in my day, we didn\'t have ..."?',
 'What screams "I\'m emotionally unstable"?',
 'What is an imminent danger that nobody seems to be talking about?',
 'What is the worst purchase you ever made?',
 "What's the WORST name for a strip club you can imagine?"]

In [9]:
# Cleaning and Preprocessiong

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

In [13]:
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    pun_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join(lemma.lemmatize(word) for word in pun_free.split())
    
    return normalized

In [14]:
data_clean = [clean(instance).split() for instance in data]

In [16]:
data_clean[:10]

[['life’s', 'toughest', 'mini', 'game'],
 ['slang',
  'term',
  '50',
  'year',
  'old',
  'dad',
  'say',
  'daughter',
  'embarrass',
  'her'],
 ['redditors', 'attendance', 'wedding', 'called', 'midceremony', 'story'],
 ['best',
  '“first',
  'date',
  'tips”',
  'somebody',
  'starting',
  'dating',
  'game',
  'late',
  'life',
  'late',
  '20’s'],
 ['chef', 'reddit', 'biggest', 'ripoffs', 'restaurant', 'sell'],
 ['year',
  '2050',
  'think',
  'would',
  'complete',
  'sentence',
  'back',
  'day',
  'didnt'],
 ['scream', 'im', 'emotionally', 'unstable'],
 ['imminent', 'danger', 'nobody', 'seems', 'talking', 'about'],
 ['worst', 'purchase', 'ever', 'made'],
 ['whats', 'worst', 'name', 'strip', 'club', 'imagine']]

In [17]:
# Document term matrix
from gensim import corpora

In [18]:
# Create the term dictionary of corpus
# Each unique term is assigned an index
dictionary = corpora.Dictionary(data_clean)

In [20]:
# Converting corpora into Document matrix using the dictionary prepared

doc_term_matrix = [dictionary.doc2bow(instance) for instance in data_clean]

In [21]:
doc_term_matrix

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1)],
 [(14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)],
 [(0, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 2),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1)],
 [(13, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1)],
 [(44, 1), (45, 1), (46, 1), (47, 1)],
 [(48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1)],
 [(54, 1), (55, 1), (56, 1), (57, 1)],
 [(57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1)],
 [(25, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1)],
 [(72, 1), (73, 1), (74, 1), (75, 1), (76, 1)],
 [(25, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1)],
 [(89, 1), (90, 1), (91, 1), (

In [22]:
# Running LDA model

# Creating the object for LDA model

Lda = gensim.models.ldamodel.LdaModel

In [23]:
# Training the lda model on document term matrix
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50)

In [25]:
print(ldamodel.print_topics(num_topics=3))

[(0, '0.019*"whats" + 0.014*"reddit" + 0.011*"would" + 0.009*"like" + 0.008*"why" + 0.008*"movie" + 0.007*"one" + 0.007*"good" + 0.007*"best" + 0.007*"way"'), (1, '0.022*"ever" + 0.015*"reddit" + 0.014*"whats" + 0.013*"thing" + 0.013*"youve" + 0.012*"life" + 0.010*"people" + 0.010*"would" + 0.008*"worst" + 0.008*"know"'), (2, '0.013*"people" + 0.012*"thing" + 0.011*"something" + 0.010*"whats" + 0.010*"best" + 0.009*"one" + 0.009*"movie" + 0.009*"would" + 0.008*"day" + 0.007*"life"')]
