In [1]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
import os
from nltk.corpus import stopwords

# lda
import lda
import lda.datasets

# Gsdmm
from gsdmm import MovieGroupProcess


# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint



import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocessing():
    folder_path = ['comments1k/0_9.txt', 'comments1k/1_7.txt']

    # create an empty list to store the data
    data = []

    # loop through all files in the folder
    for filename in folder_path:
            # read the file
            with open(filename, "r") as f:
                content = f.read()
            
            # append the data to the list
            data.append({"data": content})

    # create a DataFrame from the data
    df = pd.DataFrame(data)

    # display the DataFrame
    df.head()
    df['data'] = df['data'].str.replace('&\w+;','')
    df['data'] = df['data'].apply(lambda x: re.sub('<.*?>', '', x))

    df['data'] = df['data'].str.lower()

    df['data'] = df['data'].str.replace('[^\w\s]',' ')
    # Load the stop words from NLTK
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    df['data'] = df['data'].apply(lambda x:' '.join([w for w in x.split() if w not in stop_words]))

    return df



In [3]:
X

NameError: name 'X' is not defined

In [None]:
df = preprocessing()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saima_x4lzx52\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def lda(df):
    import lda.datasets
    X = lda.datasets.load_reuters()
    vocab = lda.datasets.load_reuters_vocab()
    titles = df['data'].tolist()
    model = lda.LDA(n_topics=10, n_iter=1500, random_state=1)
    model.fit(X) # model.fit_transform(X) is also available
    topic_word = model.components_ # model.components_ also works
    n_top_words = 8
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    doc_topic = model.doc_topic_
    for i in range(2):
        print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))

Question 1.1 - Use  Latent  Dirichlet  Allocation  (LDA)  method  to  discover  latent  topics  in  the  dataset  with  the number  of  topics  as  10.  Output  the  top  8  words  for  each  topic.  For  the  document  “0_9.txt”  and “1_7.txt”, what topics are assigned to them? Do they make sense?

In [None]:
df = preprocessing()
X = lda(df)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saima_x4lzx52\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO:lda:n_documents: 395
INFO:lda:vocab_size: 4258
INFO:lda:n_words: 84010
INFO:lda:n_topics: 10
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -958627
INFO:lda:<10> log likelihood: -718617
INFO:lda:<20> log likelihood: -699619
INFO:lda:<30> log likelihood: -691378
INFO:lda:<40> log likelihood: -685783
INFO:lda:<50> log likelihood: -682819
INFO:lda:<60> log likelihood: -680314
INFO:lda:<70> log likelihood: -678535
INFO:lda:<80> log likelihood: -676979
INFO:lda:<90> log likelihood: -676090
INFO:lda:<100> log likelihood: -675970
INFO:lda:<110> log likelihood: -674583
INFO:lda:<120> log likelihood: -674216
INFO:lda:<130> log likelihood: -673307
INFO:lda:<140> log likelihood: -672442
INFO:lda:<150> log likelihood: -671950
INFO:lda:<160> log likelihood: -670999
INFO:lda:<170> log likelihood: -670513
INFO:lda:<180> log li

1.1 Answer:

- Topic 0: Crime (police, Miami, Versace, Cunanan)
- Topic 1: Music (Elvis, film, music, fans, festival, concert)
- Topic 2: Politics (Yeltsin, president, Russian, political, Russia, minister, Kremlin)
- Topic 3: Art and Culture (city, million, century, art, exhibition, museum, cultural, sale)
- Topic 4: Royalty (Charles, prince, king, Diana, royal, queen, family, Parker)
- Topic 5: War and Politics (against, Germany, Catholic, French, government, war, German, rights)
- Topic 6: Education (school, teachers, profession, students, student, whole situation, schools, episode)
- Topic 7: Religion (Pope, Mother Teresa, Vatican, order, hospital, doctors, Catholic)
- Topic 8: Politics and Diplomacy (Harriman, US, Clinton, Churchill, ambassador, president, east, Paris)
- Topic 9: Death and Religion (died, church, former, Bernardin, death, funeral, life, Simpson)

**Using the top words "high," "comedy," "cartoon," "teachers," and "students," the model allocated it to topic 4. The material appears to be a review or summary of the animated comedy television series "Bromwell High," which centers on a high school and its faculty.**


Question 1.2: Because  of  the  data  sparsity,  short  text  may  not  provide  enough  context  to  adequately  inform  topic modeling.  Try  Biterm,  GSDMM  or  other  short  text  topic  model  for  our  dataset.  Compare  the  topic 
modelling results with LDA, any improvement?

In [None]:
docs = df.data.to_numpy()

# create dictionary of all words in all documents
dictionary = gensim.corpora.Dictionary(docs)

# filter extreme cases out of dictionary
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# create variable containing length of dictionary/vocab
vocab_length = len(dictionary)

# create BOW dictionary
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

# initialize GSDMM
gsdmm = MovieGroupProcess(K=15, alpha=0.1, beta=0.3, n_iters=15)

# fit GSDMM model
y = gsdmm.fit(docs, vocab_length)

doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster, sort_dicts))

# get top words in topics
top_words(gsdmm.cluster_word_distribution, top_index, 20)


INFO:gensim.corpora.dictionary:adding document #0 to Dictionary<0 unique tokens: []>


TypeError: doc2bow expects an array of unicode tokens on input, not a single string