In [0]:
# Install gspread, authenticate and load data from a Google Sheet
!pip install --upgrade -q gspread

from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials

# Default data from http://web.eecs.utk.edu/~berry/order/node4.html#SECTION00022000000000000000

googlesheet_filename = '14_topic_modelling_data_en'
data_rows_to_preview = 10


In [6]:
#Load and preview data from a Google Sheet

gc = gspread.authorize(GoogleCredentials.get_application_default())

worksheet = gc.open(googlesheet_filename).sheet1

# get_all_values gives a list of rows.
rows = worksheet.get_all_values()

# convert the 2nd column values to a list
documents = []
for row in rows[1:]:
  documents.append(row[1])
  
#print(documents)

# Convert to a DataFrame and render.
import pandas as pd
dataset_df = pd.DataFrame.from_records(rows)
dataset_df.head(n=data_rows_to_preview)


Unnamed: 0,0,1
0,id,text
1,1,Linguistics is the scientific study of languag...
2,2,The earliest activities in the documentation a...
3,3,Linguists traditionally analyse human language...
4,4,Phonetics is the study of speech and non-speec...
5,5,"The study of language meaning, on the other ha..."
6,6,While the study of semantics typically concern...
7,7,Grammar is a system of rules which governs the...
8,8,"These rules apply to sound as well as meaning,..."
9,9,Modern theories that deal with the principles ...


In [0]:
#Set topic modeling algorithm arguments

no_topics = 3

no_top_words = 4

no_top_documents = 3

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np

In [8]:
# Run NMF

def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
  for topic_idx, topic in enumerate(H):
    print("Topic %d:" % (topic_idx))
    print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
    top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
    for doc_index in top_doc_indices:
      print(documents[doc_index])

# from sklearn.feature_extraction import text
# stop_words = text.ENGLISH_STOP_WORDS.union(stopwords)
  
# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words="english")
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Run NMF
nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
nmf_W = nmf_model.transform(tfidf)
nmf_H = nmf_model.components_

print("NMF Topics")
display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)
print("--------------")



NMF Topics
Topic 0:
language linguistics meaning grammar
The earliest activities in the documentation and description of language have been attributed to the 6th century BC Indian grammarian Pāṇini, who wrote a formal description of the Sanskrit language in his Aṣṭādhyāyī.
Linguistics is the scientific study of language, and involves an analysis of language form, language meaning, and language in context.
Linguists traditionally analyse human language by observing an interplay between sound and meaning. 
Topic 1:
study properties meaning deals
The study of language meaning, on the other hand, deals with how languages encode relations between entities, properties, and other aspects of the world to convey, process, and assign meaning, as well as manage and resolve ambiguity.
Phonetics is the study of speech and non-speech sounds, and delves into their acoustic and articulatory properties.
While the study of semantics typically concerns itself with truth conditions, pragmatics deals with 

In [14]:
# Run LDA

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

# Run LDA
lda_model = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
lda_W = lda_model.transform(tf)
lda_H = lda_model.components_

print("LDA Topics")
display_topics(lda_H, lda_W, tf_feature_names, documents, no_top_words, no_top_documents)

LDA Topics
Topic 0:
study grammar context production
While the study of semantics typically concerns itself with truth conditions, pragmatics deals with how situational context influences the production of meaning.
Modern theories that deal with the principles of grammar are largely based within Noam Chomsky's framework of generative linguistics.
Phonetics is the study of speech and non-speech sounds, and delves into their acoustic and articulatory properties.
Topic 1:
language meaning study production
Linguistics is the scientific study of language, and involves an analysis of language form, language meaning, and language in context.
The study of language meaning, on the other hand, deals with how languages encode relations between entities, properties, and other aspects of the world to convey, process, and assign meaning, as well as manage and resolve ambiguity.
Grammar is a system of rules which governs the production and use of utterances in a given language.
Topic 2:
sound rules m