In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import feature_extraction

import re

import nltk
from nltk.tokenize import word_tokenize

# for wordnet (synonyms, antonyms, related words, etc)
nltk.download('wordnet')
from nltk.corpus import wordnet

# for finding root words
from nltk.stem.wordnet import WordNetLemmatizer

# for filtering out stop words
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

from gensim import corpora
import gensim
import pickle

from sqlalchemy import create_engine
engine = create_engine('sqlite:///db/%s' % 'data.db')

[nltk_data] Downloading package wordnet to /home/michael/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/michael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
notes = pd.read_sql_table(table_name = "notebooks", con=engine, index_col="index")

In [3]:
def lemmatize(word):
    lemma = wordnet.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def word_net_lemmatize(word):
    return WordNetLemmatizer().lemmatize(word)

def generate_lda_tokens(text):
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [lemmatize(token) for token in tokens]
    tokens = [word_net_lemmatize(token) for token in tokens]

    return tokens

In [4]:
training_content_data = []

for index, row in notes.iterrows():
    training_content_data.append(generate_lda_tokens(row.Content))

dictionary = corpora.Dictionary(training_content_data)
corpus = [dictionary.doc2bow(text) for text in training_content_data]

pickle.dump(corpus, open('lda/corpus.pkl', 'wb'))
dictionary.save('lda/dictionary.gensim')

In [None]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, 
                                           num_topics = 150, 
                                           id2word=dictionary, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=30,
                                           passes=100,
                                           alpha='auto',
                                           per_word_topics=False)
ldamodel.save('lda/model.gensim')
topics = ldamodel.print_topics(num_words=4)

In [None]:
# Compute Perplexity
print('\nPerplexity: ', ldamodel.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(model=ldamodel, texts=training_content_data, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nLDA Coherence Score: ', coherence_lda)

In [10]:
for index, row in notes.iterrows():
    notebooks = []
    note_tokens = generate_lda_tokens(row.Content)
    bow = dictionary.doc2bow(note_tokens)
    topics = []
    for topic_id, score in ldamodel.get_document_topics(bow):
        topics.append((topic_id, score))
        notes.at[index, str(topic_id)] = score
    print(row["Class Name"], topics)

Information Visualization [(10, 0.96560895)]
Information Visualization [(17, 0.9936912)]
Information Visualization [(5, 0.9821746)]
Information Visualization [(6, 0.98840064)]
Information Visualization [(31, 0.9560243)]
Information Visualization [(3, 0.96258944)]
Information Visualization [(10, 0.98504335)]
Information Visualization [(2, 0.9709649)]
Information Visualization [(32, 0.95539325)]
Information Visualization [(20, 0.97921455)]
Information Visualization [(19, 0.9718613)]
Information Visualization [(3, 0.9735606)]
Information Visualization [(10, 0.96560746)]
Information Visualization [(1, 0.9860904)]
Information Visualization [(27, 0.9052676)]
Information Visualization [(21, 0.9621717)]
Information Visualization [(15, 0.3726336), (34, 0.603449)]
Information Visualization [(11, 0.9689286)]
Information Visualization [(25, 0.9666982)]
Information Visualization [(14, 0.89415413)]
Information Visualization [(25, 0.58169997), (32, 0.391523)]
Information Law & Policy [(18, 0.98307484

Information Visualization [(5, 0.9821746)]
Information Visualization [(6, 0.98840064)]
Information Visualization [(31, 0.9560242)]
Information Visualization [(3, 0.96258944)]
Information Visualization [(10, 0.98504335)]
Information Visualization [(2, 0.9709649)]
Information Visualization [(32, 0.9553933)]
Information Visualization [(20, 0.97921455)]
Information Visualization [(19, 0.9718613)]
Information Visualization [(3, 0.9735605)]
Information Visualization [(10, 0.96560746)]
Information Visualization [(1, 0.98609054)]
Information Visualization [(27, 0.9052676)]
Information Visualization [(21, 0.9621717)]
Information Visualization [(15, 0.37263337), (34, 0.6034492)]
Information Visualization [(11, 0.9689286)]
Information Visualization [(25, 0.9666982)]
Information Visualization [(14, 0.89414513)]
Information Visualization [(25, 0.5817005), (32, 0.3915224)]
Information Law & Policy [(18, 0.9830749)]
Information Law & Policy [(16, 0.75491923), (22, 0.22788894)]
Information Law & Polic

Applied Behavioral Economics [(2, 0.033911917), (3, 0.038288552), (7, 0.014955227), (15, 0.055362172), (17, 0.04011464), (18, 0.06259949), (22, 0.038912665), (26, 0.4235589), (29, 0.18792872), (30, 0.024069024), (32, 0.034636516), (33, 0.023259373)]
Information Visualization [(10, 0.96560895)]
Information Visualization [(17, 0.9936912)]
Information Visualization [(5, 0.98217463)]
Information Visualization [(6, 0.98840064)]
Information Visualization [(31, 0.9560243)]
Information Visualization [(3, 0.96258944)]
Information Visualization [(10, 0.9850432)]
Information Visualization [(2, 0.9709649)]
Information Visualization [(32, 0.95539325)]
Information Visualization [(20, 0.97921455)]
Information Visualization [(19, 0.9718613)]
Information Visualization [(3, 0.9735606)]
Information Visualization [(10, 0.96560746)]
Information Visualization [(1, 0.98609054)]
Information Visualization [(27, 0.9052676)]
Information Visualization [(21, 0.9621717)]
Information Visualization [(15, 0.37263346),

Applied Machine Learning [(3, 0.017622186), (6, 0.022669213), (7, 0.040741548), (9, 0.03543915), (10, 0.013493843), (11, 0.01811921), (15, 0.039960667), (17, 0.08585796), (22, 0.052373774), (23, 0.14376242), (26, 0.44203404), (28, 0.033166416), (29, 0.017776929)]
Applied Behavioral Economics [(2, 0.033910997), (3, 0.038288526), (7, 0.014954994), (15, 0.055362348), (17, 0.040114444), (18, 0.06259944), (22, 0.0389129), (26, 0.42355886), (29, 0.18792875), (30, 0.024069034), (32, 0.034636505), (33, 0.0232594)]
Information Visualization [(10, 0.96560895)]
Information Visualization [(17, 0.9936912)]
Information Visualization [(5, 0.9821746)]
Information Visualization [(6, 0.98840064)]
Information Visualization [(31, 0.9560242)]
Information Visualization [(3, 0.96258944)]
Information Visualization [(10, 0.98504305)]
Information Visualization [(2, 0.9709649)]
Information Visualization [(32, 0.9553933)]
Information Visualization [(20, 0.97921455)]
Information Visualization [(19, 0.9718613)]
Inf

Applied Machine Learning [(1, 0.0140422005), (3, 0.022319201), (6, 0.013731085), (7, 0.03452245), (8, 0.47004375), (15, 0.016397515), (17, 0.06974192), (22, 0.11972669), (23, 0.11737186), (29, 0.079309896), (30, 0.012636358)]
Applied Machine Learning [(3, 0.017621716), (6, 0.022669166), (7, 0.040739216), (9, 0.035439044), (10, 0.01349379), (11, 0.01811884), (15, 0.03996639), (17, 0.08585884), (22, 0.052370906), (23, 0.14376244), (26, 0.44203356), (28, 0.033165768), (29, 0.017776927)]
Applied Behavioral Economics [(2, 0.033917204), (3, 0.03828889), (7, 0.014956768), (15, 0.055361688), (17, 0.040115844), (18, 0.06259956), (22, 0.03891288), (26, 0.4235589), (29, 0.18792872), (30, 0.024068985), (32, 0.034636546), (33, 0.023259634)]
Information Visualization [(10, 0.96560895)]
Information Visualization [(17, 0.9936912)]
Information Visualization [(5, 0.9821746)]
Information Visualization [(6, 0.98840064)]
Information Visualization [(31, 0.9560243)]
Information Visualization [(3, 0.96258944)

Information Visualization [(17, 0.9936912)]
Information Visualization [(5, 0.98217463)]
Information Visualization [(6, 0.98840064)]
Information Visualization [(31, 0.9560242)]
Information Visualization [(3, 0.96258944)]
Information Visualization [(10, 0.98504317)]
Information Visualization [(2, 0.9709649)]
Information Visualization [(32, 0.9553933)]
Information Visualization [(20, 0.97921455)]
Information Visualization [(19, 0.9718613)]
Information Visualization [(3, 0.9735606)]
Information Visualization [(10, 0.96560746)]
Information Visualization [(1, 0.98609054)]
Information Visualization [(27, 0.90526766)]
Information Visualization [(21, 0.9621717)]
Information Visualization [(15, 0.37263313), (34, 0.60344946)]
Information Visualization [(11, 0.9689286)]
Information Visualization [(25, 0.9666982)]
Information Visualization [(14, 0.89415455)]
Information Visualization [(25, 0.58170044), (32, 0.39152253)]
Information Law & Policy [(18, 0.9830747)]
Information Law & Policy [(16, 0.754

Applied Machine Learning [(3, 0.017621402), (6, 0.022669123), (7, 0.04073825), (9, 0.03543932), (10, 0.013493815), (11, 0.018118968), (15, 0.039967064), (17, 0.08585939), (22, 0.052370552), (23, 0.14376242), (26, 0.44203362), (28, 0.033165157), (29, 0.017776916)]
Applied Behavioral Economics [(2, 0.03391547), (3, 0.038288847), (7, 0.014956496), (15, 0.05536207), (17, 0.040115416), (18, 0.06259938), (22, 0.038912762), (26, 0.4235589), (29, 0.18792872), (30, 0.02406899), (32, 0.034636527), (33, 0.023259707)]
Information Visualization [(10, 0.96560895)]
Information Visualization [(17, 0.9936912)]
Information Visualization [(5, 0.9821746)]
Information Visualization [(6, 0.98840064)]
Information Visualization [(31, 0.9560242)]
Information Visualization [(3, 0.96258944)]
Information Visualization [(10, 0.98504317)]
Information Visualization [(2, 0.9709649)]
Information Visualization [(32, 0.9553933)]
Information Visualization [(20, 0.97921455)]
Information Visualization [(19, 0.9718613)]
Inf

Applied Machine Learning [(1, 0.014040973), (3, 0.022319108), (6, 0.013730682), (7, 0.03452249), (8, 0.47004357), (15, 0.016396696), (17, 0.06974215), (22, 0.11972684), (23, 0.11737185), (29, 0.07930988), (30, 0.012636321)]
Applied Machine Learning [(3, 0.017621592), (6, 0.022669235), (7, 0.040739086), (9, 0.035439327), (10, 0.01349381), (11, 0.018119002), (15, 0.03996569), (17, 0.08585898), (22, 0.05237124), (23, 0.14376242), (26, 0.44203365), (28, 0.033165473), (29, 0.017776916)]
Applied Behavioral Economics [(2, 0.03391127), (3, 0.038288545), (7, 0.014954835), (15, 0.055362407), (17, 0.040114563), (18, 0.0625993), (22, 0.038912784), (26, 0.4235589), (29, 0.18792872), (30, 0.024069024), (32, 0.034636494), (33, 0.023259526)]
Information Visualization [(10, 0.96560895)]
Information Visualization [(17, 0.9936912)]
Information Visualization [(5, 0.98217463)]
Information Visualization [(6, 0.98840064)]
Information Visualization [(31, 0.9560242)]
Information Visualization [(3, 0.96258944)]

Needs and Usability [(3, 0.051139954), (7, 0.031604193), (22, 0.14706035), (26, 0.40253466), (27, 0.030082965), (29, 0.24182923), (31, 0.015562719), (33, 0.06287204)]
Needs and Usability [(0, 0.113440596), (2, 0.13371468), (17, 0.08389365), (23, 0.13086043), (26, 0.44659767)]
Needs and Usability [(3, 0.078050025), (29, 0.8087202)]
Applied Machine Learning [(2, 0.017811526), (7, 0.057921633), (8, 0.48938376), (15, 0.042543672), (17, 0.05721952), (22, 0.091285504), (23, 0.16849798), (26, 0.036379132)]
Applied Machine Learning [(2, 0.016694497), (3, 0.012914748), (7, 0.160506), (15, 0.013409477), (17, 0.1069394), (19, 0.017279949), (23, 0.124355115), (26, 0.52260727)]
Applied Machine Learning [(1, 0.0140384305), (3, 0.022319064), (6, 0.01373028), (7, 0.03452238), (8, 0.47004327), (15, 0.016395656), (17, 0.0697428), (22, 0.11972741), (23, 0.11737181), (29, 0.07930992), (30, 0.012636306)]
Applied Machine Learning [(3, 0.017620979), (6, 0.022669181), (7, 0.040736947), (9, 0.035439696), (10, 

In [11]:
notes.head()

Unnamed: 0_level_0,Date of Creation,Author,Class Name,Title,Content,0,1,2,3,4,...,25,26,27,28,29,30,31,32,33,34
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,01/17/2018,Michelle Chen,Information Visualization,Introduction,What is visualization?\t\r\n•\tThe visual repr...,0.0,0.0,0.0,0.998267,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,01/20/2018,Michelle Chen,Information Visualization,Form and Function,Readings\n \n1. Cairo Ch 2 and 4.\n \n•...,0.486963,0.0,0.250513,0.26237,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,01/23/2018,Michelle Chen,Information Visualization,"Clarify, Don't Simplify",\r\nReadings\r\n1.\tCairo Ch 8.\r\n•\tChapter ...,0.0,0.999014,0.0,0.190599,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.031244,0.0,0.0,0.0
3,01/27/2018,Michelle Chen,Information Visualization,Cognitive Principles,Class Notes\r\n•\tVisual Encoding\r\n○\t3 main...,0.0,0.160584,0.83899,0.038027,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,01/31/2018,Michelle Chen,Information Visualization,Visual Vocabulary,Readings Few Ch. 3\r\n\r\nClass Notes\r\nData ...,0.0,0.0,0.997853,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.956024,0.0,0.0,0.0


In [17]:
notes.to_sql('notebooks', con=engine, if_exists="append")