In [1]:
from datascience import *
import numpy as np
import re
import gensim

from collections import Counter

# Quiet down library logging for the rest of the notebook.
# basicConfig requests a timestamped message format and an ERROR threshold.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
# This assignment sets the root logger's level attribute to CRITICAL, which
# supersedes the ERROR level requested on the line above.
logging.root.level = logging.CRITICAL 

# Suppress DeprecationWarning messages so they do not clutter cell output.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# direct plots to appear within the cell, and set their style
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [2]:
# Read the TED talks CSV from its hosted URL into a Table.
filename = "https://s3.amazonaws.com/sds171/labs/lab07/ted_talks.csv"
data = Table.read_table(filename)

# Pull out the 'transcript' column; the cells below clean and tokenize these values.
transcripts = data.column('transcript')

In [3]:
# Clean the transcripts with regular expressions, applied to each text in this order:
#   1. hyphens become spaces, so hyphenated compounds split into separate words;
#   2. every character that is neither a word character nor whitespace is removed
#      (this includes apostrophes, so "isn't" becomes "isnt");
#   3. every uppercase ASCII letter together with the word characters that follow it is removed;
#   4. runs of spaces are collapsed into a single space.
# The patterns are raw strings: '\w' and '\s' inside a plain string literal are
# invalid escape sequences, which recent Python versions warn about.
cleaning_steps = [
    (re.compile(r'-'), ' '),
    (re.compile(r'[^\w\s]'), ''),
    (re.compile(r'[A-Z]\w*'), ''),
    (re.compile(r'[ ]+'), ' '),
]

cleaned_transcripts = []
for text in transcripts:
    for pattern, replacement in cleaning_steps:
        text = pattern.sub(replacement, text)
    cleaned_transcripts.append(text)
transcripts = cleaned_transcripts

In [4]:
def is_numeric(string):
    """Return True if at least one character of `string` is a digit.

    Despite the name, the string does not have to be entirely numeric:
    a single digit anywhere is enough. An empty string yields False.
    """
    return any(symbol.isdigit() for symbol in string)

def has_poss_contr(string):
    """Return True if `string` contains an apostrophe immediately followed by 's'.

    This is the marker used here for possessives and "'s" contractions.
    Strings shorter than two characters always yield False.
    """
    # Walk over adjacent character pairs: (string[0], string[1]), (string[1], string[2]), ...
    adjacent_pairs = zip(string, string[1:])
    return any(first == '\'' and second == 's' for first, second in adjacent_pairs)

def empty_string(string):
    """Return True if `string` equals the empty string, otherwise False."""
    is_empty = (string == '')
    return is_empty

def remove_string(string):
    """Return True if the token `string` should be discarded.

    A token is discarded when any of the three checks holds: it contains a
    digit (is_numeric), it contains "'s" (has_poss_contr), or it is empty
    (empty_string).
    """
    return is_numeric(string) or has_poss_contr(string) or empty_string(string)

In [5]:
# Tokenize each cleaned transcript and drop unwanted tokens in one pass.
#
# split() with no argument splits on any run of whitespace (spaces, tabs,
# newlines) and never produces empty strings. split(' ') only splits on single
# spaces, and the cleaning step only collapses runs of spaces, so any tab or
# newline in a transcript would otherwise stay glued inside a token.
#
# remove_string rejects tokens that contain a digit, contain "'s", or are empty.
# NOTE(review): the cleaning cell strips every non-word, non-space character
# (apostrophes included) before this point, so the "'s" check probably never
# matches here — confirm whether that is intended.
plots_tok = [
    [token for token in transcript.lower().split() if not remove_string(token)]
    for transcript in transcripts
]

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# Lemmatize every token, treating each one as a verb (pos='v').
# The result replaces plots_tok: one list of lemmas per transcript,
# in the same order as the input tokens.
lemmatizer = WordNetLemmatizer()
plots_tok = [
    [lemmatizer.lemmatize(word, pos='v') for word in doc]
    for doc in plots_tok
]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/michaelchau/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Count how often each token occurs across all tokenized transcripts.
vocab = Counter(word for doc in plots_tok for word in doc)

print("Number of unique tokens: %d" % len(vocab))

Number of unique tokens: 33616


In [8]:
# Keep only tokens that occur more than MIN_TOKEN_COUNT times in the whole corpus.
# Filtering the (token, count) pairs directly keeps each surviving token's count
# and the Counter's insertion order, without expanding every single occurrence
# through vocab.elements() only to count them all over again.
MIN_TOKEN_COUNT = 20
vocab = Counter({
    token: count
    for token, count in vocab.items()
    if count > MIN_TOKEN_COUNT
})

print("Number of unique tokens: %d" % len(vocab))

Number of unique tokens: 7101


In [9]:
# Remove stop words: the NUM_STOP_WORDS most frequent tokens in the current
# vocabulary are treated as stop words and dropped. This cell removes only these
# high-frequency tokens; it applies no low-frequency cut-off of its own.
NUM_STOP_WORDS = 200
stop_words = [word for word, _ in vocab.most_common(NUM_STOP_WORDS)]

# Membership is tested against a set (constant-time lookup) once per distinct
# token; stop_words itself stays a list so its frequency order is preserved.
stop_word_set = set(stop_words)
vocab = Counter({
    token: count
    for token, count in vocab.items()
    if token not in stop_word_set
})

print("Number of unique tokens: %d" % len(vocab))

Number of unique tokens: 6901


In [10]:
# Build the two identifier mappings used by the model:
#   id2word: integer id -> token
#   word2id: token -> integer id
# Ids are assigned 0, 1, 2, ... in the vocabulary's iteration order, and the two
# dictionaries are exact inverses of each other.
id2word = dict(enumerate(vocab))
word2id = {word: idx for idx, word in id2word.items()}

print("Number of tokens mapped: %d" % len(id2word))
print("Identifier for 'photograph': %d" % word2id['photograph'])
print("Word for identifier %d: %s" % (word2id['photograph'], id2word[word2id['photograph']]))

Number of tokens mapped: 6901
Identifier for 'photograph': 2252
Word for identifier 2252: photograph


In [11]:
# Restrict every tokenized transcript to the tokens that are still in the
# vocabulary; token order within each transcript is unchanged.
plots_tok = [
    [word for word in doc if word in vocab]
    for doc in plots_tok
]

In [12]:
# Build the bag-of-words corpus: for each tokenized transcript, a list of
# (token id, number of occurrences in that transcript) pairs.
sample = 30
corpus = [
    [(word2id[word], freq) for word, freq in Counter(doc).items()]
    for doc in plots_tok
]

print("Plot, tokenized:\n", plots_tok[sample], "\n")
print("Plot, in corpus format:\n", corpus[sample])

Plot, tokenized:
 ['stuff', 'book', 'mine', 'hope', 'resonate', 'youve', 'already', 'connections', 'myself', 'case', 'miss', 'official', 'official', 'official', 'industrial', 'societies', 'official', 'run', 'maximize', 'welfare', 'citizens', 'maximize', 'individual', 'freedom', 'reason', 'both', 'freedom', 'itself', 'valuable', 'worthwhile', 'essential', 'freedom', 'act', 'maximize', 'welfare', 'decide', 'behalf', 'maximize', 'freedom', 'maximize', 'choice', 'choice', 'freedom', 'freedom', 'welfare', 'deeply', 'embed', 'water', 'supply', 'wouldnt', 'occur', 'anyone', 'deeply', 'embed', 'examples', 'modern', 'progress', 'possible', 'supermarket', 'such', 'word', 'salad', 'dress', 'salad', 'dress', 'supermarket', 'count', 'extra', 'virgin', 'olive', 'oil', 'buy', 'large', 'number', 'salad', 'dress', 'chance', 'none', 'store', 'offer', 'suit', 'supermarket', 'consumer', 'electronics', 'store', 'set', 'stereo', 'speakers', 'player', 'tape', 'player', 'single', 'consumer', 'electronics', 's

In [13]:
%%time
# Train the LDA topic model on the bag-of-words corpus.
# The settings are collected in one place so they are easy to read and tweak;
# random_state is fixed so that reruns start from the same seed.
lda_options = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_options)

CPU times: user 1min 5s, sys: 532 ms, total: 1min 6s
Wall time: 36 s


In [14]:
# Tabulate the highest-ranked words of every topic: one column per topic,
# one row per rank (1 = first term returned for the topic).
# The topic count is read from the trained model so it cannot drift from the
# value used in the training cell.
num_topics = lda_model.num_topics
num_words = 15
top_words = Table().with_column('word rank', np.arange(1, num_words + 1))
for k in range(num_topics):
    # get_topic_terms yields (word id, probability) pairs; only the ids are
    # needed here, translated back to tokens through id2word.
    topic_terms = lda_model.get_topic_terms(k, num_words)
    words = [id2word[word_id] for word_id, _ in topic_terms]
    top_words = top_words.with_column('topic %d' % k, words)

top_words.show()

word rank,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
1,company,planet,social,play,cancer,guy,water,data,space,women
2,money,species,political,sound,health,write,energy,technology,image,children
3,countries,animals,group,game,body,word,plant,machine,art,men
4,dollars,fish,power,music,cells,ever,food,computer,light,family
5,country,water,believe,body,disease,spend,climate,information,experience,young
6,market,tree,against,video,drug,run,carbon,science,paint,man
7,city,ocean,between,hand,patients,bite,grow,example,project,parent
8,global,star,reason,listen,blood,person,air,model,city,mother
9,cities,planets,government,arm,medical,too,solar,image,beautiful,woman
10,million,sea,may,head,doctor,read,gas,process,color,home


In [15]:
# Inspect the topic mixture the model assigns to one transcript.
sample = 13
# minimum_probability = 0 requests even the least likely topics.
topic_dist = lda_model.get_document_topics(corpus[sample], minimum_probability = 0)
topics = [topic_id for topic_id, _ in topic_dist]
probabilities = [probability for _, probability in topic_dist]
topic_dist_table = Table().with_columns('Topic', topics, 'Probabilities', probabilities)
topic_dist_table.show(20)
# np.argmax returns a position in `probabilities`, not a topic id. Look the id up
# in `topics` so the report stays correct even if the returned list ever skips a
# topic (gensim may omit topics with vanishingly small probability).
t = np.argmax(probabilities)
print("Topic with highest probability: %d (%f)" % (topics[t], probabilities[t]))

Topic,Probabilities
0,0.00198656
1,0.00127246
2,0.0105815
3,0.205505
4,0.000858629
5,0.0503681
6,0.0108479
7,0.646343
8,0.0696848
9,0.00255206


Topic with highest probability: 7 (0.646343)


In [16]:
# Show the first 2500 characters of the selected transcript. `transcripts` holds
# the regex-cleaned text, so punctuation and capitalized words are absent.
print(transcripts[sample][:2500])

 really excited to be here today show you some stuff thats just ready to come out of the lab literally and really glad that you guys are going to be among the first to see it in person because really think this is going to really change the way we interact with machines from this point on this is a rear projected drafting table about 36 inches wide and its equipped with a multi touch sensor touch sensors that you see like on a kiosk or interactive whiteboards can only register one point of contact at a time thing allows you to have multiple points at the same time can use both my hands can use chording actions can just go right up and use all 10 fingers if wanted to know like that multi touch sensing isnt completely new like have been playing around with it in the 80s the approach built here is actually high resolution low cost and probably most importantly very scalable the technology you know isnt the most exciting thing here right now other than probably its newfound accessibility r

In this example, topic 7, which represents technology, has the highest probability (0.646). The transcript of the talk supports this assignment: it contains terms like "screensaver", "touch sensor", and "interactive".

In [17]:
# Inspect the topic mixture the model assigns to one transcript.
sample = 7
# minimum_probability = 0 requests even the least likely topics.
topic_dist = lda_model.get_document_topics(corpus[sample], minimum_probability = 0)
probabilities = [probability for _, probability in topic_dist]
topics = [topic_id for topic_id, _ in topic_dist]
topic_dist_table = Table().with_columns('Topic', topics, 'Probabilities', probabilities)
topic_dist_table.show(20)
# np.argmax returns a position in `probabilities`, not a topic id. Look the id up
# in `topics` so the report stays correct even if the returned list ever skips a
# topic (gensim may omit topics with vanishingly small probability).
t = np.argmax(probabilities)
print("Topic with highest probability: %d (%f)" % (topics[t], probabilities[t]))

Topic,Probabilities
0,0.172373
1,0.0142661
2,0.0366336
3,0.000343127
4,0.000314115
5,0.0978113
6,0.000276459
7,0.159664
8,0.514234
9,0.00408484


Topic with highest probability: 8 (0.514234)


In [18]:
# Show the first 2500 characters of the selected transcript. `transcripts` holds
# the regex-cleaned text, so punctuation and capitalized words are absent.
print(transcripts[sample][:2500])

 going to present three projects in rapid fire dont have much time to do it want to reinforce three ideas with that rapid fire presentation first is what like to call a hyper rational process a process that takes rationality almost to an absurd level and it transcends all the baggage that normally comes with what people would call sort of a rational conclusion to something it concludes in something that you see here that you actually wouldnt expect as being the result of rationality second the second is that this process does not have a signature is no authorship are obsessed with authorship is something that has editing and it has teams but in fact we no longer see within this process the traditional master architect creating a sketch that his minions carry out the third is that it challenges and this is in the length of this very hard to support why connect all these things but it challenges the high modernist notion of flexibility modernists said we will create sort of singular spac

In this sample, topic 8, which represents art, has the highest probability (0.51). The transcript is consistent with this assignment, since it contains terms like "modernists", "design", and "diagram".

In [19]:
# Inspect the topic mixture the model assigns to one transcript.
sample = 31
# minimum_probability = 0 requests even the least likely topics.
topic_dist = lda_model.get_document_topics(corpus[sample], minimum_probability = 0)
probabilities = [probability for _, probability in topic_dist]
topics = [topic_id for topic_id, _ in topic_dist]
topic_dist_table = Table().with_columns('Topic', topics, 'Probabilities', probabilities)
topic_dist_table.show(20)
# np.argmax returns a position in `probabilities`, not a topic id. Look the id up
# in `topics` so the report stays correct even if the returned list ever skips a
# topic (gensim may omit topics with vanishingly small probability).
t = np.argmax(probabilities)
print("Topic with highest probability: %d (%f)" % (topics[t], probabilities[t]))

Topic,Probabilities
0,0.00560299
1,0.0331117
2,0.00944252
3,0.00036272
4,0.641746
5,0.139144
6,0.0233119
7,0.098927
8,0.00981698
9,0.0385338


Topic with highest probability: 4 (0.641746)


In [20]:
# Show the first 2500 characters of the selected transcript. `transcripts` holds
# the regex-cleaned text, so punctuation and capitalized words are absent.
print(transcripts[sample][:2500])

 you really an honor and a privilege to be here spending my last day as a teenager want to talk to you about the future but first going to tell you a bit about the past story starts way before was born grandmother was on a train to the death camp she was going along the tracks and the tracks split somehow we dont really know exactly the whole story but the train took the wrong track and went to a work camp rather than the death camp grandmother survived and married my grandfather were living in and my mother was born when my mother was two years old the revolution was raging and they decided to escape got on a boat and yet another divergence the boat was either going to or to got on and didnt know where they were going and ended up in to make a long story short they came to grandmother was a chemist worked at the in and at 44 she died of stomach cancer never met my grandmother but carry on her name her exact name and like to think carry on her scientific passion too found this passion 

The topic with the highest probability (0.64) is topic 4, which represents medicine. The transcript of the talk is consistent with this assignment: terms like "cancer", "medical", and "research" all appear in this TED talk.