## Settings

In [1]:
# Name of the analysed script; it is inserted into the plot titles
subject = 'Star Wars Episode IV'

# How many topics the LSI and LDA models extract
nTopics = 8

## Import Libraries

In [2]:
# To store data
import pandas as pd

# To do linear algebra
import numpy as np

# To plot graphs
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from matplotlib.colors import rgb2hex

# To create nicer graphs
import seaborn as sns

# To create interactive graphs
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

# To vectorize texts
from sklearn.feature_extraction.text import CountVectorizer
# To decompose texts
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
# To visualize high dimensional dataset
from sklearn.manifold import TSNE

# To tag words
from textblob import TextBlob

# To use new datatypes
from collections import Counter

## Load Data

In [3]:
# Read the whitespace-delimited script file (first line holds the column names)
# and rename the dialogue column to 'text'.
# sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
df = pd.read_table(
    '../star-wars-movie-scripts/SW_EpisodeIV.txt',
    sep=r'\s+',
    header=0,
    escapechar='\\',
).rename(columns={'dialogue': 'text'})
print('DataFrame Shape: {}'.format(df.shape))
df.head()

DataFrame Shape: (1010, 2)


Unnamed: 0,character,text
1,THREEPIO,Did you hear that? They've shut down the main...
2,THREEPIO,We're doomed!
3,THREEPIO,There'll be no escape for the Princess this time.
4,THREEPIO,What's that?
5,THREEPIO,I should have known better than to trust the l...


## Vectorize Texts

In [4]:
# Create vectorizer (English stop words are dropped)
countVectorizer = CountVectorizer(stop_words='english')

# Vectorize text. Apostrophes are removed first so contractions stay one token
# ("don't" -> "dont"); regex=False pins a literal replacement regardless of the
# pandas version's default for the `regex` argument.
vectorizedText = countVectorizer.fit_transform(
    df['text'].str.replace("'", '', regex=False).values
)
print('Shape Vectorized Text: {}'.format(vectorizedText.shape))

Shape Vectorized Text: (1010, 1581)


## Plot n Most Frequent Words

In [5]:
# How many of the most frequent words the bar chart below shows
n = 20



def nMostFrequentWords(n, countVectorizer, vectorizedText):
    """Return the ``n`` most frequent vocabulary words with their total counts.

    Parameters
    ----------
    n : int
        Maximum number of words to return. Values outside
        ``[0, vocabulary size]`` are clamped instead of raising.
    countVectorizer : fitted vectorizer
        Only its ``vocabulary_`` mapping (word -> column index) is used.
    vectorizedText : matrix of shape (documents, words)
        Document-term counts; a scipy sparse matrix or a dense array.

    Returns
    -------
    tuple(list, list)
        ``(words, counts)`` ordered from most to least frequent. Ties are
        broken by ascending column index so the result is deterministic.
    """
    # Count word appearances over all documents. Sparse sums come back as a
    # 1 x words np.matrix, so flatten to a plain 1-D array.
    totals = np.asarray(vectorizedText.sum(axis=0)).ravel()

    # Never ask for more words than the vocabulary holds
    n = max(0, min(int(n), totals.shape[0]))

    # Stable sort on the negated counts = descending order, deterministic ties
    order = np.argsort(-totals, kind='stable')[:n]

    # Look the words up directly; no ASCII round-trip, so non-ASCII tokens
    # are returned unchanged instead of raising UnicodeEncodeError
    indexToWord = {int(index): word for word, index in countVectorizer.vocabulary_.items()}
    words = [indexToWord[index] for index in order.tolist()]

    # Return words and word-counts
    return (words, totals[order].tolist())



# Get most frequent words with wordcounts
words, wordCounts = nMostFrequentWords(n=n, countVectorizer=countVectorizer, vectorizedText=vectorizedText)

# Create colormap: one evenly spaced viridis color per bar.
# np.linspace always yields exactly len(words) values and, unlike a float-step
# np.arange with 1/(n-1), does not divide by zero when only one bar is shown.
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(fraction)) for fraction in np.linspace(0, 1, len(words))]

# Create plot
data = go.Bar(x = words,
              y = wordCounts,
              marker = dict(color = colors))

layout = go.Layout(title = 'Most Frequent {} Words In {}'.format(n, subject),
                   xaxis = dict(title = 'Words'),
                   yaxis = dict(title = 'Count'))

fig = go.Figure(data=[data], layout=layout)
iplot(fig)

## Word-Tags

In [6]:
# Part-of-speech tag -> "description; examples" lookup, used as hover text
# for the tag bar chart.
tag_dict = {
    # Conjunctions, numbers, determiners and other function words
    'CC': 'conjunction, coordinating; and, or, but',
    'CD': 'cardinal number; five, three, 13%',
    'DT': 'determiner; the, a, these',
    'EX': 'existential there; there were six boys',
    'FW': 'foreign word; mais',
    'IN': 'conjunction, subordinating or preposition; of, on, before, unless',
    # Adjectives
    'JJ': 'adjective; nice, easy',
    'JJR': 'adjective, comparative; nicer, easier',
    'JJS': 'adjective, superlative; nicest, easiest',
    'LS': 'list item marker; ',
    'MD': 'verb, modal auxillary; may, should',
    # Nouns
    'NN': 'noun, singular or mass; tiger, chair, laughter',
    'NNS': 'noun, plural; tigers, chairs, insects',
    'NNP': 'noun, proper singular; Germany, God, Alice',
    'NNPS': 'noun, proper plural; we met two Christmases ago',
    'PDT': 'predeterminer; both his children',
    'POS': "possessive ending; 's",
    # Pronouns
    'PRP': 'pronoun, personal; me, you, it',
    'PRP$': 'pronoun, possessive; my, your, our',
    # Adverbs and particles
    'RB': 'adverb; extremely, loudly, hard',
    'RBR': 'adverb, comparative; better',
    'RBS': 'adverb, superlative; best',
    'RP': 'adverb, particle; about, off, up',
    'SYM': 'symbol; %',
    'TO': 'infinitival to; what to do?',
    'UH': 'interjection; oh, oops, gosh',
    # Verbs
    'VB': 'verb, base form; think',
    'VBZ': 'verb, 3rd person singular present; she thinks',
    'VBP': 'verb, non-3rd person singular present; I think',
    'VBD': 'verb, past tense; they thought',
    'VBN': 'verb, past participle; a sunken ship',
    'VBG': 'verb, gerund or present participle; thinking is fun',
    # Wh-words
    'WDT': 'wh-determiner; which, whatever, whichever',
    'WP': 'wh-pronoun, personal; what, who, whom',
    'WP$': 'wh-pronoun, possessive; whose, whosever',
    'WRB': 'wh-adverb; where, when',
}

In [7]:
# Tag every word of every dialogue line and count the tags, most frequent first.
# Counting directly avoids the padded intermediate frame and does not depend on
# the default column labels that value_counts().reset_index() produces.
tagCounts = Counter(tag for text in df['text'] for _, tag in TextBlob(text).pos_tags)
tag_df = pd.DataFrame(tagCounts.most_common(), columns=['tag', 'count'])


# Create colormap: one evenly spaced color per tag. np.linspace also works for
# a single tag, where 1/(n-1) would divide by zero. A dedicated name is used so
# the global `n` of the word plot is not overwritten.
nTags = tag_df.shape[0]
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(fraction)) for fraction in np.linspace(0, 1, nTags)]

# Create plot; the hover text shows the tag description, or the raw tag when
# it has no entry in tag_dict
data = go.Bar(x = tag_df['tag'],
              y = tag_df['count'],
              text = tag_df['tag'].map(lambda tag: tag_dict.get(tag, tag)),
              marker = dict(color = colors))

layout = go.Layout(title = 'Most Frequent Tags In {}'.format(subject),
                   xaxis = dict(title = 'Type Of Word'),
                   yaxis = dict(title = 'Count'))

fig = go.Figure(data=[data], layout=layout)
iplot(fig)

## Latent Semantic Indexing/Analysis LSI/LSA

In [8]:
# Create LSI and fit. The model is seeded (like the LDA and t-SNE models in
# this notebook) so that a rerun yields the same topics.
lsiModel = TruncatedSVD(n_components=nTopics, random_state=0)
lsiTopicMatrix = lsiModel.fit_transform(vectorizedText)
print('Shape LSI Topic Matrix: {}'.format(lsiTopicMatrix.shape))

# Assign every line to the topic with the largest component
lsiKeys = lsiTopicMatrix.argmax(axis=1)

# Count the lines per topic. np.bincount keeps topics that received no line
# (count 0), so categories and counts always have nTopics entries and stay
# aligned with per-topic labels and colors.
lsiCategories = tuple(range(nTopics))
lsiCounts = tuple(np.bincount(lsiKeys, minlength=nTopics).tolist())

Shape LSI Topic Matrix: (1010, 8)


In [9]:
def getTopWords(n, lsiKeys, vectorizedText, countVectorizer, numTopics=None):
    """Return, per topic, a comma-separated string of its most frequent words.

    Parameters
    ----------
    n : int
        Maximum number of words per topic. ``n <= 0`` yields empty strings.
    lsiKeys : sequence of int
        Topic index assigned to each document (row of ``vectorizedText``).
        Despite the name, any topic assignment works (the notebook also passes
        the LDA keys). Keys outside ``range(numTopics)`` are ignored.
    vectorizedText : matrix of shape (documents, words)
        Document-term counts; a scipy sparse matrix or a dense array.
    countVectorizer : fitted vectorizer
        Only its ``vocabulary_`` mapping (word -> column index) is used.
    numTopics : int, optional
        Number of topics. Defaults to the notebook-level ``nTopics``.

    Returns
    -------
    list of str
        One entry per topic, words joined by ``', '`` from most to least
        frequent. Words that never occur in a topic are left out, so a topic
        without documents gives ``''`` rather than arbitrary vocabulary words.
    """
    if numTopics is None:
        numTopics = nTopics

    topicKeys = np.asarray(lsiKeys)
    vocabularySize = vectorizedText.shape[1]
    n = max(0, min(int(n), vocabularySize))

    # Column index -> word lookup
    indexToWord = {int(index): word for word, index in countVectorizer.vocabulary_.items()}

    # Store all words for all topics
    topWords = []
    for topic in range(numTopics):
        rows = np.flatnonzero(topicKeys == topic)
        if rows.size == 0:
            # No document belongs to this topic
            topWords.append('')
            continue

        # Sum the word counts of all documents of the topic in one step;
        # np.asarray(...).ravel() flattens the np.matrix a sparse sum returns
        wordSum = np.asarray(vectorizedText[rows].sum(axis=0)).ravel()

        # Descending order; the stable sort makes ties deterministic
        ranked = np.argsort(-wordSum, kind='stable')[:n]

        topicWords = [indexToWord[index] for index in ranked.tolist() if wordSum[index] > 0]
        topWords.append(', '.join(topicWords))

    return topWords

In [10]:
# Five most frequent words of every LSI topic
topWords = getTopWords(5, lsiKeys, vectorizedText, countVectorizer)

# One output line per topic: its index followed by its words
for topicIndex, topicWords in enumerate(topWords):
    print('Topic {}: {}'.format(topicIndex, topicWords))

Topic 0: going, dont, know, youre, think
Topic 1: im, going, level, detention, trying
Topic 2: wan, obi, help, kenobi, ben
Topic 3: luke, hold, force, pull, hurry
Topic 4: dont, sir, forget, worry, force
Topic 5: right, got, ive, ill, artoo
Topic 6: come, red, threepio, standing, leader
Topic 7: ship, station, rebel, time, power


In [11]:
# Sort data by topic index
lsiCategoriesSorted, lsiCountsSorted = zip(*sorted(zip(lsiCategories, lsiCounts)))

# Create labels
topWords = getTopWords(3, lsiKeys, vectorizedText, countVectorizer)
labels = ['Topic {}'.format(i) for i in lsiCategoriesSorted]

# Create colormap: one evenly spaced color per topic index. np.linspace also
# works for a single topic, where 1/(n-1) would divide by zero.
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(fraction)) for fraction in np.linspace(0, 1, nTopics)]

# Create plot. Hover text and colors are looked up by topic index, so they stay
# attached to the right bar even if some topic is absent from the categories.
data = go.Bar(x = labels,
              y = lsiCountsSorted,
              text = [topWords[i] for i in lsiCategoriesSorted],
              marker = dict(color = [colors[i] for i in lsiCategoriesSorted]))

layout = go.Layout(title = 'Most Frequent LSI Topics In {}'.format(subject),
                   xaxis = dict(title = 'Topic'),
                   yaxis = dict(title = 'Count'))

fig = go.Figure(data=[data], layout=layout)
iplot(fig)

In [12]:
# Project the LSI topic matrix down to two dimensions with t-SNE for plotting
tsneModel = TSNE(
    n_components=2,
    perplexity=50,
    learning_rate=100,
    n_iter=2000,
    angle=0.75,
    random_state=0,
    verbose=1,
)
tsneModelVectors = tsneModel.fit_transform(lsiTopicMatrix)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1010 samples in 0.002s...
[t-SNE] Computed neighbors for 1010 samples in 0.103s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1010
[t-SNE] Computed conditional probabilities for sample 1010 / 1010
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 60.303593
[t-SNE] Error after 2000 iterations: 0.497552


In [13]:
def getMeanVectors(keys, twoDVectors, numTopics=None):
    """Return the mean embedding vector of every topic.

    Parameters
    ----------
    keys : sequence of int
        Topic index of each vector.
    twoDVectors : array-like of shape (samples, dims)
        Embedded vectors. The notebook passes 2-D t-SNE output, but any number
        of dimensions is accepted.
    numTopics : int, optional
        Number of topics. Defaults to the notebook-level ``nTopics``.

    Returns
    -------
    numpy.ndarray of shape (numTopics, dims)
        Row ``k`` is the mean of all vectors whose key is ``k``. Topics without
        any vector get a row of NaN (set explicitly, so no 0/0 warning).
    """
    if numTopics is None:
        numTopics = nTopics

    topicKeys = np.asarray(keys, dtype=int)
    vectors = np.asarray(twoDVectors, dtype=float)
    # Take the dimensionality from the data; fall back to 2 for empty input
    nDims = vectors.shape[1] if vectors.ndim == 2 else 2
    vectors = vectors.reshape(-1, nDims)

    # Sum and count the vectors per topic; np.add.at accumulates correctly
    # when the same topic index occurs many times
    vectorSums = np.zeros((numTopics, nDims))
    topicCount = np.zeros(numTopics)
    np.add.at(vectorSums, topicKeys, vectors)
    np.add.at(topicCount, topicKeys, 1)

    # Divide only where a topic has vectors; the rest stays NaN
    meanTopicVectors = np.full((numTopics, nDims), np.nan)
    hasVectors = topicCount > 0
    meanTopicVectors[hasVectors] = vectorSums[hasVectors] / topicCount[hasVectors, None]
    return meanTopicVectors

In [14]:
# Create colormap: one color per topic. np.linspace also works for a single
# topic, where 1/(n-1) would divide by zero.
cmap = get_cmap('tab10')
colors = [rgb2hex(cmap(fraction)) for fraction in np.linspace(0, 1, nTopics)]

# Get n top words
topWords = getTopWords(3, lsiKeys, vectorizedText, countVectorizer)
# Compute centers for topics
lsiMeanTopicVectors = getMeanVectors(lsiKeys, tsneModelVectors)


# Create plot: one scatter trace per topic
data = []
for topic in range(nTopics):
    # Mask selecting the lines assigned to this topic
    mask = lsiKeys == topic
    scatter = go.Scatter(x = tsneModelVectors[mask, 0],
                         y = tsneModelVectors[mask, 1],
                         name = 'Topic {}: {}'.format(topic, topWords[topic]),
                         mode = 'markers',
                         # Hover text is the dialogue line itself
                         text = df.loc[mask, 'text'],
                         marker = dict(color = colors[topic]))
    data.append(scatter)

layout = go.Layout(title = 't-SNE Clustering of {} LSI Topics'.format(nTopics),
                   showlegend=True,
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)

## Latent Dirichlet Allocation

In [15]:
# Create LDA and fit
ldaModel = LatentDirichletAllocation(n_components=nTopics, learning_method='online', random_state=0, verbose=0)
ldaTopicMatrix = ldaModel.fit_transform(vectorizedText)
# The label names the LDA matrix (it previously said "LSI", a copy-paste slip)
print('Shape LDA Topic Matrix: {}'.format(ldaTopicMatrix.shape))

# Assign every line to its most probable topic
ldaKeys = ldaTopicMatrix.argmax(axis=1)

# Count the lines per topic. np.bincount keeps topics that received no line
# (count 0), so categories and counts always have nTopics entries and stay
# aligned with per-topic labels and colors.
ldaCategories = tuple(range(nTopics))
ldaCounts = tuple(np.bincount(ldaKeys, minlength=nTopics).tolist())

Shape LSI Topic Matrix: (1010, 8)


In [16]:
# Five most frequent words of every LDA topic
topWords = getTopWords(5, ldaKeys, vectorizedText, countVectorizer)

# One output line per topic: its index followed by its words
for topicIndex, topicWords in enumerate(topWords):
    print('Topic {}: {}'.format(topicIndex, topicWords))

Topic 0: come, dont, red, good, theres
Topic 1: hold, im, star, way, speed
Topic 2: threepio, yes, wait, artoo, sand
Topic 3: target, beam, ben, tractor, kenobi
Topic 4: luke, going, im, youre, think
Topic 5: station, hey, battle, away, vader
Topic 6: know, dont, hes, sir, oh
Topic 7: right, im, ill, ship, want


In [17]:
# Sort data by topic index
ldaCategoriesSorted, ldaCountsSorted = zip(*sorted(zip(ldaCategories, ldaCounts)))

# Create labels
topWords = getTopWords(3, ldaKeys, vectorizedText, countVectorizer)
labels = ['Topic {}'.format(i) for i in ldaCategoriesSorted]

# Create colormap: one evenly spaced color per topic index. np.linspace also
# works for a single topic, where 1/(n-1) would divide by zero.
cmap = get_cmap('viridis')
colors = [rgb2hex(cmap(fraction)) for fraction in np.linspace(0, 1, nTopics)]

# Create plot. Hover text and colors are looked up by topic index, so they stay
# attached to the right bar even if some topic is absent from the categories.
data = go.Bar(x = labels,
              y = ldaCountsSorted,
              text = [topWords[i] for i in ldaCategoriesSorted],
              marker = dict(color = [colors[i] for i in ldaCategoriesSorted]))

layout = go.Layout(title = 'Most Frequent LDA Topics In {}'.format(subject),
                   xaxis = dict(title = 'Topic'),
                   yaxis = dict(title = 'Count'))

fig = go.Figure(data=[data], layout=layout)
iplot(fig)

In [18]:
# Project the LDA topic matrix down to two dimensions with t-SNE for plotting.
# NOTE: this rebinds tsneModel / tsneModelVectors, replacing the LSI embedding.
tsneModel = TSNE(
    n_components=2,
    perplexity=50,
    learning_rate=100,
    n_iter=2000,
    angle=0.75,
    random_state=0,
    verbose=1,
)
tsneModelVectors = tsneModel.fit_transform(ldaTopicMatrix)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1010 samples in 0.002s...
[t-SNE] Computed neighbors for 1010 samples in 0.125s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1010
[t-SNE] Computed conditional probabilities for sample 1010 / 1010
[t-SNE] Mean sigma: 0.099793
[t-SNE] KL divergence after 250 iterations with early exaggeration: 48.311043
[t-SNE] Error after 1900 iterations: 0.251934


In [19]:
# Create colormap: one color per topic. np.linspace also works for a single
# topic, where 1/(n-1) would divide by zero.
cmap = get_cmap('tab10')
colors = [rgb2hex(cmap(fraction)) for fraction in np.linspace(0, 1, nTopics)]

# Get n top words
topWords = getTopWords(3, ldaKeys, vectorizedText, countVectorizer)
# Compute centers for topics
ldaMeanTopicVectors = getMeanVectors(ldaKeys, tsneModelVectors)


# Create plot: one scatter trace per topic
data = []
for topic in range(nTopics):
    # Mask selecting the lines assigned to this topic
    mask = ldaKeys == topic
    scatter = go.Scatter(x = tsneModelVectors[mask, 0],
                         y = tsneModelVectors[mask, 1],
                         name = 'Topic {}: {}'.format(topic, topWords[topic]),
                         mode = 'markers',
                         # Hover text is the dialogue line itself
                         text = df.loc[mask, 'text'],
                         marker = dict(color = colors[topic]))
    data.append(scatter)

layout = go.Layout(title = 't-SNE Clustering of {} LDA Topics'.format(nTopics),
                   showlegend=True,
                   hovermode = 'closest')

fig = go.Figure(data=data, layout=layout)
iplot(fig)