In [1]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [2]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to /home/ajay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/ajay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [5]:
import pandas as pd
data = pd.read_csv('mpst_full_data.csv')

In [69]:
data['plot_synopsis'][0]

'Note: this synopsis is for the orginal Italian release with the segments in this certain order.Boris Karloff introduces three horror tales of the macabre and the supernatural known as the \'Three Faces of Fear\'.THE TELEPHONERosy (Michele Mercier) is an attractive, high-priced Parisian call-girl who returns to her spacious, basement apartment after an evening out when she immediately gets beset by a series of strange phone calls. The caller soon identified himself as Frank, her ex-pimp who has recently escaped from prison. Rosy is terrified for it was her testimony that landed the man in jail. Looking for solace, Rosy phones her lesbian lover Mary (Lynda Alfonsi). The two women have been estranged for some time, but Rosy is certain that she is the only one who can help her. Mary agrees to come over that night. Seconds later, Frank calls again, promising that no matter who she calls for protection, he will have his revenge. Unknown to Rosy, Mary is the caller impersonating Frank. Marry

In [53]:
import random
text_data = []

for line in data['plot_synopsis']:
#     print(line)
    tokens = prepare_text_for_lda(line)
#     if random.random() > .99:
#     print(tokens)
    text_data.append(tokens)

In [54]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [55]:
# corpus

In [57]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.015*"david" + 0.006*"scott" + 0.006*"robin" + 0.006*"arthur" + 0.005*"batman" + 0.005*"chris" + 0.004*"hamlet" + 0.004*"logan" + 0.004*"fight" + 0.004*"return"')
(1, '0.007*"kill" + 0.006*"escape" + 0.004*"force" + 0.004*"order" + 0.004*"attack" + 0.003*"take" + 0.003*"arrive" + 0.003*"agent" + 0.003*"attempt" + 0.003*"shoot"')
(2, '0.011*"tell" + 0.007*"house" + 0.006*"try" + 0.005*"find" + 0.004*"take" + 0.004*"night" + 0.004*"woman" + 0.004*"police" + 0.004*"start" + 0.004*"begin"')
(3, '0.005*"return" + 0.004*"attack" + 0.004*"power" + 0.004*"kill" + 0.004*"become" + 0.004*"world" + 0.003*"escape" + 0.003*"take" + 0.003*"fight" + 0.003*"reveal"')
(4, '0.006*"father" + 0.005*"become" + 0.005*"family" + 0.005*"friend" + 0.004*"return" + 0.004*"mother" + 0.004*"later" + 0.003*"child" + 0.003*"take" + 0.003*"however"')


In [58]:
# doc_lda = ldamodel[corpus]

In [59]:
# doc_lda[0]

In [60]:
sent_topics_df = pd.DataFrame()

# Get main topic in each document
for i, row in enumerate(ldamodel[corpus]):
#     row = sorted(row, key=lambda x: (x[1]), reverse=True)
    # Get the Dominant topic, Perc Contribution and Keywords for each document
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:  # => dominant topic
            wp = ldamodel.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
        else:
            break
sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

# Add original text to the end of the output
contents = pd.Series(text_data)
sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)


In [61]:
# df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)
df_topic_sents_keywords = sent_topics_df

In [62]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,2.0,0.5724,"tell, house, try, find, take, night, woman, po...","[synopsis, orginal, italian, release, segment,..."
1,1,3.0,0.994,"return, attack, power, kill, become, world, es...","[thousand, years, nhagruul, sorcerer, revel, c..."
2,2,0.0,0.111,"david, scott, robin, arthur, batman, chris, ha...","[matuschek, store, budapest, workplace, alfred..."
3,3,0.0,0.0232,"david, scott, robin, arthur, batman, chris, ha...","[glenn, holland, morning, person, anyone, stan..."
4,4,1.0,0.2531,"kill, escape, force, order, attack, take, arri...","[cuban, name, montana, pacino, claim, asylum, ..."
5,5,2.0,0.6435,"tell, house, try, find, take, night, woman, po...","[george, falconer, colin, firth, approach, acc..."
6,6,2.0,0.6357,"tell, house, try, find, take, night, woman, po...","[baise, tell, story, nadine, violent, spree, s..."
7,7,1.0,0.4834,"kill, escape, force, order, attack, take, arri...","[pratt, jodie, foster, propulsion, engineer, b..."
8,8,0.0,0.0247,"david, scott, robin, arthur, batman, chris, ha...","[small, italian, american, criminal, caesar, e..."
9,9,1.0,0.3733,"kill, escape, force, order, attack, take, arri...","[movie, begin, video, hands, behind, back, enf..."


In [66]:
import pickle

df_dominant_topic.to_pickle('./Topic_modeling_keywords.csv')

In [67]:
Topic_modeling_keywords = pd.read_pickle('./Topic_modeling_keywords.csv')

In [68]:
Topic_modeling_keywords.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,2.0,0.5724,"tell, house, try, find, take, night, woman, po...","[synopsis, orginal, italian, release, segment,..."
1,1,3.0,0.994,"return, attack, power, kill, become, world, es...","[thousand, years, nhagruul, sorcerer, revel, c..."
2,2,0.0,0.111,"david, scott, robin, arthur, batman, chris, ha...","[matuschek, store, budapest, workplace, alfred..."
3,3,0.0,0.0232,"david, scott, robin, arthur, batman, chris, ha...","[glenn, holland, morning, person, anyone, stan..."
4,4,1.0,0.2531,"kill, escape, force, order, attack, take, arri...","[cuban, name, montana, pacino, claim, asylum, ..."


In [19]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.006*"tell" + 0.004*"parry" + 0.004*"mother" + 0.003*"harry"')
(1, '0.007*"scarlett" + 0.005*"steve" + 0.004*"tell" + 0.004*"rhett"')
(2, '0.005*"dragon" + 0.004*"harry" + 0.004*"tell" + 0.004*"beauty"')


In [23]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 1, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=20)
for topic in topics:
    print(topic)

(0, '0.005*"tell" + 0.003*"take" + 0.003*"return" + 0.003*"later" + 0.003*"however" + 0.003*"harry" + 0.003*"kill" + 0.002*"arrive" + 0.002*"night" + 0.002*"become" + 0.002*"father" + 0.002*"find" + 0.002*"try" + 0.002*"escape" + 0.002*"mother" + 0.002*"attack" + 0.002*"police" + 0.002*"scarlett" + 0.002*"friend" + 0.002*"begin"')
