In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import gensim

from wordcloud import WordCloud
from gensim import corpora
from gensim import models
from gensim.corpora.dictionary import Dictionary

from collections import defaultdict
from gensim.parsing.preprocessing import STOPWORDS

In [2]:
import string
import logging

In [3]:
# Name of the pickled DataFrame inside ``dfs/``; also consulted further down
# to add dataset-specific stopwords.
dfname = 'FP4'

# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
df1 = pd.read_pickle('dfs/' + dfname)

# Keep only rows where both text fields are present and build the combined
# text column in a single chained expression.  Using ``assign`` on the result
# of ``dropna`` avoids writing a new column into a frame that was derived from
# ``df1`` by selection, which is the pattern that triggers pandas'
# SettingWithCopyWarning, and makes the cell safe to re-run.
df = (
    df1[['title', 'objective']]
    .dropna(how='any')
    .assign(merged=lambda frame: frame['title'] + ' ' + frame['objective'])
)

In [4]:
# Text to be modelled: one string per row, title and objective joined by a space.
objectives = df['merged']
# Disabled alternative: model the 'objective' column of several frames at once.
# NOTE(review): df4, df5, df6, df7 and df20 are not defined in the visible cells.
# objectives = pd.concat([ df['objective'] for df in [df4, df5, df6, df7, df20] ])

In [5]:
# Regex alternation matching any single character of ``string.punctuation``;
# every character is escaped so metacharacters such as '.' or '|' match
# themselves.
RE_PUNCTUATION = '|'.join([re.escape(x) for x in string.punctuation])

objectives = (
    objectives.str.lower()
    # '%l' is removed as a plain substring.
    # NOTE(review): presumably an artefact of the source data -- confirm.
    .str.replace('%l', '', regex=False)
    # ``regex=True`` must be explicit: from pandas 2.0 on, ``str.replace``
    # treats the pattern as a literal string by default, in which case the
    # alternation would never match and punctuation would be left in place.
    # (The ``regex`` keyword requires pandas >= 0.23.)
    .str.replace(RE_PUNCTUATION, ' ', regex=True)
)

In [6]:
# Tokenise on whitespace, then keep only tokens that are longer than two
# characters and are not made up purely of digits.
objectives_split = (
    objectives.str.strip()
    .str.split()
    .apply(lambda words: [w for w in words if len(w) > 2 and not w.isdigit()])
)
objectives_split.head(2)

0                                       [spot, tation]
1    [formation, and, occurrence, nitrous, acd, the...
Name: merged, dtype: object

In [7]:
# Corpus-specific stopwords that are added on top of gensim's STOPWORDS.
list_stopwords = [
    'data', 'will', 'develop', 'development', 'project', 'research',
    'new', 'use', 'europe', 'european', 'based',
]
# The FP4 dataset gets one extra stopword.
# NOTE(review): presumably a frequent non-English word in FP4 -- confirm.
if dfname == 'FP4':
    list_stopwords += ['des']

additional_stopwords = set(list_stopwords)
stopwords = additional_stopwords.union(STOPWORDS)

# Remove every stopword from every document.
objectives_split = objectives_split.apply(
    lambda doc: [word for word in doc if word not in stopwords]
)
objectives_split.head(2)

0                                       [spot, tation]
1    [formation, occurrence, nitrous, acd, atmosphe...
Name: merged, dtype: object

In [8]:
# Corpus-wide occurrence count for every token.
frequency = defaultdict(int)
for token in (tok for doc in objectives_split for tok in doc):
    frequency[token] += 1


def _frequent_only(doc):
    """Return the tokens of ``doc`` that occur more than 5 times in the corpus."""
    return [tok for tok in doc if frequency[tok] > 5]


# Drop rare tokens, then build the token <-> id mapping from what remains.
objectives_split = objectives_split.apply(_frequent_only)

objectives_dictionary = Dictionary(objectives_split)

In [9]:
class ObjectivesCorpus(corpora.textcorpus.TextCorpus):
    """Corpus over documents that have already been tokenised.

    ``self.input`` is iterated as-is by ``get_texts``; each item is treated as
    one document (a sequence of tokens) and no further preprocessing is done
    here.
    """

    def get_texts(self):
        """Return an iterator over the pre-tokenised documents in ``self.input``."""
        return iter(self.input)

    def __calc_corpus_size__(self):
        """Count documents and tokens in one pass and cache both totals.

        The totals are stored in ``self.length`` and ``self.num_words`` only
        after the pass has finished, so an error while iterating does not
        leave partial counts behind.
        """
        logging.info('Calculating corpus size')
        length = 0
        num_words = 0
        for doc in self.get_texts():
            length += 1
            num_words += len(doc)
        self.length = length
        self.num_words = num_words

    def __len__(self):
        """Define this so we can use `len(corpus)`"""
        # Test for None rather than for attribute presence: some gensim
        # versions pre-set ``self.length = None`` in ``TextCorpus.__init__``,
        # and a bare ``'length' not in self.__dict__`` check would then hand
        # None back to ``len()`` and raise TypeError.
        if getattr(self, 'length', None) is None:
            self.__calc_corpus_size__()
        return self.length

    def __str__(self):
        """Return a summary of the form '<n> documents, <m> words'."""
        if getattr(self, 'num_words', None) is None:
            self.__calc_corpus_size__()
        return (str(self.length) + ' documents, ' + str(self.num_words)
                + ' words')


objectives_corpus = ObjectivesCorpus(objectives_split)
# Use the very dictionary that is handed to LdaModel as ``id2word`` so the
# token ids produced by the corpus and the ids known to the model are
# guaranteed to be the same mapping.
objectives_corpus.dictionary = objectives_dictionary

In [10]:
# Fit a 10-topic LDA model on the bag-of-words corpus.
# ``random_state`` receives the seed itself: ``np.random.seed(12)`` returns
# None, so the previous call passed ``random_state=None`` and was only
# reproducible through the side effect of reseeding NumPy's global generator.
lda = gensim.models.ldamodel.LdaModel(corpus=objectives_corpus,
                                      id2word=objectives_dictionary,
                                      num_topics=10,
                                      passes=2,
                                      random_state=12,
                                      iterations=7000)

In [11]:
# Display the fitted topics as (topic id, weighted-term string) pairs.
lda.show_topics()

[(0,
  '0.024*"energy" + 0.014*"systems" + 0.011*"power" + 0.011*"control" + 0.010*"low" + 0.009*"technology" + 0.008*"processing" + 0.008*"design" + 0.008*"integrated" + 0.007*"heat"'),
 (1,
  '0.022*"fish" + 0.014*"cell" + 0.013*"disease" + 0.011*"study" + 0.010*"role" + 0.009*"protein" + 0.008*"growth" + 0.008*"diseases" + 0.008*"cells" + 0.008*"molecular"'),
 (2,
  '0.010*"test" + 0.009*"methods" + 0.007*"testing" + 0.007*"products" + 0.007*"method" + 0.007*"design" + 0.006*"industrial" + 0.006*"materials" + 0.006*"control" + 0.006*"tests"'),
 (3,
  '0.021*"high" + 0.021*"water" + 0.020*"production" + 0.012*"chemical" + 0.012*"food" + 0.010*"materials" + 0.010*"control" + 0.010*"solar" + 0.010*"optical" + 0.009*"quality"'),
 (4,
  '0.027*"food" + 0.017*"health" + 0.012*"industry" + 0.010*"action" + 0.010*"packaging" + 0.009*"technology" + 0.008*"countries" + 0.008*"network" + 0.008*"concerted" + 0.007*"wood"'),
 (5,
  '0.013*"models" + 0.010*"analysis" + 0.008*"innovation" + 0.008*

In [18]:
# pyLDAvis and its gensim adapter; the adapter submodule is imported
# explicitly because ``pyLDAvis.gensim.prepare`` is called in the next cell.
# NOTE(review): these imports sit far below the main import cell -- consider
# moving them to the top so a fresh "Run All" exposes every dependency early.
import pyLDAvis
import pyLDAvis.gensim

# NOTE(review): presumably switches pyLDAvis to inline notebook rendering -- confirm.
pyLDAvis.enable_notebook()

In [19]:

# Build the pyLDAvis data from the fitted model, the corpus and the dictionary;
# as the last expression of the cell, the result is what the notebook displays.
pyLDAvis.gensim.prepare(lda, objectives_corpus, objectives_dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]
