In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import gensim

from wordcloud import WordCloud
from gensim import corpora
from gensim import models
from gensim.corpora.dictionary import Dictionary

from collections import defaultdict
from gensim.parsing.preprocessing import STOPWORDS

In [2]:
import string
import logging

In [3]:
# Name of the pickled DataFrame to analyse; later cells also branch on this value.
dfname = 'FP4'

# NOTE(review): unpickling executes arbitrary code, so only load files from a trusted source.
df1 = pd.read_pickle('dfs/' + dfname)

# Keep only the two text columns and discard every row where either one is missing.
df = df1[['title', 'objective']].dropna(how='any')

# Join title and objective into one text field, separated by a single space.
df = df.assign(merged=df['title'] + ' ' + df['objective'])

In [4]:
# Work on the combined "title + objective" text of each record.
objectives = df['merged']
# Disabled alternative kept for reference: pool the 'objective' column of several
# frames (df4 ... df20 are not defined in any visible cell).
# objectives = pd.concat([ df['objective'] for df in [df4, df5, df6, df7, df20] ])

In [5]:
# Regex alternation of every ASCII punctuation character; each one is escaped so
# that it is matched literally when the pattern is interpreted as a regex.
RE_PUNCTUATION = '|'.join(re.escape(char) for char in string.punctuation)

# Normalise the text: lower-case it, remove the literal sequence '%l'
# (presumably markup residue in the source data -- TODO confirm), then turn every
# punctuation character into a space so that adjacent words stay separated.
#
# `regex=` is spelled out on both calls: the default of Series.str.replace differs
# between pandas versions, and with literal matching the alternation pattern above
# would never match, leaving all punctuation in place without any error.
objectives = (
    objectives.str.lower()
    .str.replace('%l', '', regex=False)
    .str.replace(RE_PUNCTUATION, ' ', regex=True)
)

In [6]:
def _is_informative(token):
    """Return True for tokens longer than two characters that are not purely digits."""
    return len(token) > 2 and not token.isdigit()


# Split each document on whitespace and keep only the informative tokens.
objectives_split = (
    objectives.str.strip()
    .str.split()
    .apply(lambda words: [word for word in words if _is_informative(word)])
)
objectives_split.head(2)

0                                       [spot, tation]
1    [formation, and, occurrence, nitrous, acd, the...
Name: merged, dtype: object

In [8]:
# Extra stop words added on top of gensim's built-in STOPWORDS list.
list_stopwords = [
    'data', 'will', 'develop', 'development', 'project', 'research',
    'new', 'use', 'europe', 'european', 'based',
]
# The FP4 dataset gets one additional stop word
# (NOTE(review): presumably non-English residue in that dataset -- confirm).
if dfname == 'FP4':
    list_stopwords += ['des']

additional_stopwords = set(list_stopwords)
# Final stop-word set: gensim's defaults combined with the additions above.
stopwords = additional_stopwords.union(STOPWORDS)


def _without_stopwords(words):
    """Return the tokens of one document that are not in `stopwords`."""
    return [word for word in words if word not in stopwords]


objectives_split = objectives_split.apply(_without_stopwords)
objectives_split.head(2)

0                                       [spot, tation]
1    [formation, occurrence, nitrous, acd, atmosphe...
Name: merged, dtype: object

In [9]:
# Count how often every token occurs across the whole collection.
frequency = defaultdict(int)
for word in (w for words in objectives_split for w in words):
    frequency[word] += 1


def _frequent_only(words):
    """Return the tokens of one document that occur more than 5 times overall."""
    return [word for word in words if frequency[word] > 5]


# Drop rare tokens, then build the token <-> id mapping from what is left.
objectives_split = objectives_split.apply(_frequent_only)

objectives_dictionary = Dictionary(objectives_split)

In [10]:
class ObjectivesCorpus(corpora.textcorpus.TextCorpus):
    """Corpus over documents that are already tokenised.

    `get_texts` hands back the items of `self.input` unchanged, so the input is
    expected to be an iterable of token lists. `self.input` is not assigned in
    this class; it is presumably set by the base-class constructor.

    The document and word counts are computed lazily and cached in
    `self.length` and `self.num_words`.
    """

    def get_texts(self):
        """Return an iterator over the documents in `self.input`."""
        return iter(self.input)

    def __calc_corpus_size__(self):
        """Count documents and tokens in one pass and cache both totals."""
        logging.info('Calculating corpus size')
        n_docs = 0
        n_words = 0
        for doc in self.get_texts():
            n_docs += 1
            n_words += len(doc)
        # Assign only after the full pass so an interrupted count never leaves
        # partially accumulated totals behind.
        self.length = n_docs
        self.num_words = n_words

    def __len__(self):
        """Define this so we can use `len(corpus)`"""
        # Treat a missing attribute and a None value alike: the base class may
        # pre-initialise `length` to None, and a plain membership test on
        # __dict__ would then skip the computation and return None.
        if getattr(self, 'length', None) is None:
            self.__calc_corpus_size__()
        return self.length

    def __str__(self):
        """Return '<documents> documents, <words> words' for this corpus."""
        if (getattr(self, 'num_words', None) is None
                or getattr(self, 'length', None) is None):
            self.__calc_corpus_size__()
        return (str(self.length) + ' documents, ' + str(self.num_words)
                + ' words')
            

# Wrap the filtered token lists in the corpus class so they can be passed to the
# model-fitting and visualisation calls as a single corpus object.
objectives_corpus = ObjectivesCorpus(objectives_split)

In [11]:
# Fit a 10-topic LDA model on the objectives corpus.
#
# `random_state` receives the integer seed itself. The previous expression
# `np.random.seed(12)` evaluates to None, so no seed reached the model and
# reproducibility depended on the state of the global NumPy generator at the
# moment this cell ran.
lda = gensim.models.ldamodel.LdaModel(corpus=objectives_corpus,
                                      id2word=objectives_dictionary,
                                      num_topics=10,
                                      passes=2,
                                      random_state=12,
                                      iterations=7000)

In [12]:
# Display the fitted topics; the output below lists (topic id, weighted top-words string) pairs.
lda.show_topics()

[(0,
  u'0.016*"quality" + 0.013*"software" + 0.013*"process" + 0.010*"design" + 0.010*"systems" + 0.009*"control" + 0.008*"management" + 0.007*"improvement" + 0.007*"cost" + 0.006*"test"'),
 (1,
  u'0.013*"fish" + 0.013*"water" + 0.010*"plant" + 0.009*"species" + 0.009*"production" + 0.009*"chemical" + 0.008*"food" + 0.007*"biological" + 0.006*"control" + 0.006*"plants"'),
 (2,
  u'0.010*"study" + 0.010*"cell" + 0.009*"molecular" + 0.008*"genetic" + 0.008*"aquaculture" + 0.007*"control" + 0.007*"wind" + 0.007*"role" + 0.007*"cells" + 0.006*"neutron"'),
 (3,
  u'0.021*"information" + 0.011*"transport" + 0.009*"network" + 0.009*"health" + 0.008*"services" + 0.007*"support" + 0.007*"results" + 0.007*"public" + 0.006*"countries" + 0.006*"programme"'),
 (4,
  u'0.010*"concerted" + 0.010*"systems" + 0.010*"satellite" + 0.008*"building" + 0.007*"atm" + 0.006*"fisheries" + 0.006*"integrated" + 0.006*"farm" + 0.006*"digital" + 0.006*"video"'),
 (5,
  u'0.020*"high" + 0.010*"power" + 0.010*"ene

In [13]:
from __future__ import division
import graphlab as gl
import pyLDAvis
import pyLDAvis.graphlab



This non-commercial license of GraphLab Create for academic use is assigned to iliadi15@aueb.gr and will expire on June 01, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1496321908.log
INFO:graphlab.cython.cy_server:GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1496321908.log


In [14]:
# Switch on pyLDAvis notebook integration before any visualisation is prepared
# (NOTE(review): exact display behaviour depends on the installed pyLDAvis version).
pyLDAvis.enable_notebook()

In [15]:
# gensim adapter for pyLDAvis.
# NOTE(review): newer pyLDAvis releases may expose this module under a different name -- confirm.
import pyLDAvis.gensim

# Build the visualisation from the fitted model, the corpus it was trained on and
# the matching dictionary. As the cell's last expression, the result is rendered
# by the notebook.
pyLDAvis.gensim.prepare(lda, objectives_corpus, objectives_dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]
