In [1]:
from collections import defaultdict
import pandas as pd
import gzip

In [2]:
# Gzipped topic files; each one is passed to ExtractTopics further down.
# NOTE(review): CONGOV/PROGOV presumably stand for the against-/pro-government
# groups, and the 15/13 suffixes presumably match the dates in the hashtags
# visible in the output (vemprarua15demarco / dia13diadeluta) - confirm.
CONGOV_LDA = 'tw_topicos_lda_15.txt.gz'
CONGOV_LSI = 'tw_topicos_lsi_15.txt.gz'

PROGOV_LDA = 'tw_topicos_lda_13.txt.gz'
PROGOV_LSI = 'tw_topicos_lsi_13.txt.gz'

In [71]:
def ExtractTopics(filename):
    """Collect the distinct topic strings stored in a gzipped text file.

    Every line is split on the ' $&% ' separator and only its last field is
    looked at; the field is kept when it contains a '*' (the separator
    between weight and word that TopWords parses).

    Args:
        filename: path to a gzip-compressed text file.

    Returns:
        set of str: the unique, whitespace-stripped last fields containing '*'.
    """
    topic = set()
    # The context manager closes the file even if reading or decoding fails.
    with gzip.open(filename) as f:
        for l in f:
            # gzip yields bytes on Python 3 but str on Python 2; decode only
            # in the former case so the str separator below works on both.
            # NOTE(review): UTF-8 is assumed for the file contents - confirm.
            if not isinstance(l, str):
                l = l.decode('utf-8')
            t = l.split(' $&% ')[-1].strip()
            if '*' in t:
                topic.add(t)
    return topic

def TopWords(topic,thr):
    """Rank words by the highest weight they reach in any topic string.

    Args:
        topic: iterable of topic strings, each made of 'weight*word' terms
            joined by '+'.
        thr: maximum number of entries to return.

    Returns:
        list of str: at most `thr` entries formatted as 'word (weight)',
        ordered by descending weight, with double quotes removed from the word.

    Raises:
        ValueError: if a term has no '*' or its weight is not a number.
    """
    words = defaultdict(float)
    for t in topic:
        for term in t.split('+'):
            # Split on the first '*' only, so a '*' inside the word itself
            # cannot break the unpacking.
            prob, w = term.split('*', 1)
            prob = float(prob)
            # Splitting on '+' leaves any padding around it attached to the
            # word; strip it so the same word is not tracked under two keys
            # depending on its position inside the topic string.
            w = w.strip()
            # defaultdict(float) starts every word at 0.0, so a word that only
            # ever appears with negative weights is reported as 0.0.
            if words[w] < prob:
                words[w] = prob
    ranked = sorted(words.items(), key=lambda item: item[1], reverse=True)
    # Build a real list (not a lazy map object) so the result can be used
    # directly as a DataFrame column on Python 3 as well.
    return ['{0} ({1})'.format(w.replace('"', ''), prob) for w, prob in ranked[:thr]]

In [75]:
# Ten highest-weighted words per model, read from the PROGOV_* files.
# The LSI results are stored under the "PLSI" key.
progov = {}
progov["PLSI"] = TopWords(ExtractTopics(PROGOV_LSI), 10)
progov["LDA"] = TopWords(ExtractTopics(PROGOV_LDA), 10)

# Same extraction for the CONGOV_* files.
congov = {}
congov["PLSI"] = TopWords(ExtractTopics(CONGOV_LSI), 10)
congov["LDA"] = TopWords(ExtractTopics(CONGOV_LDA), 10)

In [76]:
# Turn each {model: top-word list} dict into a table with one column per model.
progov_df, congov_df = (
    pd.DataFrame(words_by_model) for words_by_model in (progov, congov)
)

In [79]:
# Emit the LaTeX source of both tables. print() is used on purpose so the raw
# markup (not a rich DataFrame repr) is shown; the parenthesised single-argument
# form behaves the same on Python 2 and Python 3.
print(progov_df.to_latex())
print(congov_df.to_latex())

\begin{tabular}{lll}
\toprule
{} &                             LDA &                            PLSI \\
\midrule
0 &          globogolpista  (0.293) &         globoquergolpe  (0.824) \\
1 &         dia13diadeluta  (0.246) &  domingoeunaovouporque  (0.569) \\
2 &  domingoeunaovouporque  (0.184) &          dia13diadeluta  (0.54) \\
3 &                  dilma  (0.171) &     vemprarua15demarco  (0.532) \\
4 &              foradilma  (0.144) &                 defesa  (0.478) \\
5 &                   povo  (0.138) &          globogolpista  (0.453) \\
6 &     vemprarua15demarco  (0.138) &                    dia  (0.392) \\
7 &                     pt  (0.136) &                 apoiou  (0.367) \\
8 &         globoquergolpe  (0.135) &                 sonega  (0.367) \\
9 &                  vamos  (0.129) &               boicotou  (0.366) \\
\bottomrule
\end{tabular}

\begin{tabular}{lll}
\toprule
{} &                              LDA &                           PLSI \\
\midrule
0 &              