In [2]:
"""
Create LDA model using of DUC-2004 task 2 dataset
"""
import pickle
import LDA_extractor

# Load pickle containing DUC corpus
corpus = {}
with open('corpus.pkl', 'rb') as f:
        corpus = pickle.load(f)

# Create list of every article to train lda on
articles = []
for set_id in corpus.keys():
    articles = articles + corpus[set_id]['articles']

In [2]:
# Train LDA model on articles
parser = LDA_extractor.LDA_parser(articles,
                                  language='english', 
                                  preprocessor_type='spacy',
                                  num_topics = 32, 
                                  passes = 100,
                                  custom_filter = ['said'])

parser.print_topics(words_per_topic = 10) 
topic_mixtures = parser.extract_topics(max_words_per_topic=50, threshold=0.005)
print(topic_mixtures)

# extract topics as a dictionary
topics = parser.extract_topic_words(max_words_per_topic=50, threshold=0.005)
print(topics)

Initializing model...

spaCy preprocessor selected.
Fitting LDA topic modelling...
Preprocessing corpus...
Creating corpora dictionary...
Translating doc2bow corpus...
Running LDA...

Done in 406.611 seconds.
(4, '0.023*"said" + 0.010*"house" + 0.010*"republicans" + 0.009*"republican" + 0.008*"clinton" + 0.008*"people" + 0.008*"would" + 0.007*"gingrich" + 0.007*"president" + 0.006*"livingston"')
(1, '0.015*"mail" + 0.012*"post" + 0.011*"office" + 0.010*"prize" + 0.008*"railroads" + 0.008*"academy" + 0.008*"nobel" + 0.007*"letters" + 0.005*"word" + 0.005*"carry"')
(27, '0.018*"microsoft" + 0.009*"prize" + 0.008*"court" + 0.008*"case" + 0.007*"says" + 0.007*"courts" + 0.007*"antitrust" + 0.006*"justice" + 0.006*"consumers" + 0.006*"windows"')
(22, '0.017*"hun" + 0.016*"sen" + 0.014*"said" + 0.010*"party" + 0.009*"government" + 0.008*"ranariddh" + 0.008*"opposition" + 0.006*"rainsy" + 0.006*"sam" + 0.006*"new"')
(5, '0.026*"said" + 0.015*"kosovo" + 0.010*"milosevic" + 0.010*"president" + 

{0: ['said', 'plant', 'anwar', 'internet'], 1: ['mail', 'post', 'office', 'prize', 'railroads', 'academy', 'nobel', 'letters', 'word', 'carry', 'rubles'], 2: ['government', 'party', 'prodi', 'would', 'vote', 'parliament', 'coalition', 'center', 'ecevit', 'left', 'italy', 'communist', 'said', 'yilmaz', 'support', 'premier', 'new', 'confidence', 'political', 'majority', 'right', 'minister', 'form', 'elections', 'could', 'wednesday', 'leader', 'he', 'president', 'last'], 3: ['bin', 'laden', 'said', 'officials', 'terrorist', 'east', 'united', 'afghanistan', 'states', 'evidence', 'government'], 4: ['said', 'house', 'republicans', 'republican', 'clinton', 'people', 'would', 'gingrich', 'president', 'livingston', 'party', 'but', 'fire', 'impeachment', 'speaker'], 5: ['said', 'kosovo', 'milosevic', 'president', 'nato', 'yugoslavia', 'albanian', 'army', 'yugoslav', 'holbrooke', 'military', 'forces', 'province', 'serbian'], 6: ['series', 'yankees', 'game', 'world', 'martinez', 'right', 'knoblauc

In [None]:
# Train LDA model on articles
parser100 = LDA_extractor.LDA_parser(articles,
                                  language='english', 
                                  preprocessor_type='spacy',
                                  num_topics = 100, 
                                  passes = 100,
                                  custom_filter = ['said'])

parser100.print_topics(words_per_topic = 20)
topic_mixtures = parser100.extract_topics(max_words_per_topic=50, threshold=0.005)
print(topic_mixtures)

# extract topics as a dictionary
topics = parser100.extract_topic_words(max_words_per_topic=50, threshold=0.005)
print(topics)

Initializing model...

spaCy preprocessor selected.
Fitting LDA topic modelling...
Preprocessing corpus...
Creating corpora dictionary...
Translating doc2bow corpus...
Running LDA...


In [4]:
import centroid_bow
import centroid_word_embeddings
from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile

## Evaluate using Word Embeddings Trained on Common Texts (dim = 100)

In [34]:
model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=-1) 
summarizer = centroid_word_embeddings.CentroidWordEmbeddingsSummarizer(model, parser = parser)

In [35]:
SET_ID = 0
compound_article = ''.join(corpus[list(corpus.keys())[SET_ID]]['articles'])
print(summarizer.summarize(compound_article))

[(22, 0.7596901), (5, 0.06407207), (4, 0.04172601), (30, 0.037221324), (11, 0.03537914), (16, 0.022312215), (14, 0.020064304)]
Cambodian leader Hun Sen on Friday rejected opposition parties' demands for talks outside the country, accusing them of trying to ``internationalize'' the political crisis.
Government and opposition parties have asked King Norodom Sihanouk to host a summit meeting after a series of post-election negotiations between the two opposition groups and Hun Sen's party to form a new government failed.
Opposition leaders Prince Norodom Ranariddh and Sam Rainsy, citing Hun Sen's threats to arrest opposition figures after two alleged attempts on his life, said they could not negotiate freely in Cambodia and called for talks at Sihanouk's residence in Beijing.
Hun Sen, however, rejected that.


In [36]:
import rouge_evaluator3
hypos = []
refs = []
for set_id in corpus.keys():
    hypos.append(summarizer.summarize(''.join(corpus[set_id]['articles'])))
    refs.append(corpus[set_id]['summaries'])
print(rouge_evaluator3.evaluate_hypotheses(hypos, refs))

[(22, 0.75968915), (5, 0.06407024), (4, 0.041724127), (30, 0.03721896), (11, 0.035378315), (16, 0.022312785), (14, 0.020062493)]
[(19, 0.49437243), (28, 0.218518), (6, 0.13177569), (16, 0.055854514), (14, 0.028849889), (17, 0.02671278), (9, 0.022486668)]
[(13, 0.3844263), (26, 0.34031698), (30, 0.08275041), (3, 0.06405593), (7, 0.04887628), (11, 0.046088208), (0, 0.013567894)]
[(3, 0.77016616), (9, 0.051156893), (19, 0.04287122), (28, 0.03973539), (26, 0.03687236), (10, 0.023494856), (21, 0.015726412), (5, 0.013755593)]
[(17, 0.55765146), (13, 0.31476828), (22, 0.036934745), (14, 0.036477376), (30, 0.02789569), (26, 0.014541828)]
[(16, 0.77826595), (21, 0.105469756), (12, 0.042036407), (17, 0.026896121), (28, 0.022563789), (5, 0.012377378)]
[(19, 0.7661774), (17, 0.07073686), (30, 0.026421214), (22, 0.02298948), (5, 0.020694094), (24, 0.019517487), (0, 0.016680181), (16, 0.015344771), (4, 0.014054547), (14, 0.012249647)]
[(16, 0.6653408), (28, 0.2472496), (23, 0.07041405)]
[(19, 0.6078

		metric:	P: 28.32	R: 27.19	F1: 27.74
	Hypothesis #9 & Reference #3: 
		metric:	P: 23.95	R: 23.37	F1: 23.66
	Hypothesis #10 & Reference #0: 
		metric:	P: 30.24	R: 30.73	F1: 30.49
	Hypothesis #10 & Reference #1: 
		metric:	P: 24.06	R: 24.65	F1: 24.35
	Hypothesis #10 & Reference #2: 
		metric:	P: 23.00	R: 23.37	F1: 23.18
	Hypothesis #10 & Reference #3: 
		metric:	P: 25.11	R: 26.15	F1: 25.62
	Hypothesis #11 & Reference #0: 
		metric:	P: 26.79	R: 26.58	F1: 26.68
	Hypothesis #11 & Reference #1: 
		metric:	P: 30.98	R: 31.50	F1: 31.24
	Hypothesis #11 & Reference #2: 
		metric:	P: 21.36	R: 21.02	F1: 21.19
	Hypothesis #11 & Reference #3: 
		metric:	P: 24.65	R: 24.25	F1: 24.45
	Hypothesis #12 & Reference #0: 
		metric:	P: 28.44	R: 29.14	F1: 28.79
	Hypothesis #12 & Reference #1: 
		metric:	P: 34.50	R: 35.06	F1: 34.78
	Hypothesis #12 & Reference #2: 
		metric:	P: 22.11	R: 22.65	F1: 22.38
	Hypothesis #12 & Reference #3: 
		metric:	P: 16.54	R: 16.67	F1: 16.60
	Hypothesis #13 & Reference #0: 
		metri

## Evaluate using Word Embeddings Trained on the Corpus (dim = 300)

In [24]:
# Sentence tokenize t
import nltk.data
import preprocessor3

# Compound all docs into a megadocument to process
megadoc = ""
for set_id in corpus:
    for article in corpus[set_id]['articles']:
        megadoc += article
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
pre = preprocessor3.spacy_preprocessor()
megadoc_sents = sent_detector.tokenize(megadoc)
embeddings_corpus = pre.preprocess_texts(megadoc_sents, tags=[])

In [29]:
model = Word2Vec(embeddings_corpus, size=300, window=5, min_count=1, workers=-1) 
summarizer = centroid_word_embeddings.CentroidWordEmbeddingsSummarizer(model, parser = parser)

In [30]:
SET_ID = 0
compound_article = ''.join(corpus[list(corpus.keys())[SET_ID]]['articles'])
print(summarizer.summarize(compound_article))

[(22, 0.75969136), (5, 0.06407431), (4, 0.04172836), (30, 0.037224017), (11, 0.035380177), (16, 0.022310968), (14, 0.02006666)]
Cambodian leader Hun Sen on Friday rejected opposition parties' demands for talks outside the country, accusing them of trying to ``internationalize'' the political crisis.
Government and opposition parties have asked King Norodom Sihanouk to host a summit meeting after a series of post-election negotiations between the two opposition groups and Hun Sen's party to form a new government failed.
Opposition leaders Prince Norodom Ranariddh and Sam Rainsy, citing Hun Sen's threats to arrest opposition figures after two alleged attempts on his life, said they could not negotiate freely in Cambodia and called for talks at Sihanouk's residence in Beijing.
Hun Sen, however, rejected that.


In [33]:
import rouge_evaluator3
hypos = []
refs = []
for set_id in corpus.keys():
    hypos.append(summarizer.summarize(''.join(corpus[set_id]['articles'])))
    refs.append(corpus[set_id]['summaries'])
print(rouge_evaluator3.evaluate_hypotheses(hypos, refs))

[(22, 0.759689), (5, 0.06407002), (4, 0.04172391), (30, 0.037218664), (11, 0.03537825), (16, 0.022312341), (14, 0.020062424)]
[(19, 0.4943709), (28, 0.21851768), (6, 0.13177583), (16, 0.055854294), (14, 0.028849978), (17, 0.026714142), (9, 0.022487836)]
[(13, 0.38443038), (26, 0.3403188), (30, 0.08275759), (3, 0.0640601), (7, 0.048878744), (11, 0.04609058), (0, 0.013568831)]
[(3, 0.7701665), (9, 0.05115599), (19, 0.04287007), (28, 0.0397353), (26, 0.036871504), (10, 0.023494737), (21, 0.015730012), (5, 0.0137548605)]
[(17, 0.5576531), (13, 0.31476846), (22, 0.036934968), (14, 0.036477823), (30, 0.027896233), (26, 0.01454247)]
[(16, 0.77831584), (21, 0.10548456), (12, 0.04205644), (17, 0.0268977), (28, 0.022566233), (5, 0.012414201)]
[(19, 0.7661748), (17, 0.070735894), (30, 0.02642394), (22, 0.022989243), (5, 0.020693133), (24, 0.019517612), (0, 0.016679455), (16, 0.015349585), (4, 0.014056343), (14, 0.012249204)]
[(16, 0.66534144), (28, 0.24725011), (23, 0.07041433)]
[(19, 0.6078118),

		metric:	P: 26.58	R: 27.24	F1: 26.90
	Hypothesis #21 & Reference #0: 
		metric:	P: 23.76	R: 23.95	F1: 23.86
	Hypothesis #21 & Reference #1: 
		metric:	P: 25.94	R: 25.73	F1: 25.83
	Hypothesis #21 & Reference #2: 
		metric:	P: 23.76	R: 23.00	F1: 23.37
	Hypothesis #21 & Reference #3: 
		metric:	P: 22.65	R: 22.65	F1: 22.65
	Hypothesis #22 & Reference #0: 
		metric:	P: 33.24	R: 33.78	F1: 33.50
	Hypothesis #22 & Reference #1: 
		metric:	P: 30.24	R: 30.98	F1: 30.61
	Hypothesis #22 & Reference #2: 
		metric:	P: 30.24	R: 30.73	F1: 30.49
	Hypothesis #22 & Reference #3: 
		metric:	P: 30.24	R: 31.50	F1: 30.86
	Hypothesis #23 & Reference #0: 
		metric:	P: 31.50	R: 30.73	F1: 31.11
	Hypothesis #23 & Reference #1: 
		metric:	P: 31.50	R: 31.50	F1: 31.50
	Hypothesis #23 & Reference #2: 
		metric:	P: 32.54	R: 31.00	F1: 31.75
	Hypothesis #23 & Reference #3: 
		metric:	P: 37.68	R: 37.07	F1: 37.37
	Hypothesis #24 & Reference #0: 
		metric:	P: 33.58	R: 33.58	F1: 33.58
	Hypothesis #24 & Reference #1: 
		metr