In [199]:
import collections
import itertools
import json
import pathlib
from pprint import pprint
import string

import gensim as gs
import numpy as np
import pandas as pd

from nltk import tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def expand_contractions(text):
    pass
    # *can't --> cannot
    # shouldn't, hasn't, wouldn't --> (...)n't --> (...) not
    # I'd, he'd, we'd --> (...)'d --> (...) would
    # should've --> (...)'ve --> (...) have

In [3]:
def summarize(text, top, *, vectorizer=TfidfVectorizer, stop_words=None, metric=cosine_similarity):
    sentences = tokenize.sent_tokenize(text)
    corpus = sentences + [text]
    vector = vectorizer(stop_words=stop_words).fit_transform(corpus)
    
    sent_slice = slice(0, -1)
    text_index = -1
    measures = metric(vector[sent_slice], vector[text_index])
    meas_sent = zip(measures, sentences)
    top = sorted(meas_sent, reverse=True)[:top]
    top_sents = [s for _, s in top]
    return top_sents

In [18]:
def id2token(dctnry, bow):
    id2token = {v: k for k, v in dctnry.token2id.items()}
    return [(id2token[id], token) for id, token in bow]

In [210]:
year = str(2010)
jsons = pathlib.Path.cwd() / 'data' / 'jsons'
json_year = jsons / year / f'{year}.json'

with open(json_year) as fp:
    dict_year = json.load(fp)[year]
    
article = dict_year[42]

In [211]:
story = article['story']
summary = article['summary'].strip()
summary

'At a Friday Mass in St. Peter\'s Square, Pope Benedict XVI promised to "do everything possible" to ensure that prelates don\'t rape or molest children ever again. But the statement did not satisfy abuse victims groups, who are demanding more accountability from the church.'

In [212]:
text = story
tokenizer = tokenize.word_tokenize
stop_words = set(stopwords.words('english'))
# stop_words.update(string.punctuation)

def seive(word):
    return (word not in stop_words) and (len(word) > 2) and (not word.startswith("'"))

summary_sentences = tokenize.sent_tokenize(summary.lower())
summary_tokens = [[word for word in tokenizer(sent) if seive(word)] 
                  for sent in summary_sentences]

story_paragraphs = story.lower().split('\n\n')
story_tokens = [[word for word in tokenizer(para) if seive(word)]
                for para in story_paragraphs]

####: for some reason the tfidf only really works with a corpus of more than 1 item
# story_paragraphs = story.lower().replace('\n\n', ' ')
# story_tokens = [[word for word in tokenizer(story_paragraphs) if word not in stop_words]]

story_dctnry = gs.corpora.Dictionary(story_tokens)
corpus = [story_dctnry.doc2bow(tkn) for tkn in story_tokens]

tfidf = gs.models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

summary_bows = [story_dctnry.doc2bow(doc) for doc in summary_tokens]
summary_tfidf = tfidf[summary_bows]

In [213]:
lsi = gs.models.LsiModel(corpus_tfidf, id2word=story_dctnry, num_topics=2)  # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf]

In [214]:
lsi.print_topics()

[(0,
  '0.182*"ensure" + 0.182*"never" + 0.182*"possible" + 0.182*"god" + 0.182*"forgiveness" + 0.176*"everything" + 0.160*"would" + 0.159*"friday" + 0.156*"pope" + 0.154*"victims"'),
 (1,
  '-0.204*"forgiveness" + -0.204*"ensure" + -0.204*"god" + -0.204*"possible" + -0.204*"never" + -0.143*"everything" + -0.142*"persons\u202c\u202a" + -0.142*"insistently" + -0.142*"beg" + -0.142*"promising"')]

In [215]:
for doc in corpus_lsi:
    print(doc)

[(0, 0.6433836567612338), (1, -0.5052212565696722)]
[(0, 0.1824919777719882), (1, 0.30567437424129956)]
[(0, 0.5072185668985268), (1, -0.5535796124332549)]
[(0, 0.22268031759769885), (1, 0.04317398866614782)]
[(0, 0.04389431777448442), (1, 0.05872455372950356)]
[(0, 0.30315009442203816), (1, 0.12612195596786677)]
[(0, 0.11230905682917246), (1, 0.10287799448817823)]
[(0, 0.3428165896320907), (1, 0.25162391299271125)]
[(0, 0.06439901934768232), (1, 0.11709307535469478)]
[(0, 0.2869920179218724), (1, 0.08707998284920253)]
[(0, 0.18691215027346766), (1, 0.09263830573394308)]
[(0, 0.08725831381098292), (1, 0.10767307479391704)]
[(0, 0.4029760703401056), (1, 0.011373695721479046)]
[(0, 0.27397722133430863), (1, 0.39095808050115816)]
[(0, 0.19345323825753047), (1, 0.21977130326694871)]
[(0, 0.14922520360196664), (1, 0.31135968775834993)]
[(0, 0.10698169905721162), (1, 0.2193544763036076)]
[(0, 0.3398726175258859), (1, 0.34845034471579917)]


In [216]:
id2token(story_dctnry, summary_bows[0])

[('benedict', 1),
 ('ensure', 1),
 ('everything', 1),
 ('friday', 1),
 ('pope', 1),
 ('possible', 1),
 ('xvi', 1),
 ('mass', 1),
 ('peter', 1),
 ('square', 1),
 ('st.', 1)]

In [217]:
id2token(story_dctnry, summary_bows[1])

[('victims', 1),
 ('abuse', 1),
 ('accountability', 1),
 ('demanding', 1),
 ('satisfy', 1),
 ('statement', 1),
 ('church', 1)]

In [231]:
summ = gs.summarization.summarize(story, split=False, word_count=50, ratio=None)
print(summ)

But he also said that in this year of joy, "the sins of priests came to light -- particularly the abuse of little ones."‬‪‬‪ Benedict said that from now on, the Roman Catholic Church will do everything it can to screen men seeking to join the priesthood to determine the authenticity of their vocation.‬‪‬‪ The pledges were similar to ones he reportedly made in a private encounter with abuse victims in Malta in April.‬‪


In [219]:
summary

'At a Friday Mass in St. Peter\'s Square, Pope Benedict XVI promised to "do everything possible" to ensure that prelates don\'t rape or molest children ever again. But the statement did not satisfy abuse victims groups, who are demanding more accountability from the church.'

In [220]:
len(summary.split())

44

In [244]:
kywrds = gs.summarization.keywords(story, split=True, ratio=None, words=3, lemmatize=True)
kywrds

['benedict', 'abused', 'priest']

In [245]:
len(story)

3914